PHPIDS
Current file: /home/mario/workspace/php-ids.org/trunk/lib/IDS/vendors/htmlpurifier/HTMLPurifier/Lexer.php
Legend: executed | not executed | dead code

Coverage summary

| Scope                                                      | Classes            | Methods            | Lines                |
|------------------------------------------------------------|--------------------|--------------------|----------------------|
| Total                                                      | 100.00% (1 / 1)    | 66.67% (6 / 9)     | 59.09% (39 / 66)     |
| HTMLPurifier_Lexer                                         | 100.00% (1 / 1)    | 66.67% (6 / 9)     | 59.09% (39 / 66)     |
| public static function create($config)                     |                    | 100.00% (1 / 1)    | 45.16% (14 / 31)     |
| public function __construct()                              |                    | 100.00% (1 / 1)    | 100.00% (2 / 2)      |
| public function parseData($string)                         |                    | 100.00% (1 / 1)    | 100.00% (4 / 4)      |
| public function tokenizeHTML($string, $config, $context)   |                    | 0.00% (0 / 1)      | 0.00% (0 / 2)        |
| protected static function escapeCDATA($string)             |                    | 100.00% (1 / 1)    | 100.00% (4 / 4)      |
| protected static function escapeCommentedCDATA($string)    |                    | 0.00% (0 / 1)      | 0.00% (0 / 4)        |
| protected static function CDATACallback($matches)          |                    | 0.00% (0 / 1)      | 0.00% (0 / 1)        |
| public function normalize($html, $config, $context)        |                    | 100.00% (1 / 1)    | 83.33% (10 / 12)     |
| public function extractBody($html)                         |                    | 100.00% (1 / 1)    | 80.00% (4 / 5)       |


       1                 : <?php                                                                                                                 
       2                 :                                                                                                                       
       3                 : /**                                                                                                                   
       4                 :  * Forgivingly lexes HTML (SGML-style) markup into tokens.                                                            
       5                 :  *                                                                                                                    
       6                 :  * A lexer parses a string of SGML-style markup and converts it into
       7                 :  * corresponding tokens.  It doesn't check for well-formedness, although its                                          
       8                 :  * internal mechanism may make this automatic (such as the case of                                                    
       9                 :  * HTMLPurifier_Lexer_DOMLex).  There are several implementations to choose                                           
      10                 :  * from.                                                                                                              
      11                 :  *                                                                                                                    
      12                 :  * A lexer is HTML-oriented: it might work with XML, but it's not                                                     
      13                 :  * recommended, as we adhere to a subset of the specification for optimization                                        
      14                 :  * reasons. This might change in the future. Also, most tokenizers are not                                            
      15                 :  * expected to handle DTDs or PIs.                                                                                    
      16                 :  *                                                                                                                    
      17                 :  * This class should not be directly instantiated, but you may use create() to                                        
      18                 :  * retrieve a default copy of the lexer.  Being a supertype, this class                                               
      19                 :  * does not actually define any implementation, but offers commonly used                                              
      20                 :  * convenience functions for subclasses.                                                                              
      21                 :  *                                                                                                                    
      22                 :  * @note The unit tests will instantiate this class for testing purposes, as                                          
      23                 :  *       many of the utility functions require a class to be instantiated.                                            
      24                 :  *       This means that, even though this class is not runnable, it will                                             
      25                 :  *       not be declared abstract.                                                                                    
      26                 :  *                                                                                                                    
      27                 :  * @par                                                                                                               
      28                 :  *                                                                                                                    
      29                 :  * @note                                                                                                              
      30                 :  * We use tokens rather than create a DOM representation because DOM would:                                           
      31                 :  *                                                                                                                    
      32                 :  * @par                                                                                                               
      33                 :  *  -# Require more processing and memory to create,                                                                  
      34                 :  *  -# Not be streamable, and
      35                 :  *  -# Include the entire document structure (html and body not needed).
      36                 :  *                                                                                                                    
      37                 :  * @par                                                                                                               
      38                 :  * However, DOM is helpful in that it makes it easy to move around nodes                                              
      39                 :  * without a lot of lookaheads to see when a tag is closed. This is a                                                 
      40                 :  * limitation of the token system and some workarounds would be nice.                                                 
      41                 :  */                                                                                                                   
      42                 : class HTMLPurifier_Lexer                                                                                              
      43               1 : {                                                                                                                     
      44                 :                                                                                                                       
      45                 :     // -- STATIC ----------------------------------------------------------                                           
      46                 :                                                                                                                       
      47                 :     /**                                                                                                               
      48                 :      * Retrieves or sets the default Lexer as a Prototype Factory.                                                    
      49                 :      *                                                                                                                
      50                 :      * By default HTMLPurifier_Lexer_DOMLex will be returned. There are                                               
      51                 :      * a few exceptions involving special features that only DirectLex                                                
      52                 :      * implements.                                                                                                    
      53                 :      *                                                                                                                
      54                 :      * @note The behavior of this class has changed, rather than accepting                                            
      55                 :      *       a prototype object, it now accepts a configuration object.                                               
      56                 :      *       To specify your own prototype, set %Core.LexerImpl to it.                                                
      57                 :      *       This change in behavior de-singletonizes the lexer object.                                               
      58                 :      *                                                                                                                
      59                 :      * @param $config Instance of HTMLPurifier_Config                                                                 
      60                 :      * @return Concrete lexer.                                                                                        
      61                 :      */                                                                                                               
      62                 :     public static function create($config) {                                                                          
      63                 :                                                                                                                       
      64               2 :         if (!($config instanceof HTMLPurifier_Config)) {                                                              
      65               0 :             $lexer = $config;                                                                                         
      66               0 :             trigger_error("Passing a prototype to                                                                     
      67                 :               HTMLPurifier_Lexer::create() is deprecated, please instead                                              
      68               0 :               use %Core.LexerImpl", E_USER_WARNING);                                                                  
      69               0 :         } else {                                                                                                      
      70               2 :             $lexer = $config->get('Core', 'LexerImpl');                                                               
      71                 :         }                                                                                                             
      72                 :                                                                                                                       
      73               2 :         if (is_object($lexer)) {                                                                                      
      74               0 :             return $lexer;                                                                                            
      75                 :         }                                                                                                             
      76                 :                                                                                                                       
      77               2 :         if (is_null($lexer)) { do {                                                                                   
      78                 :             // auto-detection algorithm                                                                               
      79                 :                                                                                                                       
      80                 :             // once PHP DOM implements native line numbers, or we                                                     
      81                 :             // hack out something using XSLT, remove this stipulation                                                 
      82               2 :             $line_numbers = $config->get('Core', 'MaintainLineNumbers');                                              
      83                 :             if (                                                                                                      
      84               2 :                 $line_numbers === true ||                                                                             
      85               2 :                 ($line_numbers === null && $config->get('Core', 'CollectErrors'))                                     
      86               2 :             ) {                                                                                                       
      87               0 :                 $lexer = 'DirectLex';                                                                                 
      88               0 :                 break;                                                                                                
      89               0 :             }                                                                                                         
      90                 :                                                                                                                       
      91               2 :             if (class_exists('DOMDocument')) {                                                                        
      92                 :                 // check for DOM support, because, surprisingly enough,                                               
      93                 :                 // it's *not* part of the core!                                                                       
      94               2 :                 $lexer = 'DOMLex';                                                                                    
      95               2 :             } else {                                                                                                  
      96               0 :                 $lexer = 'DirectLex';                                                                                 
      97                 :             }                                                                                                         
      98                 :                                                                                                                       
      99               2 :         } while(0); } // do..while so we can break                                                                    
     100                 :                                                                                                                       
     101                 :         // instantiate recognized string names                                                                        
     102                 :         switch ($lexer) {                                                                                             
     103               2 :             case 'DOMLex':                                                                                            
     104               2 :                 return new HTMLPurifier_Lexer_DOMLex();                                                               
     105               0 :             case 'DirectLex':                                                                                         
     106               0 :                 return new HTMLPurifier_Lexer_DirectLex();                                                            
     107               0 :             case 'PH5P':                                                                                              
     108               0 :                 return new HTMLPurifier_Lexer_PH5P();                                                                 
     109               0 :             default:                                                                                                  
     110               0 :                 trigger_error("Cannot instantiate unrecognized Lexer type " . htmlspecialchars($lexer), E_USER_ERROR);
     111               0 :         }                                                                                                             
     112                 :                                                                                                                       
     113               0 :     }                                                                                                                 
     114                 :                                                                                                                       
     115                 :     // -- CONVENIENCE MEMBERS ---------------------------------------------                                           
     116                 :                                                                                                                       
     117                 :     public function __construct() {                                                                                   
     118               2 :         $this->_entity_parser = new HTMLPurifier_EntityParser();                                                      
     119               2 :     }                                                                                                                 
     120                 :                                                                                                                       
     121                 :     /**                                                                                                               
     122                 :      * Most common entity to raw value conversion table for special entities.                                         
     123                 :      */                                                                                                               
     124                 :     protected $_special_entity2str =                                                                                  
     125                 :             array(                                                                                                    
     126                 :                     '&quot;' => '"',                                                                                  
     127                 :                     '&amp;'  => '&',                                                                                  
     128                 :                     '&lt;'   => '<',                                                                                  
     129                 :                     '&gt;'   => '>',                                                                                  
     130                 :                     '&#39;'  => "'",                                                                                  
     131                 :                     '&#039;' => "'",                                                                                  
     132                 :                     '&#x27;' => "'"                                                                                   
     133                 :             );                                                                                                        
     134                 :                                                                                                                       
     135                 :     /**                                                                                                               
     136                 :      * Parses special entities into the proper characters.                                                            
     137                 :      *                                                                                                                
     138                 :      * This method will translate escaped versions of the special characters
     139                 :      * into the correct ones.                                                                                         
     140                 :      *                                                                                                                
     141                 :      * @warning                                                                                                       
     142                 :      * You should be able to treat the output of this function as                                                     
     143                 :      * completely parsed, but that's only because all other entities should                                           
     144                 :      * have been handled previously in substituteNonSpecialEntities()                                                 
     145                 :      *                                                                                                                
     146                 :      * @param $string String character data to be parsed.                                                             
     147                 :      * @returns Parsed character data.                                                                                
     148                 :      */                                                                                                               
     149                 :     public function parseData($string) {                                                                              
     150                 :                                                                                                                       
     151                 :         // following functions require at least one character                                                         
     152               1 :         if ($string === '') return '';                                                                                
     153                 :                                                                                                                       
     154                 :         // subtracts amps that cannot possibly be escaped                                                             
     155               1 :         $num_amp = substr_count($string, '&') - substr_count($string, '& ') -                                         
     156               1 :             ($string[strlen($string)-1] === '&' ? 1 : 0);                                                             
     157                 :                                                                                                                       
     158               1 :         if (!$num_amp) return $string; // abort if no entities                                                        
     159                 :         $num_esc_amp = substr_count($string, '&amp;');                                                                
     160                 :         $string = strtr($string, $this->_special_entity2str);                                                         
     161                 :                                                                                                                       
     162                 :         // code duplication for sake of optimization, see above                                                       
     163                 :         $num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -                                       
     164                 :             ($string[strlen($string)-1] === '&' ? 1 : 0);                                                             
     165                 :                                                                                                                       
     166                 :         if ($num_amp_2 <= $num_esc_amp) return $string;                                                               
     167                 :                                                                                                                       
     168                 :         // hmm... now we have some uncommon entities. Use the callback.                                               
     169                 :         $string = $this->_entity_parser->substituteSpecialEntities($string);                                          
     170                 :         return $string;                                                                                               
     171                 :     }                                                                                                                 
     172                 :                                                                                                                       
     173                 :     /**                                                                                                               
     174                 :      * Lexes an HTML string into tokens.                                                                              
     175                 :      *                                                                                                                
     176                 :      * @param $string String HTML.                                                                                    
     177                 :      * @return HTMLPurifier_Token array representation of HTML.                                                       
     178                 :      */                                                                                                               
     179                 :     public function tokenizeHTML($string, $config, $context) {                                                        
     180               0 :         trigger_error('Call to abstract class', E_USER_ERROR);                                                        
     181               0 :     }                                                                                                                 
     182                 :                                                                                                                       
     183                 :     /**                                                                                                               
     184                 :      * Translates CDATA sections into regular sections (through escaping).                                            
     185                 :      *                                                                                                                
     186                 :      * @param $string HTML string to process.                                                                         
     187                 :      * @returns HTML with CDATA sections escaped.                                                                     
     188                 :      */                                                                                                               
     189                 :     protected static function escapeCDATA($string) {                                                                  
     190               2 :         return preg_replace_callback(                                                                                 
     191               2 :             '/<!\[CDATA\[(.+?)\]\]>/s',                                                                               
     192               2 :             array('HTMLPurifier_Lexer', 'CDATACallback'),                                                             
     193                 :             $string                                                                                                   
     194               2 :         );                                                                                                            
     195                 :     }                                                                                                                 
     196                 :                                                                                                                       
     197                 :     /**                                                                                                               
     198                 :      * Special CDATA case that is especially convoluted for <script>                                                  
     199                 :      */                                                                                                               
     200                 :     protected static function escapeCommentedCDATA($string) {                                                         
     201               0 :         return preg_replace_callback(                                                                                 
     202               0 :             '#<!--//--><!\[CDATA\[//><!--(.+?)//--><!\]\]>#s',                                                        
     203               0 :             array('HTMLPurifier_Lexer', 'CDATACallback'),                                                             
     204                 :             $string                                                                                                   
     205               0 :         );                                                                                                            
     206                 :     }                                                                                                                 
     207                 :                                                                                                                       
     208                 :     /**                                                                                                               
     209                 :      * Callback function for escapeCDATA() that does the work.                                                        
     210                 :      *                                                                                                                
     211                 :      * @warning This exists only so that preg_replace_callback() can invoke
     212                 :      *          it; calling it directly is not recommended.
     213                 :      * @param $matches PCRE matches array, with index 0 the entire match                                              
     214                 :      *                 and 1 the inside of the CDATA section.                                                         
     215                 :      * @return Escaped internals of the CDATA section.                                                                
     216                 :      */                                                                                                               
     217                 :     protected static function CDATACallback($matches) {                                                               
     218                 :         // name the charset explicitly so escaping does not rely on PHP's default encoding                            
     219               0 :         return htmlspecialchars($matches[1], ENT_COMPAT, 'UTF-8');                                                    
     220                 :     }                                                                                                                 
     221                 :                                                                                                                       
     222                 :     /**                                                                                                               
     223                 :      * Takes a piece of HTML and normalizes it by converting entities, fixing                                         
     224                 :      * encoding, extracting bits, and other good stuff.                                                               
     225                 :      * @todo Consider making protected                                                                                
     226                 :      */                                                                                                               
     227                 :     public function normalize($html, $config, $context) {                                                             
     228                 :                                                                                                                       
     229                 :         // extract body from document if applicable                                                                   
     230               2 :         if ($config->get('Core', 'ConvertDocumentToFragment')) {                                                      
     231               2 :             $html = $this->extractBody($html);                                                                        
     232               2 :         }                                                                                                             
     233                 :                                                                                                                       
     234                 :         // normalize newlines to \n                                                                                   
     235               2 :         $html = str_replace("\r\n", "\n", $html);                                                                     
     236               2 :         $html = str_replace("\r", "\n", $html);                                                                       
     237                 :                                                                                                                       
     238               2 :         if ($config->get('HTML', 'Trusted')) {                                                                        
     239                 :             // escape convoluted CDATA                                                                                
     240               0 :             $html = $this->escapeCommentedCDATA($html);                                                               
     241               0 :         }                                                                                                             
     242                 :                                                                                                                       
     243                 :         // escape CDATA                                                                                               
     244               2 :         $html = $this->escapeCDATA($html);                                                                            
     245                 :                                                                                                                       
     246                 :         // expand entities that aren't the big five                                                                   
     247               2 :         $html = $this->_entity_parser->substituteNonSpecialEntities($html);                                           
     248                 :                                                                                                                       
     249                 :         // clean into wellformed UTF-8 string for an SGML context: this has                                           
     250                 :         // to be done after entity expansion because the entities sometimes                                           
     251                 :         // represent non-SGML characters (horror, horror!)                                                            
     252               2 :         $html = HTMLPurifier_Encoder::cleanUTF8($html);                                                               
     253                 :                                                                                                                       
     254               2 :         return $html;                                                                                                 
     255                 :     }                                                                                                                 
     256                 :                                                                                                                       
     257                 :     /**                                                                                                               
     258                 :      * Takes HTML (fragment or document) and returns the <body> content, or the input if there is no body.            
     259                 :      * @todo Consider making protected                                                                                
     260                 :      */                                                                                                               
     261                 :     public function extractBody($html) {                                                                              
     262               2 :         $matches = array();                                                                                           
     263               2 :         $result = preg_match('!<body[^>]*>(.+?)</body>!is', $html, $matches);                                         
     264               2 :         if ($result) {                                                                                                
     265               0 :             return $matches[1];                                                                                       
     266                 :         } else {                                                                                                      
     267               2 :             return $html;                                                                                             
     268                 :         }                                                                                                             
     269                 :     }                                                                                                                 
     270                 :                                                                                                                       
     271                 : }                                                                                                                     
     272                 :                                                                                                                       

Generated by PHPUnit 3.3.1 and Xdebug 2.0.2 at Thu Sep 25 18:42:10 CEST 2008.