Package encutils :: Module encutils
[hide private]
[frames] | no frames]

Source Code for Module encutils.encutils

  1  #!/usr/bin/env python 
  2  """Helper functions to detect encodings of text files. 
  3   
  4  ==================== 
  5        encutils 
  6  ==================== 
  7  :Author: Christof Hoeke 
  8  :License: This work is licensed under a 
  9      Creative Commons License http://creativecommons.org/licenses/by/2.5/ 
 10   
 11  Website: http://cthedot.de/encutils/ 
 12   
 13  Some basic helper functions to deal with encodings of text files (like 
 14  HTML, XHTML, XML, CSS, etc.) via HTTP and directly. 
 15   
 16  ``getEncodingInfo`` is probably the main function of interest which uses 
 17  other supplied functions itself and gathers all information together and 
 18  supplies an ``EncodingInfo`` object with the following properties: 
 19   
 20  - ``encoding``: The guessed encoding 
 21      Encoding is the explicit or implicit encoding or None and 
 22      always lowercase. 
 23   
 24  - from HTTP response     
 25      * ``http_encoding`` 
 26      * ``http_media_type`` 
 27   
 28  - from HTML <meta> element     
 29      * ``meta_encoding`` 
 30      * ``meta_media_type`` 
 31   
 32  - from XML declaration 
 33      * ``xml_encoding`` 
 34   
 35  Requires Python 2.3 or later 
 36   
 37  references 
 38  ========== 
 39  XML 
 40      RFC 3023 (http://www.ietf.org/rfc/rfc3023.txt) 
 41       
 42      easier explained in  
 43          - http://feedparser.org/docs/advanced.html 
 44          - http://www.xml.com/pub/a/2004/07/21/dive.html 
 45           
 46  HTML 
 47      http://www.w3.org/TR/REC-html40/charset.html#h-5.2.2 
 48   
 49  TODO: 
 50      - HTML meta elements in comments? (use HTMLParser?) 
 51      - parse @charset of HTML elements? 
 52      - check for more texttypes if only text given 
 53  """ 
 54  __docformat__ = 'restructuredtext' 
 55  __version__ = '0.7a1' 
 56  __all__ = ['buildlog', 
 57             'encodingByMediaType', 
 58             'getHTTPInfo', 
 59             'getMetaInfo', 
 60             'detectXMLEncoding', 
 61             'getEncodingInfo', 
 62             'guessEncoding', 
 63             'tryEncodings', 
 64             'EncodingInfo'] 
 65   
 66  import cgi 
 67  import httplib 
 68  import re 
 69  import StringIO 
 70  import sys 
 71  import types 
 72   
 73  True = not 0 
 74  False = not True 
 75   
 76   
 77  _XML_APPLICATION_TYPE = 0 
 78  """ 
 79  application/xml, application/xml-dtd, 
 80  application/xml-external-parsed-entity, or 
 81  a subtype like application/rss+xml. 
 82  """ 
 83  _XML_TEXT_TYPE = 1 
 84  """ 
 85  text/xml, text/xml-external-parsed-entity, or a subtype like 
 86  text/AnythingAtAll+xml 
 87  """ 
 88  _HTML_TEXT_TYPE = 2 
 89  """ 
 90  text/html 
 91  """ 
 92  _TEXT_TYPE = 3 
 93  """ 
 94  any other of text/* like text/plain, text/css, ... 
 95  """ 
 96  _OTHER_TYPE = 4 
 97  """ 
 98  types not fitting in above types 
 99  """ 
100   
101   
class EncodingInfo(object):
    """
    Result object of ``getEncodingInfo``: all encoding related findings.

    - ``encoding``: The guessed encoding
        The explicit or implicit encoding, or None; always lowercase.

    - from HTTP response
        * ``http_encoding``
        * ``http_media_type``

    - from HTML <meta> element
        * ``meta_encoding``
        * ``meta_media_type``

    - from XML declaration
        * ``xml_encoding``

    - ``mismatch``: True if any mismatch between HTTP header, XML
        declaration or text content (meta) was found. More detailed
        mismatch reports are written to the optional log or to ``logtext``.

        Mismatches are not necessarily errors as preferences are defined.
        For details see the specifications.

    - ``logtext``: if no log was given, the log reports are stored here
    """

    def __init__(self):
        """
        Set every property to ``None``; see the class description for the
        meaning of each one.
        """
        self.encoding = None
        self.mismatch = None
        self.logtext = None
        self.http_encoding = None
        self.http_media_type = None
        self.meta_encoding = None
        self.meta_media_type = None
        self.xml_encoding = None

    def __str__(self):
        """
        ``str(EncodingInfo())`` is the guessed encoding itself, or an empty
        string while no encoding is set.
        """
        return self.encoding or u''
151 152
def buildlog(logname='encutils', level='INFO', stream=sys.stderr,
             filename=None, filemode="w",
             format='%(levelname)s\t%(message)s'):
    """
    Helper to build a basic log.

    - if ``filename`` is given the log writes to ``filename``, opened with
      mode ``filemode``
    - else the log streams to ``stream`` which defaults to ``sys.stderr``
    - ``level`` is the log level, either a level name such as ``'INFO'`` or
      ``'debug'`` (case is ignored) or a numeric ``logging`` level such as
      ``logging.DEBUG``; anything that does not name a level falls back to
      ``logging.INFO``
    - ``format`` is the format string handed to ``logging.Formatter``

    Returns the logger named ``logname``.

    Note that every call adds one more handler to that logger; calling
    this repeatedly with the same ``logname`` therefore accumulates
    handlers unless the caller removes them again.
    """
    import logging

    log = logging.getLogger(logname)

    if filename:
        hdlr = logging.FileHandler(filename, filemode)
    else:
        hdlr = logging.StreamHandler(stream)
    hdlr.setFormatter(logging.Formatter(format))
    log.addHandler(hdlr)

    if isinstance(level, int):
        # already a numeric level, use it as given
        numeric_level = level
    else:
        # Look the name up on the logging module, but only accept it if it
        # really is a level constant (``getattr`` would just as happily
        # return functions or classes such as ``logging.getLogger``).
        numeric_level = getattr(logging, str(level).upper(), None)
        if not isinstance(numeric_level, int):
            numeric_level = logging.INFO
    log.setLevel(numeric_level)

    return log
184 185
def _getTextTypeByMediaType(media_type, log=None):
    """
    Map ``media_type`` to one of the text type constants of this module
    (``_XML_APPLICATION_TYPE``, ``_XML_TEXT_TYPE``, ``_HTML_TEXT_TYPE``,
    ``_TEXT_TYPE`` or ``_OTHER_TYPE``).

    An empty or missing ``media_type`` is ``_OTHER_TYPE``. Surrounding
    whitespace and case of ``media_type`` are ignored. ``log`` is accepted
    but not used.
    """
    if not media_type:
        return _OTHER_TYPE

    # regular expressions for the "+xml" subtypes
    app_subtype_pattern = u'application/.*?\\+xml'
    text_subtype_pattern = u'text\\/.*?\\+xml'
    regex_flags = re.I | re.S | re.X

    # exact names; the pattern strings themselves are part of these
    # collections too and so are accepted as literal names
    app_names = (
        app_subtype_pattern,
        u'application/xml',
        u'application/xml-dtd',
        u'application/xml-external-parsed-entity')
    text_names = (
        text_subtype_pattern,
        u'text/xml',
        u'text/xml-external-parsed-entity')

    normalized = media_type.strip().lower()

    if normalized in app_names:
        return _XML_APPLICATION_TYPE
    if re.match(app_subtype_pattern, normalized, regex_flags):
        return _XML_APPLICATION_TYPE

    if normalized in text_names:
        return _XML_TEXT_TYPE
    if re.match(text_subtype_pattern, normalized, regex_flags):
        return _XML_TEXT_TYPE

    if normalized == u'text/html':
        return _HTML_TEXT_TYPE
    if normalized.startswith(u'text/'):
        return _TEXT_TYPE
    return _OTHER_TYPE
219 220
def _getTextType(text, log=None):
    """
    Naive check whether ``text`` is XML, used if no content-type is given.

    Returns ``_XML_APPLICATION_TYPE`` if ``<?xml version=`` occurs within
    the first 30 characters of ``text``, else ``_OTHER_TYPE``. ``log`` is
    accepted but not used.
    """
    head = text[:30]
    if u'<?xml version=' in head:
        return _XML_APPLICATION_TYPE
    return _OTHER_TYPE
230 231
def encodingByMediaType(media_type, log=None):
    """
    Return the default encoding for ``media_type``, for example
    ``'utf-8'`` for ``media_type='application/xml'``.

    Refers to RFC 3023 and the HTTP MIME specification.

    Returns ``None`` if there is no default encoding for the media type.
    If a ``log`` is given the outcome is reported to it at debug level.
    """
    fallback_by_type = {
        _XML_APPLICATION_TYPE: u'utf-8',
        _XML_TEXT_TYPE: u'ascii',
        _HTML_TEXT_TYPE: u'iso-8859-1',  # should be None?
        _TEXT_TYPE: u'iso-8859-1',  # should be None?
        _OTHER_TYPE: None}

    encoding = fallback_by_type.get(_getTextTypeByMediaType(media_type))

    if not log:
        return encoding

    if encoding:
        log.debug(
            u'Default encoding for Media Type "%s" : %s',
            media_type, encoding)
    else:
        log.debug(u'"%s" Media-Type has no default encoding',
                  media_type)
    return encoding
260 261
def getHTTPInfo(response, log=None):
    """
    Return ``(media_type, encoding)`` as found in the Content-Type HTTP
    header of ``response``.

    ``response.info()`` must provide ``gettype()`` (used as media type)
    and ``getparam('charset')`` (used as encoding). A non-empty encoding
    is lowercased. If a ``log`` is given both values are reported to it at
    info level.
    """
    headers = response.info()
    media_type = headers.gettype()

    charset = headers.getparam('charset')
    if charset:
        charset = charset.lower()

    if log:
        log.info(u'HTTP media_type: %s', media_type)
        log.info(u'HTTP encoding : %s', charset)

    return media_type, charset
281 282
def getMetaInfo(text, log=None):
    """
    Return ``(media_type, encoding)`` taken from the first X/HTML
    Content-Type ``<meta>`` element found in ``text``.

    Normally in X/HTML:
        ``<meta http-equiv="Content-Type" content="media_type;
        charset=encoding"/>``

    Both values are ``None`` if ``text`` contains no such element or if
    the element has no ``content`` attribute. ``encoding`` is ``None`` if
    the content value has no ``charset`` parameter; otherwise it is
    lowercased so that it compares equal to the (lowercase) encodings
    reported for the HTTP header and the XML declaration.

    If a ``log`` is given the findings are reported to it.
    """
    media_type = encoding = None
    flags = re.I | re.S | re.U | re.X

    # ``[^>]`` keeps the match inside one single tag. A plain ``.*?`` would
    # (with re.S) start at an earlier, unrelated <meta> element and the
    # ``content`` of that element would be picked up below.
    meta = re.search(r'''
        <meta\b [^>]*?
        http-equiv \s*=\s* ['"]\s*Content-Type\s*['"]
        [^>]* >
        ''', text, flags)
    if meta is None:
        return media_type, encoding

    content = re.search(r'''
        content \s*=\s*         # content=
        (?P<quote>['"]) \s*     # opening " or '
        (?P<value>.*?)          # find only value text
        \s* (?P=quote)          # closing quote, same kind as the opening
        ''', meta.group(0), flags)
    if content is None:
        # Content-Type <meta> without a content attribute
        return media_type, encoding

    value = content.group('value')
    media_type, params = cgi.parse_header(value)
    encoding = params.get('charset')  # defaults to None
    if encoding:
        encoding = encoding.lower()

    if log:
        log.debug(u'HTML <meta> : %s', value)
        log.info(u'HTML META media_type: %s', media_type)
        log.info(u'HTML META encoding : %s', encoding)

    return media_type, encoding
317 318
# The BOMs we know, by their byte pattern. ``None`` pads the patterns that
# are shorter than four bytes so that all keys have the same length.
_BOM_ENCODINGS = {  # bytepattern : name
    (0x00, 0x00, 0xFE, 0xFF): "utf_32_be",
    (0xFF, 0xFE, 0x00, 0x00): "utf_32_le",
    (0xFE, 0xFF, None, None): "utf_16_be",
    (0xFF, 0xFE, None, None): "utf_16_le",
    (0xEF, 0xBB, 0xBF, None): "utf-8",
}

_XML_DECL_RE = re.compile(r"""
    ^<\?xml             # w/o BOM, xmldecl starts with <?xml at the first byte
    .+?                 # some chars (version info), matched minimal
    encoding\s*=\s*     # encoding attribute begins, XML allows whitespace
                        # on both sides of the "="
    ["']                # attribute start delimiter
    (?P<encstr>         # what's matched in the brackets will be named encstr
     [^"']+             # every character not delimiter (not overly exact!)
    )                   # closes the brackets pair for the named group
    ["']                # attribute end delimiter
    .*?                 # some chars optionally (standalone decl or whitespace)
    \?>                 # xmldecl end
    """, re.VERBOSE)


def detectXMLEncoding(fp, log=None):
    """
    Attempts to detect the character encoding of the xml file
    given by a file object fp. fp must not be a codec wrapped file
    object! fp may also be a string or unicode string; anything that has
    no ``read`` attribute is treated as a string.

    Only the first 2 KB are inspected. The position of a file object is
    restored before returning.

    The return value can be:
        - if detection of the BOM succeeds, the codec name of the
          corresponding unicode charset is returned

        - if BOM detection fails, the xml declaration is searched for
          the encoding attribute and its value returned (lowercased). the
          "<" character has to be the very first in the file then (it's
          xml standard after all).

        - if BOM and xml declaration fail, utf-8 is returned according
          to XML 1.0. This includes empty input and input shorter than
          four bytes.

    If a ``log`` is given the result is reported to it at info level.

    Based on a recipe by Lars Tiede:
    http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/363841
    which itself is based on Paul Prescotts recipe:
    http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52257
    """
    ## assume BOM and xml declaration fit into the first 2 KB (*cough*)
    if hasattr(fp, 'read'):
        oldFP = fp.tell()
        fp.seek(0)
        try:
            head = fp.read(2048)
        finally:
            fp.seek(oldFP)
    else:
        head = fp[:2048]

    ### detection using BOM

    ## ordinals of the first (up to) 4 bytes, padded with None so that short
    ## or empty input is simply "no BOM" instead of an unpacking error
    lead = []
    for item in head[:4]:
        if not isinstance(item, int):
            # 1-character string; iterating Python 3 bytes yields ints
            item = ord(item)
        lead.append(item)
    lead.extend([None] * (4 - len(lead)))

    ## try bom detection using 4 bytes, 3 bytes, or 2 bytes
    for size in (4, 3, 2):
        pattern = tuple(lead[:size]) + (None,) * (4 - size)
        bomDetection = _BOM_ENCODINGS.get(pattern)
        if bomDetection:
            ## if BOM detected, we're done :-)
            if log:
                log.info(u'XML BOM encoding: %s', bomDetection)
            return bomDetection

    ## still here? BOM detection failed.
    ## now that BOM detection has failed we assume one byte character
    ## encoding behaving ASCII

    ### search xml declaration for encoding attribute
    if not isinstance(head, (str, type(u''))):
        # a byte string which is not the native string type cannot be
        # searched with a text pattern; latin-1 maps every byte to exactly
        # one character so this conversion can never fail
        head = head.decode('latin-1')

    match = _XML_DECL_RE.search(head)
    if match:
        enc = match.group("encstr").lower()
        if log:
            log.info(u'XML encoding="%s"', enc)
        return enc

    if log:
        log.info(u'XML encoding default utf-8')
    return u'utf-8'
412 413
def tryEncodings(text, log=None):
    """
    If installed uses chardet http://chardet.feedparser.org/ to detect
    encoding, else tries different encodings on text and returns the first
    one that does not raise an exception, which is not very advanced and
    may be totally wrong.

    Returns the working encoding (lowercase) or None if no encoding does
    work at all.

    Without chardet a note about the simplified detection is sent to
    ``log`` as a warning, or written to standard output if no ``log`` is
    given. The simplified detection works like this:

    - unicode text: the first of ascii, iso-8859-1, windows-1252 and utf-8
      the text can be *encoded* with
    - byte strings: the first of ascii, utf-8, iso-8859-1 and windows-1252
      the bytes can be *decoded* with. utf-8 is tried before the one byte
      encodings here because iso-8859-1 accepts any sequence of bytes and
      would hide every other candidate.

    The returned encoding might nevertheless be not the one intended by the
    author as it is only checked if the text might be encoded in that
    encoding. Some texts might be working in "iso-8859-1" *and*
    "windows-1252" *and* "ascii" *and* "utf-8" and ...
    """
    try:
        import chardet
    except ImportError:
        chardet = None

    if chardet is not None:
        encoding = chardet.detect(text)["encoding"]
        if encoding:
            # chardet reports names like "ISO-8859-1"; the rest of this
            # module uses lowercase encoding names throughout
            encoding = encoding.lower()
        return encoding

    msg = 'Using simplified encoding detection, you might want to install chardet instead.'
    if log:
        log.warning(msg)
    else:
        sys.stdout.write(msg + '\n')

    if isinstance(text, type(u'')):
        attempt = text.encode
        candidates = (
            'ascii',
            'iso-8859-1',
            'windows-1252',
            'utf-8'
            )
    else:
        # A byte string has to be decoded to find out whether an encoding
        # fits. Calling ``encode`` on it fails for every non ASCII input on
        # Python 2 and does not exist at all for Python 3 bytes.
        attempt = text.decode
        candidates = (
            'ascii',
            'utf-8',
            'iso-8859-1',
            'windows-1252'
            )

    for candidate in candidates:
        try:
            attempt(candidate)
        except UnicodeError:
            # covers both UnicodeEncodeError and UnicodeDecodeError
            continue
        return candidate

    return None
456 457
def getEncodingInfo(response=None, text=u'', log=None):
    """
    Finds all encoding related information in given ``text``.
    Uses information in headers of supplied HTTPResponse, possible XML
    declaration and X/HTML ``<meta>`` elements.
    ``text`` will mostly be HTML or XML.

    For certain text mismatches may be reported which are not really
    mismatches. These false warnings appear if e.g. a HTTP mime-type of
    ``text/html`` is sent (which is also used for XHTML sometimes) and HTML
    is actually served. In this case the XML default of 'utf-8' which is
    not relevant may nevertheless be reported to mismatch with
    HTTP or ``<meta>``-element information.

    Parameters
        - ``response``: HTTP response object,
          e.g. ``urllib.urlopen('url')``
        - ``text``: to guess encoding for, might include XML
          prolog with encoding pseudo attribute or HTML meta element
        - ``log``: an optional logging logger to which messages may go, if
          no log given all log messages are available from resulting
          ``EncodingInfo`` (property ``logtext``)

    Returns instance of ``EncodingInfo``.

    If no ``log`` is given a temporary handler is attached to the log
    built by ``buildlog`` and removed again before returning, so repeated
    calls do not pile up handlers on that log.


    How the resulting encoding is retrieved
    =======================================

    XML
    ---
    RFC 3023 states if media type given in the Content-Type HTTP header is
    application/xml, application/xml-dtd,
    application/xml-external-parsed-entity, or any one of the subtypes of
    application/xml such as application/atom+xml or application/rss+xml
    etc then the character encoding is determined in this order:

    1. the encoding given in the charset parameter of the Content-Type HTTP
       header, or
    2. the encoding given in the encoding attribute of the XML declaration
       within the document, or
    3. utf-8.

    Mismatch possibilities:
        - HTTP + XMLdecla
        - HTTP + HTMLmeta

        application/xhtml+xml ?
            XMLdecla + HTMLmeta

    If the media type given in the Content-Type HTTP header is text/xml,
    text/xml-external-parsed-entity, or a subtype like text/Anything+xml,
    the encoding attribute of the XML declaration is ignored completely
    and the character encoding is determined in the order:

    1. the encoding given in the charset parameter of the Content-Type HTTP
       header, or
    2. ascii.

    Mismatch possibilities:
        - HTTP + XMLdecla
        - HTTP + HTMLmeta

        text/xhtml+xml
            XMLdecla + HTMLmeta

    HTML
    ----
    For HTML served as text/html:
        http://www.w3.org/TR/REC-html40/charset.html#h-5.2.2

    1. An HTTP "charset" parameter in a "Content-Type" field.
       (maybe defaults to ISO-8859-1, but should not assume this)
    2. A META declaration with "http-equiv" set to "Content-Type" and a
       value set for "charset".
    3. The charset attribute set on an element that designates an external
       resource. (NOT IMPLEMENTED HERE YET)

    Mismatch possibilities:
        - HTTP + HTMLmeta
    """
    encinfo = EncodingInfo()

    logstream = StringIO.StringIO()
    uses_own_log = not log
    if uses_own_log:
        log = buildlog(stream=logstream, format='%(message)s')

    try:
        # HTTP
        if response:
            encinfo.http_media_type, encinfo.http_encoding = getHTTPInfo(
                response, log)
            texttype = _getTextTypeByMediaType(encinfo.http_media_type, log)
        else:
            # check if maybe XML or (TODO:) HTML
            texttype = _getTextType(text, log)

        # XML (also XHTML served as text/html)
        if texttype in (_XML_APPLICATION_TYPE, _XML_TEXT_TYPE,
                        _HTML_TEXT_TYPE):
            encinfo.xml_encoding = detectXMLEncoding(text, log)

        # HTML
        if texttype in (_HTML_TEXT_TYPE, _TEXT_TYPE):
            encinfo.meta_media_type, encinfo.meta_encoding = getMetaInfo(
                text, log)

        # guess
        # 1. HTTP charset?
        encinfo.encoding = encinfo.http_encoding
        encinfo.mismatch = False

        # 2. media_type?
        #   XML application/...
        if texttype == _XML_APPLICATION_TYPE:
            if not encinfo.encoding:
                # xml_encoding has default of utf-8
                encinfo.encoding = encinfo.xml_encoding

        #   text/html
        elif texttype == _HTML_TEXT_TYPE:
            if not encinfo.encoding:
                encinfo.encoding = encinfo.meta_encoding
            if not encinfo.encoding:
                encinfo.encoding = encodingByMediaType(
                    encinfo.http_media_type)
            if not encinfo.encoding:
                encinfo.encoding = tryEncodings(text)

        #   text/... + xml or text/*
        elif texttype == _XML_TEXT_TYPE or texttype == _TEXT_TYPE:
            if not encinfo.encoding:
                encinfo.encoding = encodingByMediaType(
                    encinfo.http_media_type)

        # possible mismatches, checks if present at all and then if equal
        comparisons = (
            # HTTP + XML
            (encinfo.http_encoding, encinfo.xml_encoding,
             u'"%s" (HTTP) <> "%s" (XML) encoding mismatch'),
            # HTTP + Meta
            (encinfo.http_encoding, encinfo.meta_encoding,
             u'"%s" (HTTP) <> "%s" (HTML <meta>) encoding mismatch'),
            # XML + Meta
            (encinfo.xml_encoding, encinfo.meta_encoding,
             u'"%s" (XML) <> "%s" (HTML <meta>) encoding mismatch'),
        )
        for first, second, message in comparisons:
            if first and second and first != second:
                encinfo.mismatch = True
                log.warning(message % (first, second))

        log.info(u'Encoding guessed: %s (Mismatch: %s)',
                 encinfo.encoding, encinfo.mismatch)
    finally:
        if uses_own_log:
            # ``buildlog`` attached a handler for ``logstream`` to a log
            # shared by name between all calls. Detach it again, otherwise
            # every call leaves one more handler (and its stream) behind
            # which keeps receiving all later messages.
            for hdlr in log.handlers[:]:
                if getattr(hdlr, 'stream', None) is logstream:
                    log.removeHandler(hdlr)

    encinfo.logtext = logstream.getvalue()
    return encinfo
if __name__ == '__main__':
    # Run as a script: hand this module's name to pydoc's interactive help.
    import pydoc
    pydoc.help(__name__)