Package encutils
[hide private]
[frames | no frames]

Source Code for Package encutils

  1  #!/usr/bin/env python 
  2  """encutils - encoding detection collection for Python 
  3   
  4  encutils 
  5  ======== 
  6  :Author: Christof Hoeke, see http://cthedot.de/encutils/ 
  7  :Copyright: 2005-2008: Christof Hoeke 
  8  :License: encutils has a dual-license, please choose whatever you prefer: 
  9   
 10      * encutils is published under the `LGPL 3 or later <http://cthedot.de/encutils/license/>`__ 
 11      * encutils is published under the   
 12        `Creative Commons License <http://creativecommons.org/licenses/by/3.0/>`__. 
 13         
 14      This file is part of encutils. 
 15   
 16      encutils is free software: you can redistribute it and/or modify 
 17      it under the terms of the GNU Lesser General Public License as published by 
 18      the Free Software Foundation, either version 3 of the License, or 
 19      (at your option) any later version. 
 20   
 21      encutils is distributed in the hope that it will be useful, 
 22      but WITHOUT ANY WARRANTY; without even the implied warranty of 
 23      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 24      GNU Lesser General Public License for more details. 
 25   
 26      You should have received a copy of the GNU Lesser General Public License 
 27      along with encutils.  If not, see <http://www.gnu.org/licenses/>. 
 28    
 29   
 30  A collection of helper functions to detect encodings of text files (like HTML, XHTML, XML, CSS, etc.) retrieved via HTTP, file or string. 
 31   
 32  ``getEncodingInfo`` is probably the main function of interest which uses 
 33  other supplied functions itself and gathers all information together and 
 34  supplies an ``EncodingInfo`` object with the following properties: 
 35   
 36  - ``encoding``: The guessed encoding 
 37      Encoding is the explicit or implicit encoding or None and 
 38      always lowercase. 
 39   
 40  - from HTTP response     
 41      * ``http_encoding`` 
 42      * ``http_media_type`` 
 43   
 44  - from HTML <meta> element     
 45      * ``meta_encoding`` 
 46      * ``meta_media_type`` 
 47   
 48  - from XML declaration 
 49      * ``xml_encoding`` 
 50   
 51  example:: 
 52   
 53      >>> import encutils 
 54      >>> info = encutils.getEncodingInfo(url='http://cthedot.de/encutils/') 
 55       
 56      >>> print info  # = str(info) 
 57      utf-8 
 58       
 59      >>> info        # = repr(info) 
 60      <encutils.EncodingInfo object encoding='utf-8' mismatch=False at 0xb86d30> 
 61       
 62      >>> print info.logtext 
 63      HTTP media_type: text/html 
 64      HTTP encoding: utf-8 
 65      HTML META media_type: text/html 
 66      HTML META encoding: utf-8 
 67      Encoding (probably): utf-8 (Mismatch: False) 
 68   
 69   
 70  references 
 71  ========== 
 72  XML 
 73      RFC 3023 (http://www.ietf.org/rfc/rfc3023.txt) 
 74       
 75      easier explained in  
 76          - http://feedparser.org/docs/advanced.html 
 77          - http://www.xml.com/pub/a/2004/07/21/dive.html 
 78           
 79  HTML 
 80      http://www.w3.org/TR/REC-html40/charset.html#h-5.2.2 
 81   
 82  TODO 
 83  ==== 
 84  - parse @charset of HTML elements? 
 85  - check for more texttypes if only text given 
 86       
 87  """ 
# names exported by ``from encutils import *``
__all__ = ['buildlog',
           'encodingByMediaType',
           'getHTTPInfo',
           'getMetaInfo',
           'detectXMLEncoding',
           'getEncodingInfo',
           'tryEncodings',
           'EncodingInfo']
# markup language used by the docstrings in this module
__docformat__ = 'restructuredtext'
__author__ = 'Christof Hoeke'
# version number followed by the version control ``$Id$`` keyword
__version__ = '0.8.3 $Id: __init__.py 1138 2008-03-15 18:24:46Z cthedot $'
 99   
100  import cgi 
101  import HTMLParser 
102  import httplib 
103  import re 
104  import StringIO 
105  import sys 
106  import types 
107  import urllib 
108   
109 -class _MetaHTMLParser(HTMLParser.HTMLParser):
110 """parses given data for <meta http-equiv="content-type">""" 111 content_type = None 112
113 - def handle_starttag(self, tag, attrs):
114 if tag == 'meta' and not self.content_type: 115 atts = dict([(a.lower(), v.lower()) for a, v in attrs]) 116 if atts.get('http-equiv', u'').strip() == u'content-type': 117 self.content_type = atts.get('content')


# Text type classification values, as returned by
# ``_getTextTypeByMediaType`` and ``_getTextType``.

# application/xml, application/xml-dtd,
# application/xml-external-parsed-entity, or a subtype like
# application/rss+xml
_XML_APPLICATION_TYPE = 0

# text/xml, text/xml-external-parsed-entity, or a subtype like
# text/AnythingAtAll+xml
_XML_TEXT_TYPE = 1

# text/html
_HTML_TEXT_TYPE = 2

# any other text/* type, like text/plain
_TEXT_TYPE = 3

# any text/* type which defaults to UTF-8 encoding, for now only text/css
_TEXT_UTF8 = 5

# types not fitting any of the types above
_OTHER_TYPE = 4

class EncodingInfo(object):
    """
    Holds all encoding related information, as returned by
    ``getEncodingInfo``.

    - ``encoding``: The guessed encoding
        Encoding is the explicit or implicit encoding or None and
        always lowercase.

    - from HTTP response
        * ``http_encoding``
        * ``http_media_type``

    - from HTML <meta> element
        * ``meta_encoding``
        * ``meta_media_type``

    - from XML declaration
        * ``xml_encoding``

    - ``mismatch``: True if mismatch between XML declaration and HTTP header
        Mismatch is True if any mismatches between HTTP header, XML
        declaration or textcontent (meta) are found. More detailed mismatch
        reports are written to the optional log or ``logtext``

        Mismatches are not necessarily errors as preferences are defined.
        For details see the specifications.

    - ``logtext``: if no log was given log reports are given here

    """

    def __init__(self):
        """
        Sets every property listed in the class description to ``None``.
        """
        for prop in ('encoding', 'mismatch', 'logtext',
                     'http_encoding', 'http_media_type',
                     'meta_encoding', 'meta_media_type',
                     'xml_encoding'):
            setattr(self, prop, None)

    def __str__(self):
        """
        ``str(EncodingInfo())`` is the guessed encoding itself, or the
        empty string if there is none.
        """
        return self.encoding or u''

    def __repr__(self):
        cls = self.__class__
        return "<%s.%s object encoding=%r mismatch=%s at 0x%x>" % (
            cls.__module__, cls.__name__,
            self.encoding, self.mismatch, id(self))
192 193
def buildlog(logname='encutils', level='INFO', stream=sys.stderr,
             filename=None, filemode="w",
             format='%(levelname)s\t%(message)s'):
    """
    helper to build a basic log

    - if ``filename`` is given returns a log logging to ``filename`` with
      mode ``filemode``
    - else uses a log streaming to ``stream`` which defaults to
      ``sys.stderr``
    - ``level`` defines the level of the log: either the name of a
      ``logging`` level (case is ignored, e.g. ``'DEBUG'`` or ``'debug'``)
      or a numeric level; anything which is not a known level falls back
      to ``logging.INFO``
    - ``format`` defines the formatter format of the log

    returns a log with the name ``logname``

    Every call adds one more handler to the log named ``logname``.
    """
    import logging

    log = logging.getLogger(logname)

    if filename:
        hdlr = logging.FileHandler(filename, filemode)
    else:
        hdlr = logging.StreamHandler(stream)
    hdlr.setFormatter(logging.Formatter(format))
    log.addHandler(hdlr)

    if isinstance(level, int):
        levelno = level
    else:
        # Only accept attributes of ``logging`` which really are numeric
        # levels; a plain ``logging.__dict__`` lookup would also hand
        # classes, functions or modules (e.g. 'Logger') to ``setLevel``.
        levelno = getattr(logging, str(level).upper(), None)
        if not isinstance(levelno, int):
            levelno = logging.INFO
    log.setLevel(levelno)

    return log
225
def _getTextTypeByMediaType(media_type, log=None):
    """
    returns the text type (one of the ``_*_TYPE`` / ``_TEXT_UTF8``
    constants of this module) for the given ``media_type``

    - ``media_type``: a media type like ``'text/html'``; case and
      surrounding whitespace are ignored and parameters following a
      ``;`` (e.g. ``'; charset=utf-8'``) are dropped. If empty or
      ``None`` the result is ``_OTHER_TYPE``.
    - ``log``: accepted for a uniform signature but not used
    """
    if not media_type:
        return _OTHER_TYPE

    # only the "type/subtype" part is relevant for the classification
    media_type = media_type.split(';', 1)[0].strip().lower()

    xml_application_types = (
        'application/xml',
        'application/xml-dtd',
        'application/xml-external-parsed-entity')
    xml_text_types = (
        'text/xml',
        'text/xml-external-parsed-entity')

    # "+xml" must be the end of the subtype, a type merely containing
    # "+xml" somewhere (e.g. "application/x+xmlfoo") is no XML type
    is_xml_subtype = media_type.endswith('+xml')

    if media_type in xml_application_types or (
            is_xml_subtype and media_type.startswith('application/')):
        return _XML_APPLICATION_TYPE
    elif media_type in xml_text_types or (
            is_xml_subtype and media_type.startswith('text/')):
        return _XML_TEXT_TYPE
    elif media_type == 'text/html':
        return _HTML_TEXT_TYPE
    elif media_type == 'text/css':
        return _TEXT_UTF8
    elif media_type.startswith('text/'):
        return _TEXT_TYPE
    else:
        return _OTHER_TYPE
259
def _getTextType(text, log=None):
    """
    checks if given text is XML (**naive test!**)
    used if no content-type given

    Returns ``_XML_APPLICATION_TYPE`` if ``<?xml version=`` is found
    within the first 30 characters of ``text``, else ``_OTHER_TYPE``
    (also for empty or ``None`` text). ``log`` is not used.
    """
    if not text:
        return _OTHER_TYPE

    needle = u'<?xml version='
    if not isinstance(text, type(needle)):
        # Byte strings must be searched with a byte string: mixing both
        # kinds makes Python 2 decode ``text`` as ASCII which fails on
        # e.g. a leading UTF-8 BOM.
        needle = needle.encode('ascii')

    if text[:30].find(needle) != -1:
        return _XML_APPLICATION_TYPE
    return _OTHER_TYPE
269
def encodingByMediaType(media_type, log=None):
    """
    Looks up the default encoding for ``media_type``, e.g. ``'utf-8'``
    for ``media_type='application/xml'``.

    The defaults follow RFC 3023 and the HTTP MIME specification.

    Returns ``None`` if the media type has no default encoding. If a
    ``log`` is given the outcome is reported to it at debug level.
    """
    texttype = _getTextTypeByMediaType(media_type)

    if texttype in (_XML_APPLICATION_TYPE, _TEXT_UTF8):
        encoding = u'utf-8'
    elif texttype == _XML_TEXT_TYPE:
        encoding = u'ascii'
    elif texttype in (_HTML_TEXT_TYPE, _TEXT_TYPE):
        encoding = u'iso-8859-1'  # should be None?
    else:
        encoding = None

    if log:
        if encoding:
            log.debug(
                u'Default encoding for Media Type "%s": %s',
                media_type, encoding)
        else:
            log.debug(u'"%s" Media-Type has no default encoding',
                      media_type)
    return encoding
299
def getHTTPInfo(response, log=None):
    """
    Reads ``(media_type, encoding)`` from the headers of ``response``.

    ``media_type`` is whatever ``response.info().gettype()`` reports and
    ``encoding`` the ``charset`` parameter given by
    ``response.info().getparam('charset')``, lowercased if present.

    If a ``log`` is given both values are reported to it at info level.
    """
    headers = response.info()
    media_type = headers.gettype()
    charset = headers.getparam('charset')

    # a missing (falsy) charset is passed on unchanged
    encoding = charset.lower() if charset else charset

    if log:
        log.info(u'HTTP media_type: %s', media_type)
        log.info(u'HTTP encoding: %s', encoding)

    return media_type, encoding
319
def getMetaInfo(text, log=None):
    """
    Returns (media_type, encoding) information from (first)
    X/HTML Content-Type ``<meta>`` element if available.

    Normally in X/HTML:
        ``<meta http-equiv="Content-Type" content="media_type;
        charset=encoding"/>``

    Both values are ``None`` if no such element is found. ``encoding``
    is lowercased, or ``None`` if the element has no ``charset``
    parameter.

    Markup which makes the parser give up does not abort the detection:
    whatever was found up to that point is used.
    """
    parser = _MetaHTMLParser()
    try:
        parser.feed(text)
    except HTMLParser.HTMLParseError:
        # best effort only: a <meta> element seen before the broken
        # markup is still usable
        if log:
            log.debug(u'HTML could not be parsed completely: %s',
                      sys.exc_info()[1])

    media_type = encoding = None
    if parser.content_type:
        media_type, params = cgi.parse_header(parser.content_type)
        encoding = params.get('charset')  # defaults to None
        if encoding:
            encoding = encoding.lower()
        if log:
            log.info(u'HTML META media_type: %s', media_type)
            log.info(u'HTML META encoding: %s', encoding)

    return media_type, encoding
343
def detectXMLEncoding(fp, log=None, includeDefault=True):
    """
    Attempts to detect the character encoding of the xml file
    given by a file object fp. fp must not be a codec wrapped file
    object! fp may also be a string or unicode string

    The return value can be:
        - if detection of the BOM succeeds, the codec name of the
          corresponding unicode charset is returned

        - if BOM detection fails, the xml declaration is searched for
          the encoding attribute and its (lowercased) value returned.
          the "<" character has to be the very first in the file then
          (it's xml standard after all).

        - if BOM and xml declaration fail, utf-8 is returned according
          to XML 1.0 if ``includeDefault`` is true, else ``None``.

    Input shorter than a BOM (including empty input) is handled like
    input without a BOM. The position of a given file object is restored
    before returning.

    Based on a recipe by Lars Tiede:
    http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/363841
    which itself is based on Paul Prescott's recipe:
    http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52257
    """
    if isinstance(fp, (str, type(u''))):
        fp = StringIO.StringIO(fp)

    ### detection using BOM

    ## the BOMs we know, by their pattern
    bomDict = {  # bytepattern: name
        (0x00, 0x00, 0xFE, 0xFF): "utf_32_be",
        (0xFF, 0xFE, 0x00, 0x00): "utf_32_le",
        (0xFE, 0xFF, None, None): "utf_16_be",
        (0xFF, 0xFE, None, None): "utf_16_le",
        (0xEF, 0xBB, 0xBF, None): "utf-8",
        }

    ## go to beginning of file and get the start of it; the xml
    ## declaration is assumed to fit into the first 2 KB (*cough*)
    oldFP = fp.tell()
    try:
        fp.seek(0)
        head = fp.read(2048)
    finally:
        fp.seek(oldFP)

    ## the first 4 bytes, padded with None if there are fewer
    bomBytes = [ord(char) for char in head[:4]]
    bomBytes.extend([None] * (4 - len(bomBytes)))
    byte1, byte2, byte3, byte4 = bomBytes

    ## try bom detection using 4 bytes, 3 bytes, or 2 bytes
    bomDetection = (bomDict.get((byte1, byte2, byte3, byte4)) or
                    bomDict.get((byte1, byte2, byte3, None)) or
                    bomDict.get((byte1, byte2, None, None)))

    ## if BOM detected, we're done :-)
    if bomDetection:
        if log:
            log.info(u'XML BOM encoding: %s' % bomDetection)
        return bomDetection

    ## still here? BOM detection failed.
    ## now that BOM detection has failed we assume one byte character
    ## encoding behaving ASCII

    ### search xml declaration for encoding attribute

    ## set up regular expression
    xmlDeclPattern = r"""
    ^<\?xml             # w/o BOM, xmldecl starts with <?xml at the first byte
    [^>]*?              # some chars (version info, may span lines) inside the xmldecl
    \bencoding\s*=\s*   # encoding attribute begins, whitespace around = allowed
    ["']                # attribute start delimiter
    (?P<encstr>         # what's matched in the brackets will be named encstr
     [^"']+             # every character not delimiter (not overly exact!)
    )                   # closes the brackets pair for the named group
    ["']                # attribute end delimiter
    [^>]*               # some chars optionally (standalone decl or whitespace)
    \?>                 # xmldecl end
    """
    xmlDeclRE = re.compile(xmlDeclPattern, re.VERBOSE)

    ## search and extract encoding string
    match = xmlDeclRE.search(head)
    if match:
        enc = match.group("encstr").lower()
        if log:
            log.info(u'XML encoding="%s"' % enc)
        return enc

    if includeDefault:
        if log:
            log.info(u'XML encoding default utf-8')
        return u'utf-8'
    return None
440
def tryEncodings(text, log=None):
    """
    If installed uses chardet http://chardet.feedparser.org/ to detect
    encoding, else tries different encodings on text and returns the one
    that does not raise an exception which is not very advanced or may
    be totally wrong.

    Without chardet a hint to install it is sent to ``log`` (or printed
    if no ``log`` is given) and

    - a unicode ``text`` is *encoded* with ascii, iso-8859-1,
      windows-1252 and utf-8 (in this order),
    - a byte string ``text`` is *decoded* with ascii, utf-8,
      windows-1252 and iso-8859-1 (in this order; iso-8859-1 accepts any
      byte sequence and is therefore tried last).

    Returns working encoding or None if no encoding does work at all.

    The returned encoding might nevertheless be not the one intended by the
    author as it is only checked if the text might be encoded in that
    encoding. Some texts might be working in "iso-8859-1" *and*
    "windows-1252" *and* "ascii" *and* "utf-8" and ...
    """
    try:
        import chardet
    except ImportError:
        chardet = None

    if chardet is not None:
        return chardet.detect(text)["encoding"]

    msg = 'Using simplified encoding detection, you might want to install chardet.'
    if log:
        log.warning(msg)
    else:
        sys.stdout.write(msg + '\n')

    if isinstance(text, type(u'')):
        encodings = ('ascii', 'iso-8859-1', 'windows-1252', 'utf-8')
        attempt = text.encode
    else:
        # Bytes have to be decoded: encoding a byte string makes Python 2
        # decode it as ASCII first, which fails for every candidate as
        # soon as the text contains a single non ASCII byte.
        encodings = ('ascii', 'utf-8', 'windows-1252', 'iso-8859-1')
        attempt = text.decode

    for name in encodings:
        try:
            attempt(name)
        except (UnicodeEncodeError, UnicodeDecodeError):
            continue
        return name

    return None
483
def getEncodingInfo(response=None, text=u'', log=None, url=None):
    """
    Finds all encoding related information in given ``text``.
    Uses information in headers of supplied HTTPResponse, possible XML
    declaration and X/HTML ``<meta>`` elements.
    ``text`` will mostly be HTML or XML.

    Parameters
        - ``response``: HTTP response object,
          e.g. ``urllib.urlopen('url')``
        - ``text``: to guess encoding for, might include XML
          prolog with encoding pseudo attribute or HTML meta element
        - ``log``: an optional logging logger to which messages may go, if
          no log given all log messages are available from resulting
          ``EncodingInfo``
        - ``url``: if given it is fetched and both ``response`` and
          ``text`` are taken from the result

    May also simply be called with ``getEncodingInfo(url='URL')`` which fetches
    the url and all needed information. If fetching fails with an
    ``IOError`` the error is printed and ``sys.exit(1)`` is called.

    Returns instance of ``EncodingInfo``.

    How the resulting encoding is retrieved
    =======================================
    XML
    ---
    RFC 3023 states if media type given in the Content-Type HTTP header is
    application/xml, application/xml-dtd,
    application/xml-external-parsed-entity, or any one of the subtypes of
    application/xml such as application/atom+xml or application/rss+xml
    etc then the character encoding is determined in this order:

    1. the encoding given in the charset parameter of the Content-Type HTTP
       header, or
    2. the encoding given in the encoding attribute of the XML declaration
       within the document, or
    3. utf-8.

    Mismatch possibilities:
        - HTTP + XMLdecla
        - HTTP + HTMLmeta

        application/xhtml+xml ?
            XMLdecla + HTMLmeta

    If the media type given in the Content-Type HTTP header is text/xml,
    text/xml-external-parsed-entity, or a subtype like text/Anything+xml,
    the encoding attribute of the XML declaration is ignored completely
    and the character encoding is determined in the order:

    1. the encoding given in the charset parameter of the Content-Type HTTP
       header, or
    2. ascii.

    Mismatch possibilities:
        - HTTP + XMLdecla
        - HTTP + HTMLmeta

        text/xhtml+xml
            XMLdecla + HTMLmeta

    HTML
    ----
    For HTML served as text/html:
        http://www.w3.org/TR/REC-html40/charset.html#h-5.2.2

    1. An HTTP "charset" parameter in a "Content-Type" field.
       (maybe defaults to ISO-8859-1, but should not assume this)
    2. A META declaration with "http-equiv" set to "Content-Type" and a
       value set for "charset".
    3. The charset attribute set on an element that designates an external
       resource. (NOT IMPLEMENTED HERE YET)

    Mismatch possibilities:
        - HTTP + HTMLmeta

    TEXT
    ----
    For most text/* types the encoding will be reported as iso-8859-1.
    Exceptions are XML formats sent as text/* mime type (see above) and
    text/css which has a default encoding of UTF-8.
    """
    opened = None
    if url:
        try:
            opened = response = urllib.urlopen(url)
            text = response.read()
        except IOError:
            # NOTE(review): exiting the interpreter from library code is
            # questionable, raising would let callers decide; kept as is
            # because existing callers may rely on it.
            sys.stdout.write('%s\n' % (IOError(sys.exc_info()[1]),))
            sys.exit(1)

    encinfo = EncodingInfo()

    logstream = StringIO.StringIO()
    ownlog = not log
    if ownlog:
        log = buildlog(stream=logstream, format='%(message)s')

    try:
        # HTTP
        if response:
            encinfo.http_media_type, encinfo.http_encoding = getHTTPInfo(
                response, log)
            texttype = _getTextTypeByMediaType(encinfo.http_media_type, log)
        else:
            # check if maybe XML or (TODO:) HTML
            texttype = _getTextType(text, log)

        # XML
        if texttype == _XML_APPLICATION_TYPE or texttype == _XML_TEXT_TYPE:
            encinfo.xml_encoding = detectXMLEncoding(text, log)

        # XML (also XHTML served as text/html)
        if texttype == _HTML_TEXT_TYPE:
            encinfo.xml_encoding = detectXMLEncoding(text, log,
                                                     includeDefault=False)

        # HTML
        if texttype == _HTML_TEXT_TYPE or texttype == _TEXT_TYPE:
            encinfo.meta_media_type, encinfo.meta_encoding = getMetaInfo(
                text, log)

        # guess
        # 1. HTTP charset?
        encinfo.encoding = encinfo.http_encoding
        encinfo.mismatch = False

        # 2. media_type?
        #   XML application/...
        if texttype == _XML_APPLICATION_TYPE:
            if not encinfo.encoding:
                encinfo.encoding = encinfo.xml_encoding
                # xml_encoding has default of utf-8

        #   text/html
        elif texttype == _HTML_TEXT_TYPE:
            if not encinfo.encoding:
                encinfo.encoding = encinfo.meta_encoding
            if not encinfo.encoding:
                encinfo.encoding = encodingByMediaType(
                    encinfo.http_media_type)
            if not encinfo.encoding:
                guessed = tryEncodings(text)
                # ``encoding`` is documented to be always lowercase
                if guessed:
                    guessed = guessed.lower()
                encinfo.encoding = guessed

        #   text/... + xml, text/* or text/css (which defaults to utf-8)
        elif texttype in (_XML_TEXT_TYPE, _TEXT_TYPE, _TEXT_UTF8):
            if not encinfo.encoding:
                encinfo.encoding = encodingByMediaType(
                    encinfo.http_media_type)

        # possible mismatches, checks if present at all and then if equal
        checks = (
            # HTTP + XML
            (encinfo.http_encoding, encinfo.xml_encoding,
             u'"%s" (HTTP) <> "%s" (XML) encoding mismatch'),
            # HTTP + Meta
            (encinfo.http_encoding, encinfo.meta_encoding,
             u'"%s" (HTTP) <> "%s" (HTML <meta>) encoding mismatch'),
            # XML + Meta
            (encinfo.xml_encoding, encinfo.meta_encoding,
             u'"%s" (XML) <> "%s" (HTML <meta>) encoding mismatch'),
            )
        for first, second, msg in checks:
            if first and second and first != second:
                encinfo.mismatch = True
                log.warning(msg % (first, second))

        log.info(u'Encoding (probably): %s (Mismatch: %s)',
                 encinfo.encoding, encinfo.mismatch)

        encinfo.logtext = logstream.getvalue()
        return encinfo
    finally:
        if ownlog:
            # The log built above is a shared, named logger: without
            # removing the handler again every call would add one more
            # handler (and keep its stream alive) forever.
            for hdlr in list(log.handlers):
                if getattr(hdlr, 'stream', None) is logstream:
                    log.removeHandler(hdlr)
        if opened is not None:
            # only close what was opened here, never a given response
            opened.close()


if __name__ == '__main__':
    # when run as a script show the documentation of this module
    import pydoc
    pydoc.help(__name__)
