1
2 """Helper functions to detect encodings of text files.
3
4 ====================
5 encutils
6 ====================
7 :Author: Christof Hoeke
8 :License: This work is licensed under a
9 Creative Commons License http://creativecommons.org/licenses/by/2.5/
10
11 Website: http://cthedot.de/encutils/
12
13 Some basic helper functions to deal with encodings of text files (like
14 HTML, XHTML, XML, CSS, etc.) via HTTP and directly.
15
16 ``getEncodingInfo`` is probably the main function of interest which uses
17 other supplied functions itself and gathers all information together and
18 supplies an ``EncodingInfo`` object with the following properties:
19
20 - ``encoding``: The guessed encoding
21 Encoding is the explicit or implicit encoding or None and
22 always lowercase.
23
24 - from HTTP response
25 * ``http_encoding``
26 * ``http_media_type``
27
28 - from HTML <meta> element
29 * ``meta_encoding``
30 * ``meta_media_type``
31
32 - from XML declaration
33 * ``xml_encoding``
34
35 Requires Python 2.3 or later
36
37 references
38 ==========
39 XML
40 RFC 3023 (http://www.ietf.org/rfc/rfc3023.txt)
41
42 easier explained in
43 - http://feedparser.org/docs/advanced.html
44 - http://www.xml.com/pub/a/2004/07/21/dive.html
45
46 HTML
47 http://www.w3.org/TR/REC-html40/charset.html#h-5.2.2
48
49 TODO:
50 - HTML meta elements in comments? (use HTMLParser?)
51 - parse @charset of HTML elements?
52 - check for more texttypes if only text given
53 """
# Module metadata: docstring markup language, release version and the
# names that make up the public API.
__docformat__ = 'restructuredtext'
__version__ = '0.7a1'
__all__ = [
    'buildlog',
    'encodingByMediaType',
    'getHTTPInfo',
    'getMetaInfo',
    'detectXMLEncoding',
    'getEncodingInfo',
    'guessEncoding',
    'tryEncodings',
    'EncodingInfo',
]
65
66 import cgi
67 import httplib
68 import re
69 import StringIO
70 import sys
71 import types
72
# ``True`` and ``False`` are no longer rebound here: they are built-ins on
# every Python version this module supports (2.3 or later, see the module
# docstring), and assigning to them is a syntax error on Python 3.


#: application/xml, application/xml-dtd,
#: application/xml-external-parsed-entity, or
#: a subtype like application/rss+xml.
_XML_APPLICATION_TYPE = 0

#: text/xml, text/xml-external-parsed-entity, or a subtype like
#: text/AnythingAtAll+xml
_XML_TEXT_TYPE = 1

#: text/html
_HTML_TEXT_TYPE = 2

#: any other of text/* like text/plain, text/css, ...
_TEXT_TYPE = 3

#: types not fitting in above types
_OTHER_TYPE = 4
100
101
103 """
104 All encoding related information, returned by ``getEncodingInfo``
105
106 - ``encoding``: The guessed encoding
107 Encoding is the explicit or implicit encoding or None and
108 always lowercase.
109
110 - from HTTP response
111 * ``http_encoding``
112 * ``http_media_type``
113
114 - from HTML <meta> element
115 * ``meta_encoding``
116 * ``meta_media_type``
117
118 - from XML declaration
119 * ``xml_encoding``
120
121 - ``mismatch``: True if mismatch between XML declaration and HTTP header
122 Mismatch is True if any mismatches between HTTP header, XML
123 declaration or textcontent (meta) are found. More detailed mismatch
124 reports are written to the optional log or ``logtext``
125
126 Mismatches are not necessarily errors as preferences are defined.
127 For details see the specifications.
128
129 - ``logtext``: if no log was given log reports are given here
130 """
131
133 """
134 initializes all possible properties to ``None``, see class
135 description
136 """
137 self.encoding = self.mismatch = self.logtext =\
138 self.http_encoding = self.http_media_type =\
139 self.meta_encoding = self.meta_media_type =\
140 self.xml_encoding =\
141 None
142
144 """
145 ``str(EncodingInfo())`` is the guessed encoding itself
146 """
147 if self.encoding:
148 return self.encoding
149 else:
150 return u''
151
152
def buildlog(logname='encutils', level='INFO', stream=sys.stderr,
             filename=None, filemode="w",
             format='%(levelname)s\t%(message)s'):
    """
    helper to build a basic log

    - if ``filename`` is given returns a log logging to ``filename`` with
      mode ``filemode``
    - else uses a log streaming to ``stream`` which defaults to
      ``sys.stderr``
    - ``level`` is the name of a level constant of the ``logging`` module
      (e.g. ``'DEBUG'``); case is ignored and any value which does not
      name such a constant falls back to ``logging.INFO``
    - ``format`` defines the formatter format of the log

    returns a log with the name ``logname``

    Every call adds a new handler to the logger named ``logname``;
    handlers added by earlier calls are left in place.
    """
    import logging

    log = logging.getLogger(logname)

    if filename:
        hdlr = logging.FileHandler(filename, filemode)
    else:
        hdlr = logging.StreamHandler(stream)
    hdlr.setFormatter(logging.Formatter(format))
    log.addHandler(hdlr)

    # Resolve the level by name, but accept integer constants only: the
    # logging namespace also contains functions and classes (``debug``,
    # ``Logger``, ...) which must never be handed to setLevel().
    levelname = level
    if hasattr(level, 'upper'):
        levelname = level.upper()
    levelno = vars(logging).get(levelname)
    if not isinstance(levelno, int):
        levelno = logging.INFO
    log.setLevel(levelno)

    return log
184
185
187 """
188 returns type as defined by constants above
189 """
190 if not media_type:
191 return _OTHER_TYPE
192
193 xml_application_types = [
194 ur'application/.*?\+xml',
195 u'application/xml',
196 u'application/xml-dtd',
197 u'application/xml-external-parsed-entity']
198 xml_text_types = [
199 ur'text\/.*?\+xml',
200 u'text/xml',
201 u'text/xml-external-parsed-entity']
202
203 media_type = media_type.strip().lower()
204
205 if media_type in xml_application_types or\
206 re.match(xml_application_types[0], media_type, re.I|re.S|re.X):
207 xmltype = _XML_APPLICATION_TYPE
208 elif media_type in xml_text_types or\
209 re.match(xml_text_types[0], media_type, re.I|re.S|re.X):
210 xmltype = _XML_TEXT_TYPE
211 elif media_type == u'text/html':
212 xmltype = _HTML_TEXT_TYPE
213 elif media_type.startswith(u'text/'):
214 xmltype = _TEXT_TYPE
215 else:
216 xmltype = _OTHER_TYPE
217
218 return xmltype
219
220
def _getTextType(text, log=None):
    """
    naive sniffing of the text type, used if no content-type is given

    returns ``_XML_APPLICATION_TYPE`` if ``<?xml version=`` is found in
    the first 30 characters of ``text``, else ``_OTHER_TYPE``

    ``log`` is accepted but not used
    """
    head = text[:30]
    looks_like_xml = head.find(u'<?xml version=') >= 0
    if looks_like_xml:
        return _XML_APPLICATION_TYPE
    return _OTHER_TYPE
230
231
260
261
263 """
264 Returns ``(media_type, encoding)`` information from the response's
265 Content-Type HTTP header. (Case of headers is ignored.)
266 May be ``(None, None)`` e.g. if no Content-Type header is
267 available.
268 """
269 info = response.info()
270 media_type = info.gettype()
271 encoding = info.getparam('charset')
272
273 if encoding:
274 encoding = encoding.lower()
275
276 if log:
277 log.info(u'HTTP media_type: %s', media_type)
278 log.info(u'HTTP encoding : %s', encoding)
279
280 return media_type, encoding
281
282
317
318
320 """
321 Attempts to detect the character encoding of the xml file
322 given by a file object fp. fp must not be a codec wrapped file
323 object! fp may also be a string or unicode string
324
325 The return value can be:
326 - if detection of the BOM succeeds, the codec name of the
327 corresponding unicode charset is returned
328
329 - if BOM detection fails, the xml declaration is searched for
330 the encoding attribute and its value returned. the "<"
331 character has to be the very first in the file then (it's xml
332 standard after all).
333
334 - if BOM and xml declaration fail, utf-8 is returned according
335 to XML 1.0.
336
337 Based on a recipe by Lars Tiede:
338 http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/363841
339 which itself is based on Paul Prescod's recipe:
340 http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52257
341 """
342 if type(fp) in types.StringTypes:
343 fp = StringIO.StringIO(fp)
344
345
346
347
348 bomDict={
349 (0x00, 0x00, 0xFE, 0xFF) : "utf_32_be",
350 (0xFF, 0xFE, 0x00, 0x00) : "utf_32_le",
351 (0xFE, 0xFF, None, None) : "utf_16_be",
352 (0xFF, 0xFE, None, None) : "utf_16_le",
353 (0xEF, 0xBB, 0xBF, None) : "utf-8",
354 }
355
356
357 oldFP = fp.tell()
358 fp.seek(0)
359 (byte1, byte2, byte3, byte4) = tuple(map(ord, fp.read(4)))
360
361
362 bomDetection = bomDict.get((byte1, byte2, byte3, byte4))
363 if not bomDetection:
364 bomDetection = bomDict.get((byte1, byte2, byte3, None))
365 if not bomDetection:
366 bomDetection = bomDict.get((byte1, byte2, None, None))
367
368
369 if bomDetection:
370 if log:
371 log.info(u'XML BOM encoding: %s' % bomDetection)
372 fp.seek(oldFP)
373 return bomDetection
374
375
376
377
378
379
380
381
382 fp.seek(0)
383 buffer = fp.read(2048)
384
385
386 xmlDeclPattern = r"""
387 ^<\?xml # w/o BOM, xmldecl starts with <?xml at the first byte
388 .+? # some chars (version info), matched minimal
389 encoding= # encoding attribute begins
390 ["'] # attribute start delimiter
391 (?P<encstr> # what's matched in the brackets will be named encstr
392 [^"']+ # every character not delimiter (not overly exact!)
393 ) # closes the brackets pair for the named group
394 ["'] # attribute end delimiter
395 .*? # some chars optionally (standalone decl or whitespace)
396 \?> # xmldecl end
397 """
398 xmlDeclRE = re.compile(xmlDeclPattern, re.VERBOSE)
399
400
401 match = xmlDeclRE.search(buffer)
402 fp.seek(oldFP)
403 if match:
404 enc = match.group("encstr").lower()
405 if log:
406 log.info(u'XML encoding="%s"' % enc)
407 return enc
408 else:
409 if log:
410 log.info(u'XML encoding default utf-8')
411 return u'utf-8'
412
413
415 """
416 Uses chardet (http://chardet.feedparser.org/) to detect the encoding
417 if it is installed. Otherwise tries a few common encodings on ``text``
418 and returns the first one that does not raise an exception; this
419 fallback is not very advanced and its result may be totally wrong.
420
421 Returns working encoding or None if no encoding does work at all.
422
423 The returned encoding might nevertheless be not the one intended by the
424 author as it is only checked if the text might be encoded in that
425 encoding. Some texts might be working in "iso-8859-1" *and*
426 "windows-1252" *and* "ascii" *and* "utf-8" and ...
427 """
428 try:
429 import chardet
430 encoding = chardet.detect(text)["encoding"]
431
432 except ImportError:
433 msg = 'Using simplified encoding detection, you might want to install chardet instead.'
434 if log:
435 log.warn(msg)
436 else:
437 print msg
438
439 encodings = (
440 'ascii',
441 'iso-8859-1',
442 'windows-1252',
443 'utf-8'
444 )
445 encoding = None
446 for e in encodings:
447 try:
448 text.encode(e)
449 except (UnicodeEncodeError, UnicodeDecodeError):
450 pass
451 else:
452 encoding = e
453 break
454
455 return encoding
456
457
459 """
460 Finds all encoding related information in given ``text``.
461 Uses information in headers of supplied HTTPResponse, possible XML
462 declaration and X/HTML ``<meta>`` elements.
463 ``text`` will mostly be HTML or XML.
464
465 For some texts mismatches may be reported which are not really
466 mismatches. These false warnings appear e.g. if an HTTP media type of
467 ``text/html`` is sent (which is sometimes also used for XHTML) and HTML
468 is actually served. In this case the XML default of 'utf-8', which is
469 not relevant, may nevertheless be reported as a mismatch with the
470 HTTP or ``<meta>`` element information.
471
472 Parameters
473 - ``response``: HTTP response object,
474 e.g. ``urllib.urlopen('url')``
475 - ``text``: to guess encoding for, might include XML
476 prolog with encoding pseudo attribute or HTML meta element
477 - ``log``: an optional logging logger to which messages may go, if
478 no log given all log messages are available from resulting
479 ``EncodingInfo``
480
481 Returns instance of ``EncodingInfo``.
482
483
484 How the resulting encoding is retrieved
485 =======================================
486
487 XML
488 ---
489 RFC 3023 states if media type given in the Content-Type HTTP header is
490 application/xml, application/xml-dtd,
491 application/xml-external-parsed-entity, or any one of the subtypes of
492 application/xml such as application/atom+xml or application/rss+xml
493 etc then the character encoding is determined in this order:
494
495 1. the encoding given in the charset parameter of the Content-Type HTTP
496 header, or
497 2. the encoding given in the encoding attribute of the XML declaration
498 within the document, or
499 3. utf-8.
500
501 Mismatch possibilities:
502 - HTTP + XMLdecla
503 - HTTP + HTMLmeta
504
505 application/xhtml+xml ?
506 XMLdecla + HTMLmeta
507
508 If the media type given in the Content-Type HTTP header is text/xml,
509 text/xml-external-parsed-entity, or a subtype like text/Anything+xml,
510 the encoding attribute of the XML declaration is ignored completely
511 and the character encoding is determined in the order:
512 1. the encoding given in the charset parameter of the Content-Type HTTP
513 header, or
514 2. ascii.
515
516 Mismatch possibilities:
517 - HTTP + XMLdecla
518 - HTTP + HTMLmeta
519
520 text/xhtml+xml
521 XMLdecla + HTMLmeta
522
523 HTML
524 ----
525 For HTML served as text/html:
526 http://www.w3.org/TR/REC-html40/charset.html#h-5.2.2
527
528 1. An HTTP "charset" parameter in a "Content-Type" field.
529 (maybe defaults to ISO-8859-1, but should not assume this)
530 2. A META declaration with "http-equiv" set to "Content-Type" and a
531 value set for "charset".
532 3. The charset attribute set on an element that designates an external
533 resource. (NOT IMPLEMENTED HERE YET)
534
535 Mismatch possibilities:
536 - HTTP + HTMLmeta
537 """
538 encinfo = EncodingInfo()
539
540 logstream = StringIO.StringIO()
541 if not log:
542 log = buildlog(stream=logstream, format='%(message)s')
543
544
545 if response:
546 encinfo.http_media_type, encinfo.http_encoding = getHTTPInfo(
547 response, log)
548 texttype = _getTextTypeByMediaType(encinfo.http_media_type, log)
549 else:
550
551 texttype = _getTextType(text, log)
552
553
554 if texttype == _XML_APPLICATION_TYPE or texttype == _XML_TEXT_TYPE or \
555 texttype == _HTML_TEXT_TYPE:
556 encinfo.xml_encoding = detectXMLEncoding(text, log)
557
558
559 if texttype == _HTML_TEXT_TYPE or texttype == _TEXT_TYPE:
560 encinfo.meta_media_type, encinfo.meta_encoding = getMetaInfo(
561 text, log)
562
563
564
565 encinfo.encoding = encinfo.http_encoding
566 encinfo.mismatch = False
567
568
569
570 if texttype == _XML_APPLICATION_TYPE:
571 if not encinfo.encoding:
572 encinfo.encoding = encinfo.xml_encoding
573
574
575
576 elif texttype == _HTML_TEXT_TYPE:
577 if not encinfo.encoding:
578 encinfo.encoding = encinfo.meta_encoding
579 if not encinfo.encoding:
580 encinfo.encoding = encodingByMediaType(encinfo.http_media_type)
581 if not encinfo.encoding:
582 encinfo.encoding = tryEncodings(text)
583
584
585 elif texttype == _XML_TEXT_TYPE or texttype == _TEXT_TYPE:
586 if not encinfo.encoding:
587 encinfo.encoding = encodingByMediaType(encinfo.http_media_type)
588
589
590
591
592 if encinfo.http_encoding and encinfo.xml_encoding and\
593 encinfo.http_encoding <> encinfo.xml_encoding:
594 encinfo.mismatch = True
595 log.warn(u'"%s" (HTTP) <> "%s" (XML) encoding mismatch' %
596 (encinfo.http_encoding, encinfo.xml_encoding))
597
598 if encinfo.http_encoding and encinfo.meta_encoding and\
599 encinfo.http_encoding <> encinfo.meta_encoding:
600 encinfo.mismatch = True
601 log.warn(u'"%s" (HTTP) <> "%s" (HTML <meta>) encoding mismatch' %
602 (encinfo.http_encoding, encinfo.meta_encoding))
603
604 if encinfo.xml_encoding and encinfo.meta_encoding and\
605 encinfo.xml_encoding <> encinfo.meta_encoding:
606 encinfo.mismatch = True
607 log.warn(u'"%s" (XML) <> "%s" (HTML <meta>) encoding mismatch' %
608 (encinfo.xml_encoding, encinfo.meta_encoding))
609
610 log.info(u'Encoding guessed: %s (Mismatch: %s)',
611 encinfo.encoding, encinfo.mismatch)
612
613 encinfo.logtext = logstream.getvalue()
614 return encinfo
615
616
if __name__ == '__main__':
    # Executed as a script: show this module's own documentation.
    import pydoc

    pydoc.help(__name__)
620