1
2 """encutils - encoding detection collection for Python
3
4 encutils
5 ========
6 :Author: Christof Hoeke, see http://cthedot.de/encutils/
7 :Copyright: 2005-2008: Christof Hoeke
8 :License: encutils has a dual-license, please choose whatever you prefer:
9
10 * encutils is published under the `LGPL 3 or later <http://cthedot.de/encutils/license/>`__
11 * encutils is published under the
12 `Creative Commons License <http://creativecommons.org/licenses/by/3.0/>`__.
13
14 This file is part of encutils.
15
16 encutils is free software: you can redistribute it and/or modify
17 it under the terms of the GNU Lesser General Public License as published by
18 the Free Software Foundation, either version 3 of the License, or
19 (at your option) any later version.
20
21 encutils is distributed in the hope that it will be useful,
22 but WITHOUT ANY WARRANTY; without even the implied warranty of
23 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 GNU Lesser General Public License for more details.
25
26 You should have received a copy of the GNU Lesser General Public License
27 along with encutils. If not, see <http://www.gnu.org/licenses/>.
28
29
30 A collection of helper functions to detect encodings of text files (like HTML, XHTML, XML, CSS, etc.) retrieved via HTTP, file or string.
31
32 ``getEncodingInfo`` is probably the main function of interest which uses
33 other supplied functions itself and gathers all information together and
34 supplies an ``EncodingInfo`` object with the following properties:
35
36 - ``encoding``: The guessed encoding
37 Encoding is the explicit or implicit encoding or None and
38 always lowercase.
39
40 - from HTTP response
41 * ``http_encoding``
42 * ``http_media_type``
43
44 - from HTML <meta> element
45 * ``meta_encoding``
46 * ``meta_media_type``
47
48 - from XML declaration
49 * ``xml_encoding``
50
51 example::
52
53 >>> import encutils
54 >>> info = encutils.getEncodingInfo(url='http://cthedot.de/encutils/')
55
56 >>> print info # = str(info)
57 utf-8
58
59 >>> info # = repr(info)
60 <encutils.EncodingInfo object encoding='utf-8' mismatch=False at 0xb86d30>
61
62 >>> print info.logtext
63 HTTP media_type: text/html
64 HTTP encoding: utf-8
65 HTML META media_type: text/html
66 HTML META encoding: utf-8
67 Encoding (probably): utf-8 (Mismatch: False)
68
69
70 references
71 ==========
72 XML
73 RFC 3023 (http://www.ietf.org/rfc/rfc3023.txt)
74
75 easier explained in
76 - http://feedparser.org/docs/advanced.html
77 - http://www.xml.com/pub/a/2004/07/21/dive.html
78
79 HTML
80 http://www.w3.org/TR/REC-html40/charset.html#h-5.2.2
81
82 TODO
83 ====
84 - parse @charset of HTML elements?
85 - check for more texttypes if only text given
86
87 """
# Public names of this module; this list is what ``from encutils import *``
# exports.
88 __all__ = ['buildlog',
89 'encodingByMediaType',
90 'getHTTPInfo',
91 'getMetaInfo',
92 'detectXMLEncoding',
93 'getEncodingInfo',
94 'tryEncodings',
95 'EncodingInfo']
# Module metadata: docstring markup format, author and version/revision id.
96 __docformat__ = 'restructuredtext'
97 __author__ = 'Christof Hoeke'
98 __version__ = '0.8.3 $Id: __init__.py 1138 2008-03-15 18:24:46Z cthedot $'
99
100 import cgi
101 import HTMLParser
102 import httplib
103 import re
104 import StringIO
105 import sys
106 import types
107 import urllib
108
118
119
120
# Text-type codes produced by this module's media-type / text classification
# and consumed when the final encoding is chosen.

# application/xml, application/xml-dtd,
# application/xml-external-parsed-entity and application/*+xml; also used
# for raw text whose first 30 characters contain an XML declaration.
121 _XML_APPLICATION_TYPE = 0
122
123
# text/xml, text/xml-external-parsed-entity and text/*+xml.
124 _XML_TEXT_TYPE = 1
125
126
# text/html.
127 _HTML_TEXT_TYPE = 2
128
129
# Any other text/* media type.
130 _TEXT_TYPE = 3
131
132
# text/css. NOTE: its value (5) is deliberately distinct from, but listed
# before, _OTHER_TYPE (4); the numeric order carries no meaning here.
133 _TEXT_UTF8 = 5
134
135
# Missing media type or anything not matched above.
136 _OTHER_TYPE = 4
137
139 """
140 All encoding related information, returned by ``getEncodingInfo``
141
142 - ``encoding``: The guessed encoding
143 Encoding is the explicit or implicit encoding or None and
144 always lowercase.
145
146 - from HTTP response
147 * ``http_encoding``
148 * ``http_media_type``
149
150 - from HTML <meta> element
151 * ``meta_encoding``
152 * ``meta_media_type``
153
154 - from XML declaration
155 * ``xml_encoding``
156
157 - ``mismatch``: True if mismatch between XML declaration and HTTP header
158 Mismatch is True if any mismatches between HTTP header, XML
159 declaration or textcontent (meta) are found. More detailed mismatch
160 reports are written to the optional log or ``logtext``
161
162 Mismatches are not necessarily errors as preferences are defined.
163 For details see the specifications.
164
165 - ``logtext``: if no log was given log reports are given here
166
167 """
169 """
170 initializes all possible properties to ``None``, see class
171 description
172 """
173 self.encoding = self.mismatch = self.logtext =\
174 self.http_encoding = self.http_media_type =\
175 self.meta_encoding = self.meta_media_type =\
176 self.xml_encoding =\
177 None
178
180 """
181 ``str(EncodingInfo())`` outputs the guessed encoding itself or the empty string
182 """
183 if self.encoding:
184 return self.encoding
185 else:
186 return u''
187
189 return "<%s.%s object encoding=%r mismatch=%s at 0x%x>" % (
190 self.__class__.__module__, self.__class__.__name__,
191 self.encoding, self.mismatch, id(self))
192
193
def buildlog(logname='encutils', level='INFO', stream=sys.stderr,
             filename=None, filemode="w",
             format='%(levelname)s\t%(message)s'):
    """
    Build and return a basic ``logging`` logger.

    Parameters
    - ``logname``: name of the logger to fetch via ``logging.getLogger``
    - ``level``: threshold of the log, either the name of a standard
      logging level (``'DEBUG'``, ``'INFO'``, ``'WARNING'``, ... matched
      case-insensitively) or a numeric level. Anything that is not a
      valid level falls back to ``logging.INFO``.
    - ``stream``: stream the log writes to if no ``filename`` is given,
      defaults to ``sys.stderr``
    - ``filename``: if given the log writes to this file (opened with
      mode ``filemode``) instead of ``stream``
    - ``filemode``: mode used to open ``filename``
    - ``format``: format string handed to ``logging.Formatter``

    Returns the logger named ``logname``.

    Note that every call adds one more handler to the named logger;
    handlers added by earlier calls are left in place.
    """
    import logging

    log = logging.getLogger(logname)

    if filename:
        hdlr = logging.FileHandler(filename, filemode)
    else:
        hdlr = logging.StreamHandler(stream)
    hdlr.setFormatter(logging.Formatter(format))
    log.addHandler(hdlr)

    # Resolve ``level`` to a real numeric level. A plain lookup in the
    # logging namespace would also hit functions and classes such as
    # ``logging.info`` or ``logging.Logger``, which are not levels, so the
    # looked-up value is only accepted if it is an integer.
    if isinstance(level, int) and not isinstance(level, bool):
        numeric_level = level
    else:
        try:
            candidate = getattr(logging, level.upper(), None)
        except (AttributeError, TypeError, UnicodeError):
            # not a string at all (e.g. None) or not a usable name
            candidate = None
        if isinstance(candidate, int) and not isinstance(candidate, bool):
            numeric_level = candidate
        else:
            numeric_level = logging.INFO
    log.setLevel(numeric_level)

    return log
225
227 """
228 returns type as defined by constants above
229 """
230 if not media_type:
231 return _OTHER_TYPE
232
233 xml_application_types = [
234 ur'application/.*?\+xml',
235 u'application/xml',
236 u'application/xml-dtd',
237 u'application/xml-external-parsed-entity']
238 xml_text_types = [
239 ur'text\/.*?\+xml',
240 u'text/xml',
241 u'text/xml-external-parsed-entity']
242
243 media_type = media_type.strip().lower()
244
245 if media_type in xml_application_types or\
246 re.match(xml_application_types[0], media_type, re.I|re.S|re.X):
247 return _XML_APPLICATION_TYPE
248 elif media_type in xml_text_types or\
249 re.match(xml_text_types[0], media_type, re.I|re.S|re.X):
250 return _XML_TEXT_TYPE
251 elif media_type == u'text/html':
252 return _HTML_TEXT_TYPE
253 elif media_type == u'text/css':
254 return _TEXT_UTF8
255 elif media_type.startswith(u'text/'):
256 return _TEXT_TYPE
257 else:
258 return _OTHER_TYPE
259
def _getTextType(text, log=None):
    """
    Classify ``text`` when no content-type is available (**naive test!**).

    Only the first 30 characters are inspected: if they contain the start
    of an XML declaration the text counts as application XML, otherwise it
    is reported as "other". ``log`` is accepted but not used.
    """
    has_xml_declaration = text[:30].find(u'<?xml version=') >= 0
    if not has_xml_declaration:
        return _OTHER_TYPE
    return _XML_APPLICATION_TYPE
269
299
301 """
302 Returns ``(media_type, encoding)`` information from the response'
303 Content-Type HTTP header. (Case of headers is ignored.)
304 May be ``(None, None)`` e.g. if no Content-Type header is
305 available.
306 """
307 info = response.info()
308 media_type = info.gettype()
309 encoding = info.getparam('charset')
310
311 if encoding:
312 encoding = encoding.lower()
313
314 if log:
315 log.info(u'HTTP media_type: %s', media_type)
316 log.info(u'HTTP encoding: %s', encoding)
317
318 return media_type, encoding
319
343
345 """
346 Attempts to detect the character encoding of the xml file
347 given by a file object fp. fp must not be a codec wrapped file
348 object! fp may also be a string or unicode string
349
350 The return value can be:
351 - if detection of the BOM succeeds, the codec name of the
352 corresponding unicode charset is returned
353
354 - if BOM detection fails, the xml declaration is searched for
355 the encoding attribute and its value returned. the "<"
356 character has to be the very first in the file then (it's xml
357 standard after all).
358
359 - if BOM and xml declaration fail, utf-8 is returned according
360 to XML 1.0.
361
362 Based on a recipe by Lars Tiede:
363 http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/363841
364 which itself is based on Paul Prescotts recipe:
365 http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52257
366 """
367 if type(fp) in types.StringTypes:
368 fp = StringIO.StringIO(fp)
369
370
371
372
373 bomDict={
374 (0x00, 0x00, 0xFE, 0xFF) : "utf_32_be",
375 (0xFF, 0xFE, 0x00, 0x00) : "utf_32_le",
376 (0xFE, 0xFF, None, None) : "utf_16_be",
377 (0xFF, 0xFE, None, None) : "utf_16_le",
378 (0xEF, 0xBB, 0xBF, None) : "utf-8",
379 }
380
381
382 oldFP = fp.tell()
383 fp.seek(0)
384 (byte1, byte2, byte3, byte4) = tuple(map(ord, fp.read(4)))
385
386
387 bomDetection = bomDict.get((byte1, byte2, byte3, byte4))
388 if not bomDetection:
389 bomDetection = bomDict.get((byte1, byte2, byte3, None))
390 if not bomDetection:
391 bomDetection = bomDict.get((byte1, byte2, None, None))
392
393
394 if bomDetection:
395 if log:
396 log.info(u'XML BOM encoding: %s' % bomDetection)
397 fp.seek(oldFP)
398 return bomDetection
399
400
401
402
403
404
405
406
407 fp.seek(0)
408 buffer = fp.read(2048)
409
410
411 xmlDeclPattern = r"""
412 ^<\?xml # w/o BOM, xmldecl starts with <?xml at the first byte
413 .+? # some chars (version info), matched minimal
414 encoding= # encoding attribute begins
415 ["'] # attribute start delimiter
416 (?P<encstr> # what's matched in the brackets will be named encstr
417 [^"']+ # every character not delimiter (not overly exact!)
418 ) # closes the brackets pair for the named group
419 ["'] # attribute end delimiter
420 .*? # some chars optionally (standalone decl or whitespace)
421 \?> # xmldecl end
422 """
423 xmlDeclRE = re.compile(xmlDeclPattern, re.VERBOSE)
424
425
426 match = xmlDeclRE.search(buffer)
427 fp.seek(oldFP)
428 if match:
429 enc = match.group("encstr").lower()
430 if log:
431 log.info(u'XML encoding="%s"' % enc)
432 return enc
433 else:
434 if includeDefault:
435 if log:
436 log.info(u'XML encoding default utf-8')
437 return u'utf-8'
438 else:
439 return None
440
442 """
443 If installed uses chardet http://chardet.feedparser.org/ to detect
444 encoding, else tries different encodings on text and returns the one
445 that does not raise an exception which is not very advanced or may
446 be totally wrong.
447
448 Returns working encoding or None if no encoding does work at all.
449
450 The returned encoding might nevertheless be not the one intended by the
451 author as it is only checked if the text might be encoded in that
452 encoding. Some texts might be working in "iso-8859-1" *and*
453 "windows-1252" *and* "ascii" *and* "utf-8" and ...
454 """
455 try:
456 import chardet
457 encoding = chardet.detect(text)["encoding"]
458
459 except ImportError:
460 msg = 'Using simplified encoding detection, you might want to install chardet.'
461 if log:
462 log.warn(msg)
463 else:
464 print msg
465
466 encodings = (
467 'ascii',
468 'iso-8859-1',
469 'windows-1252',
470 'utf-8'
471 )
472 encoding = None
473 for e in encodings:
474 try:
475 text.encode(e)
476 except (UnicodeEncodeError, UnicodeDecodeError):
477 pass
478 else:
479 encoding = e
480 break
481
482 return encoding
483
485 """
486 Finds all encoding related information in given ``text``.
487 Uses information in headers of supplied HTTPResponse, possible XML
488 declaration and X/HTML ``<meta>`` elements.
489 ``text`` will mostly be HTML or XML.
490
491 Parameters
492 - ``response``: HTTP response object,
493 e.g. ``urllib.urlopen('url')``
494 - ``text``: to guess encoding for, might include XML
495 prolog with encoding pseudo attribute or HTML meta element
496 - ``log``: an optional logging logger to which messages may go, if
497 no log given all log messages are available from resulting
498 ``EncodingInfo``
499
500 May also simply be called with ``getEncodingInfo(url='URL')`` which fetches
501 the url and all needed information.
502
503 Returns instance of ``EncodingInfo``.
504
505 How the resulting encoding is retrieved
506 =======================================
507 XML
508 ---
509 RFC 3023 states if media type given in the Content-Type HTTP header is
510 application/xml, application/xml-dtd,
511 application/xml-external-parsed-entity, or any one of the subtypes of
512 application/xml such as application/atom+xml or application/rss+xml
513 etc then the character encoding is determined in this order:
514
515 1. the encoding given in the charset parameter of the Content-Type HTTP
516 header, or
517 2. the encoding given in the encoding attribute of the XML declaration
518 within the document, or
519 3. utf-8.
520
521 Mismatch possibilities:
522 - HTTP + XMLdecla
523 - HTTP + HTMLmeta
524
525 application/xhtml+xml ?
526 XMLdecla + HTMLmeta
527
528 If the media type given in the Content-Type HTTP header is text/xml,
529 text/xml-external-parsed-entity, or a subtype like text/Anything+xml,
530 the encoding attribute of the XML declaration is ignored completely
531 and the character encoding is determined in the order:
532 1. the encoding given in the charset parameter of the Content-Type HTTP
533 header, or
534 2. ascii.
535
536 Mismatch possibilities:
537 - HTTP + XMLdecla
538 - HTTP + HTMLmeta
539
540 text/xhtml+xml
541 XMLdecla + HTMLmeta
542
543 HTML
544 ----
545 For HTML served as text/html:
546 http://www.w3.org/TR/REC-html40/charset.html#h-5.2.2
547
548 1. An HTTP "charset" parameter in a "Content-Type" field.
549 (maybe defaults to ISO-8859-1, but should not assume this)
550 2. A META declaration with "http-equiv" set to "Content-Type" and a
551 value set for "charset".
552 3. The charset attribute set on an element that designates an external
553 resource. (NOT IMPLEMENTED HERE YET)
554
555 Mismatch possibilities:
556 - HTTP + HTMLmeta
557
558 TEXT
559 ----
560 For most text/* types the encoding will be reported as iso-8859-1.
561 Exceptions are XML formats sent as text/* mime type (see above) and
562 text/css which has a default encoding of UTF-8.
563 """
564 if url:
565 try:
566 response = urllib.urlopen(url)
567 text = response.read()
568 except IOError, e:
569 print IOError(e)
570 sys.exit(1)
571
572 encinfo = EncodingInfo()
573
574 logstream = StringIO.StringIO()
575 if not log:
576 log = buildlog(stream=logstream, format='%(message)s')
577
578
579 if response:
580 encinfo.http_media_type, encinfo.http_encoding = getHTTPInfo(
581 response, log)
582 texttype = _getTextTypeByMediaType(encinfo.http_media_type, log)
583 else:
584
585 texttype = _getTextType(text, log)
586
587
588 if texttype == _XML_APPLICATION_TYPE or texttype == _XML_TEXT_TYPE:
589 encinfo.xml_encoding = detectXMLEncoding(text, log)
590
591
592 if texttype == _HTML_TEXT_TYPE:
593 encinfo.xml_encoding = detectXMLEncoding(text, log, includeDefault=False)
594
595
596 if texttype == _HTML_TEXT_TYPE or texttype == _TEXT_TYPE:
597 encinfo.meta_media_type, encinfo.meta_encoding = getMetaInfo(
598 text, log)
599
600
601
602 encinfo.encoding = encinfo.http_encoding
603 encinfo.mismatch = False
604
605
606
607 if texttype == _XML_APPLICATION_TYPE:
608 if not encinfo.encoding:
609 encinfo.encoding = encinfo.xml_encoding
610
611
612
613 elif texttype == _HTML_TEXT_TYPE:
614 if not encinfo.encoding:
615 encinfo.encoding = encinfo.meta_encoding
616 if not encinfo.encoding:
617 encinfo.encoding = encodingByMediaType(encinfo.http_media_type)
618 if not encinfo.encoding:
619 encinfo.encoding = tryEncodings(text)
620
621
622 elif texttype == _XML_TEXT_TYPE or texttype == _TEXT_TYPE:
623 if not encinfo.encoding:
624 encinfo.encoding = encodingByMediaType(encinfo.http_media_type)
625
626
627
628 if encinfo.http_encoding and encinfo.xml_encoding and\
629 encinfo.http_encoding <> encinfo.xml_encoding:
630 encinfo.mismatch = True
631 log.warn(u'"%s" (HTTP) <> "%s" (XML) encoding mismatch' %
632 (encinfo.http_encoding, encinfo.xml_encoding))
633
634 if encinfo.http_encoding and encinfo.meta_encoding and\
635 encinfo.http_encoding <> encinfo.meta_encoding:
636 encinfo.mismatch = True
637 log.warn(u'"%s" (HTTP) <> "%s" (HTML <meta>) encoding mismatch' %
638 (encinfo.http_encoding, encinfo.meta_encoding))
639
640 if encinfo.xml_encoding and encinfo.meta_encoding and\
641 encinfo.xml_encoding <> encinfo.meta_encoding:
642 encinfo.mismatch = True
643 log.warn(u'"%s" (XML) <> "%s" (HTML <meta>) encoding mismatch' %
644 (encinfo.xml_encoding, encinfo.meta_encoding))
645
646 log.info(u'Encoding (probably): %s (Mismatch: %s)',
647 encinfo.encoding, encinfo.mismatch)
648
649 encinfo.logtext = logstream.getvalue()
650 return encinfo
651
652
# When executed as a script (not imported), show this module's
# documentation through pydoc's help system.
653 if __name__ == '__main__':
654 import pydoc
655 pydoc.help(__name__)
656