1 : <?php
2 :
3 : /**
4 : * A UTF-8 specific character encoder that handles cleaning and transforming.
5 : * @note All functions in this class should be static.
6 : */
class HTMLPurifier_Encoder
{

    /**
     * Constructor throws fatal error if you attempt to instantiate class
     */
    private function __construct() {
        trigger_error('Cannot instantiate encoder, call methods statically', E_USER_ERROR);
    }

    /**
     * Error-handler that mutes errors, alternative to shut-up operator.
     *
     * Installed temporarily with set_error_handler() around iconv /
     * utf8_* calls in this class; every installation must be paired with
     * restore_error_handler() on every exit path.
     */
    private static function muteErrorHandler() {}

    /**
     * Cleans a UTF-8 string for well-formedness and SGML validity
     *
     * It will parse according to UTF-8 and return a valid UTF8 string, with
     * non-SGML codepoints excluded.
     *
     * @param string $str       String to clean.
     * @param bool   $force_php When true, skip the regexp fast path and
     *                          always run the byte-wise PHP parser. Note
     *                          that the PHP parser omits the BOM (U+FEFF),
     *                          which the fast path lets through.
     * @return string The cleaned string.
     *
     * @note Just for reference, the non-SGML code points are 0 to 31 and
     *       127 to 159, inclusive.  However, we allow code points 9, 10
     *       and 13, which are the tab, line feed and carriage return
     *       respectively. 128 and above the code points map to multibyte
     *       UTF-8 representations.
     *
     * @note Fallback code adapted from utf8ToUnicode by Henri Sivonen and
     *       hsivonen@iki.fi at <http://iki.fi/hsivonen/php-utf8/> under the
     *       LGPL license.  Notes on what changed are inside, but in general,
     *       the original code transformed UTF-8 text into an array of integer
     *       Unicode codepoints. Understandably, transforming that back to
     *       a string would be somewhat expensive, so the function was modded to
     *       directly operate on the string.  However, this discourages code
     *       reuse, and the logic enumerated here would be useful for any
     *       function that needs to be able to understand UTF-8 characters.
     *       As of right now, only smart lossless character encoding converters
     *       would need that, and I'm probably not going to implement them.
     */
    public static function cleanUTF8($str, $force_php = false) {

        // This is an optimization: if the string is already valid UTF-8 and
        // contains only allowed characters, no need to do PHP stuff. 99% of
        // the time, this will be the case.
        // The regexp matches the XML char production, as well as excluding
        // non-SGML codepoints U+007F to U+009F. preg_match() yields 0 or
        // false for anything else, which drops us into the parser below.
        if (!$force_php && preg_match('/^[\x{9}\x{A}\x{D}\x{20}-\x{7E}\x{A0}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]*$/Du', $str)) {
            return $str;
        }

        $mState = 0; // cached expected number of octets after the current octet
                     // until the beginning of the next UTF8 character sequence
        $mUcs4  = 0; // cached Unicode character
        $mBytes = 1; // cached expected number of octets in the current sequence

        // original code involved an $out that was an array of Unicode
        // codepoints.  Instead of having to convert back into UTF-8, we've
        // decided to directly append valid UTF-8 characters onto a string
        // $out once they're done.  $char accumulates raw bytes, while $mUcs4
        // turns into the Unicode code point, so there's some redundancy.

        $out = '';
        $char = '';

        $len = strlen($str);
        for ($i = 0; $i < $len; $i++) {
            // Square-bracket offsets only: the curly-brace form ($str{$i})
            // is no longer valid syntax on current PHP versions.
            $in = ord($str[$i]);
            $char .= $str[$i]; // append byte to char
            if (0 == $mState) {
                // When mState is zero we expect either a US-ASCII character
                // or a multi-octet sequence.
                if (0 == (0x80 & $in)) {
                    // US-ASCII, pass straight through.
                    if (($in <= 31 || $in == 127) &&
                        !($in == 9 || $in == 13 || $in == 10) // save \r\t\n
                    ) {
                        // control characters, remove
                    } else {
                        $out .= $char;
                    }
                    // reset
                    $char = '';
                    $mBytes = 1;
                } elseif (0xC0 == (0xE0 & $in)) {
                    // First octet of 2 octet sequence
                    $mUcs4 = ($in & 0x1F) << 6;
                    $mState = 1;
                    $mBytes = 2;
                } elseif (0xE0 == (0xF0 & $in)) {
                    // First octet of 3 octet sequence
                    $mUcs4 = ($in & 0x0F) << 12;
                    $mState = 2;
                    $mBytes = 3;
                } elseif (0xF0 == (0xF8 & $in)) {
                    // First octet of 4 octet sequence
                    $mUcs4 = ($in & 0x07) << 18;
                    $mState = 3;
                    $mBytes = 4;
                } elseif (0xF8 == (0xFC & $in)) {
                    // First octet of 5 octet sequence.
                    //
                    // This is illegal because the encoded codepoint must be
                    // either:
                    // (a) not the shortest form or
                    // (b) outside the Unicode range of 0-0x10FFFF.
                    // Rather than trying to resynchronize, we will carry on
                    // until the end of the sequence and let the later error
                    // handling code catch it.
                    $mUcs4 = ($in & 0x03) << 24;
                    $mState = 4;
                    $mBytes = 5;
                } elseif (0xFC == (0xFE & $in)) {
                    // First octet of 6 octet sequence, see comments for 5
                    // octet sequence.
                    $mUcs4 = ($in & 1) << 30;
                    $mState = 5;
                    $mBytes = 6;
                } else {
                    // Current octet is neither in the US-ASCII range nor a
                    // legal first octet of a multi-octet sequence.
                    $mState = 0;
                    $mUcs4  = 0;
                    $mBytes = 1;
                    $char = '';
                }
            } else {
                // When mState is non-zero, we expect a continuation of the
                // multi-octet sequence
                if (0x80 == (0xC0 & $in)) {
                    // Legal continuation.
                    $shift = ($mState - 1) * 6;
                    $mUcs4 |= ($in & 0x0000003F) << $shift;

                    if (0 == --$mState) {
                        // End of the multi-octet sequence. mUcs4 now contains
                        // the final Unicode codepoint to be output

                        // Check for illegal sequences and codepoints.

                        // From Unicode 3.1, non-shortest form is illegal
                        if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                            ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                            ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                            (4 < $mBytes) ||
                            // From Unicode 3.2, surrogate characters = illegal
                            (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                            // Codepoints outside the Unicode range are illegal
                            ($mUcs4 > 0x10FFFF)
                        ) {
                            // illegal sequence: drop the accumulated bytes
                        } elseif (0xFEFF != $mUcs4 && // omit BOM
                            // check for valid Char unicode codepoints; this
                            // list mirrors the fast-path regexp above
                            (
                                0x9 == $mUcs4 ||
                                0xA == $mUcs4 ||
                                0xD == $mUcs4 ||
                                (0x20 <= $mUcs4 && 0x7E >= $mUcs4) ||
                                // 7F-9F is not strictly prohibited by XML,
                                // but it is non-SGML, and thus we don't allow it
                                (0xA0 <= $mUcs4 && 0xD7FF >= $mUcs4) ||
                                (0xE000 <= $mUcs4 && 0xFFFD >= $mUcs4) ||
                                (0x10000 <= $mUcs4 && 0x10FFFF >= $mUcs4)
                            )
                        ) {
                            $out .= $char;
                        }
                        // initialize UTF8 cache (reset)
                        $mState = 0;
                        $mUcs4  = 0;
                        $mBytes = 1;
                        $char = '';
                    }
                } else {
                    // ((0xC0 & (*in) != 0x80) && (mState != 0))
                    // Incomplete multi-octet sequence.
                    // used to result in complete fail, but we'll reset
                    // (the offending octet is discarded along with the
                    // partial sequence)
                    $mState = 0;
                    $mUcs4  = 0;
                    $mBytes = 1;
                    $char = '';
                }
            }
        }
        // Any sequence still incomplete at end of input is left in $char
        // and therefore never reaches $out.
        return $out;
    }

    /**
     * Translates a Unicode codepoint into its corresponding UTF-8 character.
     *
     * @param int $code Unicode codepoint.
     * @return string UTF-8 encoded character, or an empty string when $code
     *                is negative, above 0x10FFFF, or a surrogate
     *                (0xD800..0xDFFF).
     *
     * @note Based on Feyd's function at
     *       <http://forums.devnetwork.net/viewtopic.php?p=191404#191404>,
     *       which is in public domain.
     * @note While we're going to do code point parsing anyway, a good
     *       optimization would be to refuse to translate code points that
     *       are non-SGML characters.  However, this could lead to duplication.
     * @note This is very similar to the unichr function in
     *       maintenance/generate-entity-file.php (although this is superior,
     *       due to its sanity checks).
     */

    // +----------+----------+----------+----------+
    // | 33222222 | 22221111 | 111111   |          |
    // | 10987654 | 32109876 | 54321098 | 76543210 | bit
    // +----------+----------+----------+----------+
    // |          |          |          | 0xxxxxxx | 1 byte 0x00000000..0x0000007F
    // |          |          | 110yyyyy | 10xxxxxx | 2 byte 0x00000080..0x000007FF
    // |          | 1110zzzz | 10yyyyyy | 10xxxxxx | 3 byte 0x00000800..0x0000FFFF
    // | 11110www | 10wwzzzz | 10yyyyyy | 10xxxxxx | 4 byte 0x00010000..0x0010FFFF
    // +----------+----------+----------+----------+
    // | 00000000 | 00011111 | 11111111 | 11111111 | Theoretical upper limit of legal scalars: 2097151 (0x001FFFFF)
    // | 00000000 | 00010000 | 11111111 | 11111111 | Defined upper limit of legal scalar codes
    // +----------+----------+----------+----------+

    public static function unichr($code) {
        if ($code > 1114111 || $code < 0 ||
            ($code >= 55296 && $code <= 57343)) {
            // bits are set outside the "valid" range as defined
            // by UNICODE 4.1.0
            return '';
        }

        // $x is the last (or only) octet; $y, $z, $w are the preceding
        // octets, left at 0 when the encoding does not need them.
        $x = $y = $z = $w = 0;
        if ($code < 128) {
            // regular ASCII character
            $x = $code;
        } else {
            // set up bits for UTF-8
            $x = ($code & 63) | 128;
            if ($code < 2048) {
                $y = (($code & 2047) >> 6) | 192;
            } else {
                $y = (($code & 4032) >> 6) | 128;
                if ($code < 65536) {
                    $z = (($code >> 12) & 15) | 224;
                } else {
                    $z = (($code >> 12) & 63) | 128;
                    $w = (($code >> 18) & 7) | 240;
                }
            }
        }
        // set up the actual character; lead/continuation octets are never 0
        // once assigned, so a zero value means "not part of this encoding"
        $ret = '';
        if ($w) $ret .= chr($w);
        if ($z) $ret .= chr($z);
        if ($y) $ret .= chr($y);
        $ret .= chr($x);

        return $ret;
    }

    /**
     * Converts a string to UTF-8 based on configuration.
     *
     * @param string $str     String in the encoding named by the
     *                        Core.Encoding configuration value.
     * @param object $config  Configuration object; read through
     *                        $config->get($namespace, $key).
     * @param object $context Unused here.
     * @return string|null|false The converted string. When no conversion
     *                        route exists an E_USER_ERROR is raised; if the
     *                        active handler lets execution continue the
     *                        function falls off the end and yields null.
     */
    public static function convertToUTF8($str, $config, $context) {
        $encoding = $config->get('Core', 'Encoding');
        if ($encoding === 'utf-8') return $str;
        static $iconv = null;
        if ($iconv === null) $iconv = function_exists('iconv');
        set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
        if ($iconv && !$config->get('Test', 'ForceNoIconv')) {
            $str = iconv($encoding, 'utf-8//IGNORE', $str);
            // If the string is bjorked by Shift_JIS or a similar encoding
            // that doesn't support all of ASCII, convert the naughty
            // characters to their true byte-wise ASCII/UTF-8 equivalents.
            $ascii_fix = HTMLPurifier_Encoder::testEncodingSupportsASCII($encoding);
            // testEncodingSupportsASCII() returns false when iconv rejects
            // the encoding; strtr() needs an array.
            if (!is_array($ascii_fix)) $ascii_fix = array();
            $str = strtr($str, $ascii_fix);
            restore_error_handler();
            return $str;
        } elseif ($encoding === 'iso-8859-1') {
            $str = utf8_encode($str);
            restore_error_handler();
            return $str;
        }
        // Uninstall the muting handler first: otherwise it would swallow
        // the error below and stay installed after we return.
        restore_error_handler();
        trigger_error('Encoding not supported', E_USER_ERROR);
    }

    /**
     * Converts a string from UTF-8 based on configuration.
     *
     * @param string $str     UTF-8 string.
     * @param object $config  Configuration object; read through
     *                        $config->get($namespace, $key).
     * @param object $context Unused here.
     * @return string|null|false The converted string. When no conversion
     *                        route exists an E_USER_ERROR is raised; if the
     *                        active handler lets execution continue the
     *                        function falls off the end and yields null.
     *
     * @note Currently, this is a lossy conversion, with unexpressable
     *       characters being omitted.
     */
    public static function convertFromUTF8($str, $config, $context) {
        $encoding = $config->get('Core', 'Encoding');
        if ($encoding === 'utf-8') return $str;
        static $iconv = null;
        if ($iconv === null) $iconv = function_exists('iconv');
        $escape = $config->get('Core', 'EscapeNonASCIICharacters');
        if ($escape) {
            $str = HTMLPurifier_Encoder::convertToASCIIDumbLossless($str);
        }
        set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
        if ($iconv && !$config->get('Test', 'ForceNoIconv')) {
            // Undo our previous fix in convertToUTF8, otherwise iconv will barf
            $ascii_fix = HTMLPurifier_Encoder::testEncodingSupportsASCII($encoding);
            // A false result (encoding rejected by iconv) means there is
            // nothing to undo; array_flip()/strtr() need an array.
            if (!is_array($ascii_fix)) $ascii_fix = array();
            if (!$escape && !empty($ascii_fix)) {
                $clear_fix = array();
                foreach ($ascii_fix as $utf8 => $native) $clear_fix[$utf8] = '';
                $str = strtr($str, $clear_fix);
            }
            $str = strtr($str, array_flip($ascii_fix));
            // Normal stuff
            $str = iconv('utf-8', $encoding . '//IGNORE', $str);
            restore_error_handler();
            return $str;
        } elseif ($encoding === 'iso-8859-1') {
            $str = utf8_decode($str);
            restore_error_handler();
            return $str;
        }
        // Uninstall the muting handler first: otherwise it would swallow
        // the error below and stay installed after we return.
        restore_error_handler();
        trigger_error('Encoding not supported', E_USER_ERROR);
    }

    /**
     * Lossless (character-wise) conversion of HTML to ASCII
     * @param $str UTF-8 string to be converted to ASCII
     * @returns ASCII encoded string with non-ASCII character entity-ized
     * @warning Adapted from MediaWiki, claiming fair use: this is a common
     *       algorithm. If you disagree with this license fudgery,
     *       implement it yourself.
     * @note Uses decimal numeric entities since they are best supported.
     * @note This is a DUMB function: it has no concept of keeping
     *       character entities that the projected character encoding
     *       can allow. We could possibly implement a smart version
     *       but that would require it to also know which Unicode
     *       codepoints the charset supported (not an easy task).
     * @note Similar to cleanUTF8(), but it assumes that $str is
     *       well-formed UTF-8 and performs no validation of its own.
     */
    public static function convertToASCIIDumbLossless($str) {
        $bytesleft = 0; // continuation octets still expected
        $result = '';
        $working = 0;   // codepoint being assembled
        $len = strlen($str);
        for ($i = 0; $i < $len; $i++) {
            $bytevalue = ord($str[$i]);
            if ($bytevalue <= 0x7F) { //0xxx xxxx
                $result .= chr($bytevalue);
                $bytesleft = 0;
            } elseif ($bytevalue <= 0xBF) { //10xx xxxx
                $working = $working << 6;
                $working += ($bytevalue & 0x3F);
                $bytesleft--;
                if ($bytesleft <= 0) {
                    // last octet of the sequence: emit decimal entity
                    $result .= "&#" . $working . ";";
                }
            } elseif ($bytevalue <= 0xDF) { //110x xxxx
                $working = $bytevalue & 0x1F;
                $bytesleft = 1;
            } elseif ($bytevalue <= 0xEF) { //1110 xxxx
                $working = $bytevalue & 0x0F;
                $bytesleft = 2;
            } else { //1111 0xxx
                $working = $bytevalue & 0x07;
                $bytesleft = 3;
            }
        }
        return $result;
    }

    /**
     * This expensive function tests whether or not a given character
     * encoding supports ASCII. 7/8-bit encodings like Shift_JIS will
     * fail this test, and require special processing. Variable width
     * encodings shouldn't ever fail.
     *
     * @param string $encoding Encoding name to test, as per iconv format
     * @param bool $bypass Whether or not to bypass the precompiled arrays
     *      (and the per-request result cache).
     * @return Array of UTF-8 characters to their corresponding ASCII,
     *      which can be used to "undo" any overzealous iconv action;
     *      or false when iconv cannot convert to $encoding at all.
     */
    public static function testEncodingSupportsASCII($encoding, $bypass = false) {
        static $encodings = array();
        if (!$bypass) {
            if (isset($encodings[$encoding])) return $encodings[$encoding];
            $lenc = strtolower($encoding);
            switch ($lenc) {
                case 'shift_jis':
                    return array("\xC2\xA5" => '\\', "\xE2\x80\xBE" => '~');
                case 'johab':
                    return array("\xE2\x82\xA9" => '\\');
            }
            if (strpos($lenc, 'iso-8859-') === 0) return array();
        }
        $ret = array();
        set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
        if (iconv('UTF-8', $encoding, 'a') === false) {
            // Do not leave the muting handler installed on early exit.
            restore_error_handler();
            return false;
        }
        for ($i = 0x20; $i <= 0x7E; $i++) { // all printable ASCII chars
            $c = chr($i);
            if (iconv('UTF-8', "$encoding//IGNORE", $c) === '') {
                // Reverse engineer: what's the UTF-8 equiv of this byte
                // sequence? This assumes that there's no variable width
                // encoding that doesn't support ASCII.
                $utf8 = iconv($encoding, 'UTF-8//IGNORE', $c);
                // A failed or empty reverse conversion must not become an
                // array key: false would be cast to key 0 and later make
                // strtr() rewrite the character "0".
                if (is_string($utf8) && $utf8 !== '') {
                    $ret[$utf8] = $c;
                }
            }
        }
        restore_error_handler();
        $encodings[$encoding] = $ret;
        return $ret;
    }


}
413 :
|