1 <?php
  2 /**
  3 * Tools for locating / replacing bad bytes in UTF-8 strings
  4 * The Original Code is Mozilla Communicator client code.
  5 * The Initial Developer of the Original Code is
  6 * Netscape Communications Corporation.
  7 * Portions created by the Initial Developer are Copyright (C) 1998
  8 * the Initial Developer. All Rights Reserved.
  9 * Ported to PHP by Henri Sivonen (http://hsivonen.iki.fi)
 10 * Slight modifications to fit with phputf8 library by Harry Fuecks (hfuecks gmail com)
 11 * @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUTF8ToUnicode.cpp
 12 * @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUnicodeToUTF8.cpp
 13 * @see http://hsivonen.iki.fi/php-utf8/
 14 * @package utf8
 15 * @see utf8_is_valid
 16 */
 17 
 18 //--------------------------------------------------------------------
 19 /**
 20 * Locates the first bad byte in a UTF-8 string returning it's
 21 * byte index in the string
 22 * PCRE Pattern to locate bad bytes in a UTF-8 string
 23 * Comes from W3 FAQ: Multilingual Forms
 24 * Note: modified to include full ASCII range including control chars
 25 * @see http://www.w3.org/International/questions/qa-forms-utf-8
 26 * @param string
 27 * @return mixed integer byte index or FALSE if no bad found
 28 * @package utf8
 29 */
 30 function utf8_bad_find($str) {
 31     $UTF8_BAD =
 32     '([\x00-\x7F]'.                          # ASCII (including control chars)
 33     '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
 34     '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
 35     '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
 36     '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
 37     '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
 38     '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
 39     '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
 40     '|(.{1}))';                              # invalid byte
 41     $pos = 0;
 42     $badList = array();
 43     while (preg_match('/'.$UTF8_BAD.'/S', $str, $matches)) {
 44         $bytes = strlen($matches[0]);
 45         if ( isset($matches[2])) {
 46             return $pos;
 47         }
 48         $pos += $bytes;
 49         $str = substr($str,$bytes);
 50     }
 51     return FALSE;
 52 }
 53 
 54 //--------------------------------------------------------------------
 55 /**
 56 * Locates all bad bytes in a UTF-8 string and returns a list of their
 57 * byte index in the string
 58 * PCRE Pattern to locate bad bytes in a UTF-8 string
 59 * Comes from W3 FAQ: Multilingual Forms
 60 * Note: modified to include full ASCII range including control chars
 61 * @see http://www.w3.org/International/questions/qa-forms-utf-8
 62 * @param string
 63 * @return mixed array of integers or FALSE if no bad found
 64 * @package utf8
 65 */
 66 function utf8_bad_findall($str) {
 67     $UTF8_BAD =
 68     '([\x00-\x7F]'.                          # ASCII (including control chars)
 69     '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
 70     '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
 71     '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
 72     '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
 73     '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
 74     '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
 75     '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
 76     '|(.{1}))';                              # invalid byte
 77     $pos = 0;
 78     $badList = array();
 79     while (preg_match('/'.$UTF8_BAD.'/S', $str, $matches)) {
 80         $bytes = strlen($matches[0]);
 81         if ( isset($matches[2])) {
 82             $badList[] = $pos;
 83         }
 84         $pos += $bytes;
 85         $str = substr($str,$bytes);
 86     }
 87     if ( count($badList) > 0 ) {
 88         return $badList;
 89     }
 90     return FALSE;
 91 }
 92 
 93 //--------------------------------------------------------------------
 94 /**
 95 * Strips out any bad bytes from a UTF-8 string and returns the rest
 96 * PCRE Pattern to locate bad bytes in a UTF-8 string
 97 * Comes from W3 FAQ: Multilingual Forms
 98 * Note: modified to include full ASCII range including control chars
 99 * @see http://www.w3.org/International/questions/qa-forms-utf-8
100 * @param string
101 * @return string
102 * @package utf8
103 */
104 function utf8_bad_strip($str) {
105     $UTF8_BAD =
106     '([\x00-\x7F]'.                          # ASCII (including control chars)
107     '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
108     '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
109     '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
110     '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
111     '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
112     '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
113     '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
114     '|(.{1}))';                              # invalid byte
115     ob_start();
116     while (preg_match('/'.$UTF8_BAD.'/S', $str, $matches)) {
117         if ( !isset($matches[2])) {
118             echo $matches[0];
119         }
120         $str = substr($str,strlen($matches[0]));
121     }
122     $result = ob_get_contents();
123     ob_end_clean();
124     return $result;
125 }
126 
127 //--------------------------------------------------------------------
/**
* Replaces every bad byte in a UTF-8 string with an alternative string.
* An ASCII character is recommended as the replacement.
*
* The string is consumed one match at a time using a PCRE pattern that
* accepts every well-formed UTF-8 sequence; any byte that does not start such
* a sequence falls through to the final capture group and is substituted.
* The pattern comes from the W3 FAQ "Multilingual Forms", modified to accept
* the full ASCII range including control characters.
*
* @see http://www.w3.org/International/questions/qa-forms-utf-8
* @param string $str string to search
* @param string $replace string to replace bad bytes with (defaults to '?') - use ASCII
* @return string the input with each bad byte replaced by $replace
* @package utf8
*/
function utf8_bad_replace($str, $replace = '?') {
    $UTF8_BAD =
    '([\x00-\x7F]'.                          // ASCII (including control chars)
    '|[\xC2-\xDF][\x80-\xBF]'.               // non-overlong 2-byte
    '|\xE0[\xA0-\xBF][\x80-\xBF]'.           // excluding overlongs
    '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    // straight 3-byte
    '|\xED[\x80-\x9F][\x80-\xBF]'.           // excluding surrogates
    '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        // planes 1-3
    '|[\xF1-\xF3][\x80-\xBF]{3}'.            // planes 4-15
    '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        // plane 16
    '|(.{1}))';                              // invalid byte
    $pattern = '/'.$UTF8_BAD.'/S';
    $len = strlen($str);
    $pos = 0;

    // Build the result in a local string instead of echoing into an output
    // buffer: this keeps the function free of output side effects.
    $result = '';

    // Scan from $pos using the offset argument rather than re-slicing the
    // subject with substr() on every iteration. Every byte is matched by one
    // of the alternatives, so the match always begins exactly at $pos.
    while ($pos < $len && preg_match($pattern, $str, $matches, 0, $pos)) {
        // Group 2 only participates when the "invalid byte" branch matched
        if ( !isset($matches[2])) {
            $result .= $matches[0];
        } else {
            $result .= $replace;
        }
        $pos += strlen($matches[0]);
    }
    return $result;
}
164 
165 //--------------------------------------------------------------------
/**
* Status code returned by utf8_bad_identify() when the lead octet of a
* five octet sequence is encountered.
* Note: such a sequence can only encode a non-shortest form or a codepoint
* beyond the Unicode range, so it never represents a useful character
* @see utf8_bad_identify
* @package utf8
*/
const UTF8_BAD_5OCTET = 1;

/**
* Status code returned by utf8_bad_identify() when the lead octet of a
* six octet sequence is encountered.
* Note: such a sequence can only encode a non-shortest form or a codepoint
* beyond the Unicode range, so it never represents a useful character
* @see utf8_bad_identify
* @package utf8
*/
const UTF8_BAD_6OCTET = 2;

/**
* Status code returned by utf8_bad_identify().
* The octet cannot be used to start a multi-byte UTF-8 sequence
* @see utf8_bad_identify
* @package utf8
*/
const UTF8_BAD_SEQID = 3;

/**
* Status code returned by utf8_bad_identify().
* Non-shortest form encoding, illegal as of Unicode 3.1
* @see utf8_bad_identify
* @package utf8
*/
const UTF8_BAD_NONSHORT = 4;

/**
* Status code returned by utf8_bad_identify().
* Surrogate character, illegal as of Unicode 3.2
* @see utf8_bad_identify
* @package utf8
*/
const UTF8_BAD_SURROGATE = 5;

/**
* Status code returned by utf8_bad_identify().
* The decoded codepoint lies outside the Unicode range
* @see utf8_bad_identify
* @package utf8
*/
const UTF8_BAD_UNIOUTRANGE = 6;

/**
* Status code returned by utf8_bad_identify().
* A multi-octet sequence was left incomplete
* Note: this acts as a kind of "catch-all"
* @see utf8_bad_identify
* @package utf8
*/
const UTF8_BAD_SEQINCOMPLETE = 7;
224 
225 //--------------------------------------------------------------------
/**
* Reports on the type of bad byte found in a UTF-8 string. Returns a
* status code for the first problem found and stores the byte index at
* which it was detected in $i.
*
* The string is decoded octet by octet with a small state machine: a lead
* octet sets the number of continuation octets expected, and once the last
* one has been read the assembled codepoint is checked for non-shortest
* form, surrogates and the Unicode upper bound.
*
* @author <hsivonen@iki.fi>
* @param string $str UTF-8 encoded string
* @param int $i (by reference) receives the byte index at which the problem
*               was detected, or NULL when the string is valid. For
*               UTF8_BAD_SEQINCOMPLETE the index is moved back by one so that
*               it points at the last octet of the unfinished sequence
* @return mixed integer UTF8_BAD_* constant describing problem or FALSE if valid UTF-8
* @see utf8_bad_explain
* @see http://hsivonen.iki.fi/php-utf8/
* @package utf8
*/
function utf8_bad_identify($str, &$i) {

    $mState = 0;     // cached expected number of octets after the current octet
                     // until the beginning of the next UTF8 character sequence
    $mUcs4  = 0;     // cached Unicode character
    $mBytes = 1;     // cached expected number of octets in the current sequence

    $len = strlen($str);

    for($i = 0; $i < $len; $i++) {

        // Square-bracket offset: the curly-brace form ($str{$i}) is a parse
        // error from PHP 8.0 onwards
        $in = ord($str[$i]);

        if ( $mState == 0) {

            // When mState is zero we expect either a US-ASCII character or a
            // multi-octet sequence.
            if (0 == (0x80 & ($in))) {
                // US-ASCII, pass straight through.
                $mBytes = 1;

            } else if (0xC0 == (0xE0 & ($in))) {
                // First octet of 2 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x1F) << 6;
                $mState = 1;
                $mBytes = 2;

            } else if (0xE0 == (0xF0 & ($in))) {
                // First octet of 3 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x0F) << 12;
                $mState = 2;
                $mBytes = 3;

            } else if (0xF0 == (0xF8 & ($in))) {
                // First octet of 4 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x07) << 18;
                $mState = 3;
                $mBytes = 4;

            } else if (0xF8 == (0xFC & ($in))) {

                /* First octet of 5 octet sequence.
                *
                * This is illegal because the encoded codepoint must be either
                * (a) not the shortest form or
                * (b) outside the Unicode range of 0-0x10FFFF.
                */

                return UTF8_BAD_5OCTET;

            } else if (0xFC == (0xFE & ($in))) {

                // First octet of 6 octet sequence, see comments for 5 octet sequence.
                return UTF8_BAD_6OCTET;

            } else {
                // Current octet is neither in the US-ASCII range nor a legal first
                // octet of a multi-octet sequence.
                return UTF8_BAD_SEQID;

            }

        } else {

            // When mState is non-zero, we expect a continuation of the multi-octet
            // sequence
            if (0x80 == (0xC0 & ($in))) {

                // Legal continuation: merge its six payload bits into the
                // codepoint at the position given by the octets still pending
                $shift = ($mState - 1) * 6;
                $tmp = $in;
                $tmp = ($tmp & 0x0000003F) << $shift;
                $mUcs4 |= $tmp;

                /**
                * End of the multi-octet sequence. mUcs4 now contains the final
                * Unicode codepoint to be output
                */
                if (0 == --$mState) {

                    // From Unicode 3.1, non-shortest form is illegal
                    if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                        ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                        ((4 == $mBytes) && ($mUcs4 < 0x10000)) ) {
                        return UTF8_BAD_NONSHORT;

                    // From Unicode 3.2, surrogate characters are illegal
                    } else if (($mUcs4 & 0xFFFFF800) == 0xD800) {
                        return UTF8_BAD_SURROGATE;

                    // Codepoints outside the Unicode range are illegal
                    } else if ($mUcs4 > 0x10FFFF) {
                        return UTF8_BAD_UNIOUTRANGE;
                    }

                    //initialize UTF8 cache
                    $mState = 0;
                    $mUcs4  = 0;
                    $mBytes = 1;
                }

            } else {
                // ((0xC0 & (*in) != 0x80) && (mState != 0))
                // Incomplete multi-octet sequence: step back so $i refers to
                // the last octet that belonged to the unfinished sequence
                $i--;
                return UTF8_BAD_SEQINCOMPLETE;
            }
        }
    }

    if ( $mState != 0 ) {
        // The string ended in the middle of a multi-octet sequence; $i equals
        // $len here, so step back to the final octet of the string
        $i--;
        return UTF8_BAD_SEQINCOMPLETE;
    }

    // No bad octets found
    $i = NULL;
    return FALSE;
}
359 
360 //--------------------------------------------------------------------
/**
* Takes a return code from utf8_bad_identify() and returns a message
* (in English) explaining what the problem is. An unrecognised code
* raises an E_USER_WARNING.
* @param int $code return code from utf8_bad_identify
* @return mixed string message or FALSE if return code unknown
* @see utf8_bad_identify
* @package utf8
*/
function utf8_bad_explain($code) {

    // Known codes paired with their explanation, in the order they are tested
    $explanations = array(
        array(UTF8_BAD_5OCTET,
            'Five octet sequences are valid UTF-8 but are not supported by Unicode'),
        array(UTF8_BAD_6OCTET,
            'Six octet sequences are valid UTF-8 but are not supported by Unicode'),
        array(UTF8_BAD_SEQID,
            'Invalid octet for use as start of multi-byte UTF-8 sequence'),
        array(UTF8_BAD_NONSHORT,
            'From Unicode 3.1, non-shortest form is illegal'),
        array(UTF8_BAD_SURROGATE,
            'From Unicode 3.2, surrogate characters are illegal'),
        array(UTF8_BAD_UNIOUTRANGE,
            'Codepoints outside the Unicode range are illegal'),
        array(UTF8_BAD_SEQINCOMPLETE,
            'Incomplete multi-octet sequence'),
    );

    foreach ($explanations as $entry) {
        // Loose comparison on purpose so that, for example, a numeric
        // string code is recognised as well
        if ($code == $entry[0]) {
            return $entry[1];
        }
    }

    // Nothing matched: warn the caller and signal failure
    trigger_error('Unknown error code: '.$code,E_USER_WARNING);
    return FALSE;

}
407