comparison mcabber/libjabber/xmltok.c @ 25:bf3d6e241714

[/trunk] Changeset 41 by mikael * Add libjabber to trunk. Let the game begin! :-)
author mikael
date Sun, 27 Mar 2005 20:18:21 +0000
parents
children c8df64f43625
comparison
equal deleted inserted replaced
24:e88b15cbf2de 25:bf3d6e241714
1 /*
2 The contents of this file are subject to the Mozilla Public License
3 Version 1.1 (the "License"); you may not use this file except in
4 compliance with the License. You may obtain a copy of the License at
5 http://www.mozilla.org/MPL/
6
7 Software distributed under the License is distributed on an "AS IS"
8 basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
9 License for the specific language governing rights and limitations
10 under the License.
11
12 The Original Code is expat.
13
14 The Initial Developer of the Original Code is James Clark.
15 Portions created by James Clark are Copyright (C) 1998, 1999
16 James Clark. All Rights Reserved.
17
18 Contributor(s):
19
20 Alternatively, the contents of this file may be used under the terms
21 of the GNU General Public License (the "GPL"), in which case the
22 provisions of the GPL are applicable instead of those above. If you
23 wish to allow use of your version of this file only under the terms of
24 the GPL and not to allow others to use your version of this file under
25 the MPL, indicate your decision by deleting the provisions above and
26 replace them with the notice and other provisions required by the
27 GPL. If you do not delete the provisions above, a recipient may use
28 your version of this file under either the MPL or the GPL.
29 */
30
31 #include "xmldef.h"
32 #include "xmltok.h"
33 #include "nametab.h"
34
/* VTABLE1 expands to the leading entries of an ENCODING initializer: the
   two brace-enclosed groups of tokenizer functions followed by the
   per-encoding helper functions.  Every name is produced through
   PREFIX(), so the expansion depends on how PREFIX is defined at the
   point of use.  The two conversion functions are NOT part of VTABLE1. */
35 #define VTABLE1 \
36 { PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok) }, \
37 { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
38 PREFIX(sameName), \
39 PREFIX(nameMatchesAscii), \
40 PREFIX(nameLength), \
41 PREFIX(skipS), \
42 PREFIX(getAtts), \
43 PREFIX(charRefNumber), \
44 PREFIX(predefinedEntityName), \
45 PREFIX(updatePosition), \
46 PREFIX(isPublicId)
47
/* VTABLE is VTABLE1 plus the PREFIX()-named toUtf8 and toUtf16 converters. */
48 #define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
49
/* Test whether the UCS-2 character with high byte hi and low byte lo is
   flagged in namingBitmap.
   pages[hi] selects a group of eight bitmap words, the top 3 bits of lo
   select the word inside that group and the bottom 5 bits of lo select
   the bit inside that word.
   The mask is built from an unsigned 1 so that selecting bit 31 never
   shifts into the sign bit of an int.
   Evaluates to non-zero when the bit is set.  lo is evaluated twice. */
#define UCS2_GET_NAMING(pages, hi, lo) \
  (namingBitmap[((pages)[(hi)] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))
52
/* A 2 byte UTF-8 representation splits the character's 11 bits
   between the bottom 5 and 6 bits of the bytes.
   We need 8 bits to index into pages, 3 bits to add to that index and
   5 bits to generate the mask.
   byte must address at least two unsigned bytes; it is evaluated several
   times.  The mask uses an unsigned 1 so that bit 31 can be selected
   without shifting into the sign bit of an int. */
#define UTF8_GET_NAMING2(pages, byte) \
  (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
                + ((((byte)[0]) & 3) << 1) \
                + ((((byte)[1]) >> 5) & 1)] \
   & (1u << (((byte)[1]) & 0x1F)))
62
/* A 3 byte UTF-8 representation splits the character's 16 bits
   between the bottom 4, 6 and 6 bits of the bytes.
   We need 8 bits to index into pages, 3 bits to add to that index and
   5 bits to generate the mask.
   byte must address at least three unsigned bytes; it is evaluated
   several times.  The mask uses an unsigned 1 so that bit 31 can be
   selected without shifting into the sign bit of an int. */
#define UTF8_GET_NAMING3(pages, byte) \
  (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
                         + ((((byte)[1]) >> 2) & 0xF)] \
                 << 3) \
                + ((((byte)[1]) & 3) << 1) \
                + ((((byte)[2]) >> 5) & 1)] \
   & (1u << (((byte)[2]) & 0x1F)))
74
/* Dispatch on the sequence length n: a 2-byte sequence goes through
   UTF8_GET_NAMING2, a 3-byte sequence through UTF8_GET_NAMING3, and any
   other n evaluates to 0.  p is cast to const unsigned char * before
   being handed to the length-specific macro. */
75 #define UTF8_GET_NAMING(pages, p, n) \
76 ((n) == 2 \
77 ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
78 : ((n) == 3 \
79 ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
80 : 0))
81
/* Non-zero when the 3-byte sequence at p must be rejected:
     - lead byte 0xED with bit 0x20 set in the second byte
       (the UTF-16 surrogate range U+D800..U+DFFF), or
     - the sequences EF BF BE and EF BF BF (U+FFFE and U+FFFF).
   p must point at unsigned bytes; it is evaluated several times.
   Every use of p is parenthesized so that pointer expressions such as
   "buf + 3" are dereferenced as a whole. */
#define UTF8_INVALID3(p) \
  (*(p) == 0xED \
   ? (((p)[1] & 0x20) != 0) \
   : (*(p) == 0xEF \
      ? ((p)[1] == 0xBF && ((p)[2] == 0xBF || (p)[2] == 0xBE)) \
      : 0))
88
/* Non-zero when the 4-byte sequence at p starts with 0xF4 and its second
   byte has either of the 0x30 bits set, i.e. the value would lie above
   U+10FFFF.  p must point at unsigned bytes; it is evaluated twice and
   is fully parenthesized so pointer expressions work as arguments. */
#define UTF8_INVALID4(p) (*(p) == 0xF4 && ((p)[1] & 0x30) != 0)
90
91 static
92 int isNever(const ENCODING *enc, const char *p)
93 {
94 return 0;
95 }
96
97 static
98 int utf8_isName2(const ENCODING *enc, const char *p)
99 {
100 return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
101 }
102
103 static
104 int utf8_isName3(const ENCODING *enc, const char *p)
105 {
106 return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
107 }
108
109 #define utf8_isName4 isNever
110
111 static
112 int utf8_isNmstrt2(const ENCODING *enc, const char *p)
113 {
114 return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
115 }
116
117 static
118 int utf8_isNmstrt3(const ENCODING *enc, const char *p)
119 {
120 return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
121 }
122
123 #define utf8_isNmstrt4 isNever
124
125 #define utf8_isInvalid2 isNever
126
127 static
128 int utf8_isInvalid3(const ENCODING *enc, const char *p)
129 {
130 return UTF8_INVALID3((const unsigned char *)p);
131 }
132
133 static
134 int utf8_isInvalid4(const ENCODING *enc, const char *p)
135 {
136 return UTF8_INVALID4((const unsigned char *)p);
137 }
138
/* An ENCODING extended with the data the tokenizer macros in this file
   look up at run time.
     enc      - the embedded ENCODING; it is the first member, and the
                macros below cast an ENCODING pointer back to this struct.
     type     - classification of each of the 256 possible byte values
                (indexed with an unsigned char by SB_BYTE_TYPE).
     byteType .. charMatches
              - present only when XML_MIN_SIZE is defined; the generic
                macros then call through these pointers.
     isName2/3/4, isNmstrt2/3/4, isInvalid2/3/4
              - predicates called by IS_NAME_CHAR, IS_NMSTRT_CHAR and
                IS_INVALID_CHAR, whose last argument n picks the member
                by name (isName ## n and so on). */
139 struct normal_encoding {
140 ENCODING enc;
141 unsigned char type[256];
142 #ifdef XML_MIN_SIZE
143 int (*byteType)(const ENCODING *, const char *);
144 int (*isNameMin)(const ENCODING *, const char *);
145 int (*isNmstrtMin)(const ENCODING *, const char *);
146 int (*byteToAscii)(const ENCODING *, const char *);
147 int (*charMatches)(const ENCODING *, const char *, int);
148 #endif /* XML_MIN_SIZE */
149 int (*isName2)(const ENCODING *, const char *);
150 int (*isName3)(const ENCODING *, const char *);
151 int (*isName4)(const ENCODING *, const char *);
152 int (*isNmstrt2)(const ENCODING *, const char *);
153 int (*isNmstrt3)(const ENCODING *, const char *);
154 int (*isNmstrt4)(const ENCODING *, const char *);
155 int (*isInvalid2)(const ENCODING *, const char *);
156 int (*isInvalid3)(const ENCODING *, const char *);
157 int (*isInvalid4)(const ENCODING *, const char *);
158 };
159
/* STANDARD_VTABLE(E) supplies the initializers for the XML_MIN_SIZE-only
   members of struct normal_encoding, in declaration order, using E as
   the name prefix.  The expansion ends with a comma so that
   NORMAL_VTABLE() can follow it directly.  Without XML_MIN_SIZE those
   members do not exist and the macro expands to nothing. */
160 #ifdef XML_MIN_SIZE
161
162 #define STANDARD_VTABLE(E) \
163 E ## byteType, \
164 E ## isNameMin, \
165 E ## isNmstrtMin, \
166 E ## byteToAscii, \
167 E ## charMatches,
168
169 #else
170
171 #define STANDARD_VTABLE(E) /* as nothing */
172
173 #endif
174
/* NORMAL_VTABLE(E) supplies the initializers for the nine multi-byte
   predicates of struct normal_encoding (isName2 .. isInvalid4), in
   declaration order, using E as the name prefix. */
175 #define NORMAL_VTABLE(E) \
176 E ## isName2, \
177 E ## isName3, \
178 E ## isName4, \
179 E ## isNmstrt2, \
180 E ## isNmstrt3, \
181 E ## isNmstrt4, \
182 E ## isInvalid2, \
183 E ## isInvalid3, \
184 E ## isInvalid4
185
186 static int checkCharRefNumber(int);
187
188 #include "xmltok_impl.h"
189
/* With XML_MIN_SIZE the single-byte ("sb_") minimum-width name tests are
   the always-false predicate. */
190 #ifdef XML_MIN_SIZE
191 #define sb_isNameMin isNever
192 #define sb_isNmstrtMin isNever
193 #endif
194
/* MINBPC(enc): with XML_MIN_SIZE the value is read from the encoding at
   run time; otherwise it is the constant 1 for the code instantiated
   below with the normal_ prefix. */
195 #ifdef XML_MIN_SIZE
196 #define MINBPC(enc) ((enc)->minBytesPerChar)
197 #else
198 /* minimum bytes per character */
199 #define MINBPC(enc) 1
200 #endif
201
/* SB_BYTE_TYPE(enc, p): classify the byte at p by indexing the type[]
   table of the struct normal_encoding that enc points into.  The byte
   is converted to unsigned char before it is used as an index. */
202 #define SB_BYTE_TYPE(enc, p) \
203 (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
204
/* BYTE_TYPE(enc, p): with XML_MIN_SIZE the lookup goes through the
   encoding's byteType function pointer (sb_byteType is the table-lookup
   implementation of that pointer); otherwise it is SB_BYTE_TYPE
   directly. */
205 #ifdef XML_MIN_SIZE
206 static
207 int sb_byteType(const ENCODING *enc, const char *p)
208 {
209 return SB_BYTE_TYPE(enc, p);
210 }
211 #define BYTE_TYPE(enc, p) \
212 (((const struct normal_encoding *)(enc))->byteType(enc, p))
213 #else
214 #define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
215 #endif
216
/* BYTE_TO_ASCII(enc, p): with XML_MIN_SIZE the value comes from the
   encoding's byteToAscii function pointer (sb_byteToAscii returns the
   byte at p unchanged); otherwise the macro is simply the byte at p. */
217 #ifdef XML_MIN_SIZE
218 #define BYTE_TO_ASCII(enc, p) \
219 (((const struct normal_encoding *)(enc))->byteToAscii(enc, p))
220 static
221 int sb_byteToAscii(const ENCODING *enc, const char *p)
222 {
223 return *p;
224 }
225 #else
226 #define BYTE_TO_ASCII(enc, p) (*p)
227 #endif
228
/* Multi-byte character predicates.  n is pasted onto the member name, so
   IS_NAME_CHAR(enc, p, 2) calls the encoding's isName2 pointer, and so
   on for isNmstrt and isInvalid. */
229 #define IS_NAME_CHAR(enc, p, n) \
230 (((const struct normal_encoding *)(enc))->isName ## n(enc, p))
231 #define IS_NMSTRT_CHAR(enc, p, n) \
232 (((const struct normal_encoding *)(enc))->isNmstrt ## n(enc, p))
233 #define IS_INVALID_CHAR(enc, p, n) \
234 (((const struct normal_encoding *)(enc))->isInvalid ## n(enc, p))
235
/* Minimum-width character predicates: with XML_MIN_SIZE they call the
   encoding's isNameMin / isNmstrtMin pointers, otherwise they are the
   constant 0. */
236 #ifdef XML_MIN_SIZE
237 #define IS_NAME_CHAR_MINBPC(enc, p) \
238 (((const struct normal_encoding *)(enc))->isNameMin(enc, p))
239 #define IS_NMSTRT_CHAR_MINBPC(enc, p) \
240 (((const struct normal_encoding *)(enc))->isNmstrtMin(enc, p))
241 #else
242 #define IS_NAME_CHAR_MINBPC(enc, p) (0)
243 #define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
244 #endif
245
/* CHAR_MATCHES(enc, p, c): does the character at p equal c?  With
   XML_MIN_SIZE the comparison goes through the encoding's charMatches
   pointer (sb_charMatches compares the single byte at p with c);
   otherwise it is an inline comparison of the byte at p with c. */
246 #ifdef XML_MIN_SIZE
247 #define CHAR_MATCHES(enc, p, c) \
248 (((const struct normal_encoding *)(enc))->charMatches(enc, p, c))
249 static
250 int sb_charMatches(const ENCODING *enc, const char *p, int c)
251 {
252 return *p == c;
253 }
254 #else
255 /* c is an ASCII character */
256 #define CHAR_MATCHES(enc, p, c) (*(p) == c)
257 #endif
258
/* Instantiate the tokenizer template with every PREFIX()-generated name
   starting with "normal_", using the macro definitions set up above,
   then remove those macros again so that later instantiations can
   define their own.  PREFIX itself stays defined. */
259 #define PREFIX(ident) normal_ ## ident
260 #include "xmltok_impl_c.h"
261
262 #undef MINBPC
263 #undef BYTE_TYPE
264 #undef BYTE_TO_ASCII
265 #undef CHAR_MATCHES
266 #undef IS_NAME_CHAR
267 #undef IS_NAME_CHAR_MINBPC
268 #undef IS_NMSTRT_CHAR
269 #undef IS_NMSTRT_CHAR_MINBPC
270 #undef IS_INVALID_CHAR
271
/* Marker bits that the converters in this file OR into the first byte of
   a UTF-8 sequence of the given length (0xC0 for two bytes, 0xE0 for
   three, 0xF0 for four; a single byte carries no marker). */
272 enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */
273 UTF8_cval1 = 0x00,
274 UTF8_cval2 = 0xc0,
275 UTF8_cval3 = 0xe0,
276 UTF8_cval4 = 0xf0
277 };
278
279 static
280 void utf8_toUtf8(const ENCODING *enc,
281 const char **fromP, const char *fromLim,
282 char **toP, const char *toLim)
283 {
284 char *to;
285 const char *from;
286 if (fromLim - *fromP > toLim - *toP) {
287 /* Avoid copying partial characters. */
288 for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--)
289 if (((unsigned char)fromLim[-1] & 0xc0) != 0x80)
290 break;
291 }
292 for (to = *toP, from = *fromP; from != fromLim; from++, to++)
293 *to = *from;
294 *fromP = from;
295 *toP = to;
296 }
297
298 static
299 void utf8_toUtf16(const ENCODING *enc,
300 const char **fromP, const char *fromLim,
301 unsigned short **toP, const unsigned short *toLim)
302 {
303 unsigned short *to = *toP;
304 const char *from = *fromP;
305 while (from != fromLim && to != toLim) {
306 switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
307 case BT_LEAD2:
308 *to++ = ((from[0] & 0x1f) << 6) | (from[1] & 0x3f);
309 from += 2;
310 break;
311 case BT_LEAD3:
312 *to++ = ((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f);
313 from += 3;
314 break;
315 case BT_LEAD4:
316 {
317 unsigned long n;
318 if (to + 1 == toLim)
319 break;
320 n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12) | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
321 n -= 0x10000;
322 to[0] = (unsigned short)((n >> 10) | 0xD800);
323 to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
324 to += 2;
325 from += 4;
326 }
327 break;
328 default:
329 *to++ = *from++;
330 break;
331 }
332 }
333 *fromP = from;
334 *toP = to;
335 }
336
/* UTF-8 encoding object compiled only when XML_NS is defined.  The byte
   table is asciitab.h followed by utf8tab.h with BT_COLON left as
   defined by the headers.  The integers after the converters are further
   ENCODING fields (presumably minBytesPerChar and the isUtf8/isUtf16
   flags - confirm against xmltok.h). */
337 #ifdef XML_NS
338 static const struct normal_encoding utf8_encoding_ns = {
339 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
340 {
341 #include "asciitab.h"
342 #include "utf8tab.h"
343 },
344 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
345 };
346 #endif
347
/* UTF-8 encoding object.  While asciitab.h is included BT_COLON is
   defined as BT_NMSTRT, so whatever entry the header spells BT_COLON
   (presumably the one for ':') is classified as a name-start character
   in this table.  initUpdatePosition() uses this object. */
348 static const struct normal_encoding utf8_encoding = {
349 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
350 {
351 #define BT_COLON BT_NMSTRT
352 #include "asciitab.h"
353 #undef BT_COLON
354 #include "utf8tab.h"
355 },
356 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
357 };
358
/* XML_NS-only UTF-8 encoding object whose ASCII half comes from
   iasciitab.h instead of asciitab.h; otherwise identical in layout to
   utf8_encoding_ns.  (How iasciitab.h differs is not visible here.) */
359 #ifdef XML_NS
360
361 static const struct normal_encoding internal_utf8_encoding_ns = {
362 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
363 {
364 #include "iasciitab.h"
365 #include "utf8tab.h"
366 },
367 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
368 };
369
370 #endif
371
/* UTF-8 encoding object built from iasciitab.h, with BT_COLON defined
   as BT_NMSTRT for the duration of that include (same substitution as in
   utf8_encoding). */
372 static const struct normal_encoding internal_utf8_encoding = {
373 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
374 {
375 #define BT_COLON BT_NMSTRT
376 #include "iasciitab.h"
377 #undef BT_COLON
378 #include "utf8tab.h"
379 },
380 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
381 };
382
383 static
384 void latin1_toUtf8(const ENCODING *enc,
385 const char **fromP, const char *fromLim,
386 char **toP, const char *toLim)
387 {
388 for (;;) {
389 unsigned char c;
390 if (*fromP == fromLim)
391 break;
392 c = (unsigned char)**fromP;
393 if (c & 0x80) {
394 if (toLim - *toP < 2)
395 break;
396 *(*toP)++ = ((c >> 6) | UTF8_cval2);
397 *(*toP)++ = ((c & 0x3f) | 0x80);
398 (*fromP)++;
399 }
400 else {
401 if (*toP == toLim)
402 break;
403 *(*toP)++ = *(*fromP)++;
404 }
405 }
406 }
407
408 static
409 void latin1_toUtf16(const ENCODING *enc,
410 const char **fromP, const char *fromLim,
411 unsigned short **toP, const unsigned short *toLim)
412 {
413 while (*fromP != fromLim && *toP != toLim)
414 *(*toP)++ = (unsigned char)*(*fromP)++;
415 }
416
/* XML_NS-only Latin-1 encoding object: asciitab.h followed by
   latin1tab.h.  Only STANDARD_VTABLE(sb_) is supplied, so the nine
   multi-byte predicate pointers are left zero-initialized. */
417 #ifdef XML_NS
418
419 static const struct normal_encoding latin1_encoding_ns = {
420 { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
421 {
422 #include "asciitab.h"
423 #include "latin1tab.h"
424 },
425 STANDARD_VTABLE(sb_)
426 };
427
428 #endif
429
/* Latin-1 encoding object, with BT_COLON defined as BT_NMSTRT while
   asciitab.h is included.  Its type[] table is also consulted by
   checkCharRefNumber() and XmlInitUnknownEncoding().  The multi-byte
   predicate pointers are left zero-initialized. */
430 static const struct normal_encoding latin1_encoding = {
431 { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
432 {
433 #define BT_COLON BT_NMSTRT
434 #include "asciitab.h"
435 #undef BT_COLON
436 #include "latin1tab.h"
437 },
438 STANDARD_VTABLE(sb_)
439 };
440
441 static
442 void ascii_toUtf8(const ENCODING *enc,
443 const char **fromP, const char *fromLim,
444 char **toP, const char *toLim)
445 {
446 while (*fromP != fromLim && *toP != toLim)
447 *(*toP)++ = *(*fromP)++;
448 }
449
/* XML_NS-only US-ASCII encoding object.  Only asciitab.h is included, so
   the remaining type[] entries are zero-initialized, which the comment
   inside the table notes is BT_NONXML.  UTF-16 output reuses
   latin1_toUtf16. */
450 #ifdef XML_NS
451
452 static const struct normal_encoding ascii_encoding_ns = {
453 { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
454 {
455 #include "asciitab.h"
456 /* BT_NONXML == 0 */
457 },
458 STANDARD_VTABLE(sb_)
459 };
460
461 #endif
462
/* US-ASCII encoding object, with BT_COLON defined as BT_NMSTRT while
   asciitab.h is included.  Entries not covered by asciitab.h are
   zero-initialized (BT_NONXML per the comment inside the table).
   UTF-16 output reuses latin1_toUtf16. */
463 static const struct normal_encoding ascii_encoding = {
464 { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
465 {
466 #define BT_COLON BT_NMSTRT
467 #include "asciitab.h"
468 #undef BT_COLON
469 /* BT_NONXML == 0 */
470 },
471 STANDARD_VTABLE(sb_)
472 };
473
474 static int unicode_byte_type(char hi, char lo)
475 {
476 switch ((unsigned char)hi) {
477 case 0xD8: case 0xD9: case 0xDA: case 0xDB:
478 return BT_LEAD4;
479 case 0xDC: case 0xDD: case 0xDE: case 0xDF:
480 return BT_TRAIL;
481 case 0xFF:
482 switch ((unsigned char)lo) {
483 case 0xFF:
484 case 0xFE:
485 return BT_NONXML;
486 }
487 break;
488 }
489 return BT_NONASCII;
490 }
491
/* DEFINE_UTF16_TO_UTF8(E) defines the static function E##toUtf8, which
   converts UTF-16 code units from [*fromP, fromLim) into UTF-8 bytes in
   [*toP, toLim).  Byte order is supplied by whatever GET_LO / GET_HI are
   defined as where the macro is expanded.
   The input is walked two bytes at a time and dispatched on the high
   byte of the unit:
     0 with low byte < 0x80 -> 1 output byte
     0 (otherwise) .. 0x7   -> 2 output bytes
     0xD8 .. 0xDB           -> 4 output bytes, built from this unit and
                               the one that follows it
     anything else          -> 3 output bytes
   Before writing, the remaining output space is checked; if the
   character does not fit, *fromP is set to the start of that character
   and the function returns.  Otherwise *fromP is set when the loop ends.
   NOTE(review): in the 0xD8..0xDB case the following unit is read after
   "from += 2" without comparing from against fromLim, and the loop only
   ends when from == fromLim exactly, so the input is expected to hold an
   even number of bytes and complete surrogate pairs. */
492 #define DEFINE_UTF16_TO_UTF8(E) \
493 static \
494 void E ## toUtf8(const ENCODING *enc, \
495 const char **fromP, const char *fromLim, \
496 char **toP, const char *toLim) \
497 { \
498 const char *from; \
499 for (from = *fromP; from != fromLim; from += 2) { \
500 int plane; \
501 unsigned char lo2; \
502 unsigned char lo = GET_LO(from); \
503 unsigned char hi = GET_HI(from); \
504 switch (hi) { \
505 case 0: \
506 if (lo < 0x80) { \
507 if (*toP == toLim) { \
508 *fromP = from; \
509 return; \
510 } \
511 *(*toP)++ = lo; \
512 break; \
513 } \
514 /* fall through */ \
515 case 0x1: case 0x2: case 0x3: \
516 case 0x4: case 0x5: case 0x6: case 0x7: \
517 if (toLim - *toP < 2) { \
518 *fromP = from; \
519 return; \
520 } \
521 *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \
522 *(*toP)++ = ((lo & 0x3f) | 0x80); \
523 break; \
524 default: \
525 if (toLim - *toP < 3) { \
526 *fromP = from; \
527 return; \
528 } \
529 /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
530 *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
531 *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
532 *(*toP)++ = ((lo & 0x3f) | 0x80); \
533 break; \
534 case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
535 if (toLim - *toP < 4) { \
536 *fromP = from; \
537 return; \
538 } \
539 plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
540 *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
541 *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
542 from += 2; \
543 lo2 = GET_LO(from); \
544 *(*toP)++ = (((lo & 0x3) << 4) \
545 | ((GET_HI(from) & 0x3) << 2) \
546 | (lo2 >> 6) \
547 | 0x80); \
548 *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
549 break; \
550 } \
551 } \
552 *fromP = from; \
553 }
554
/* DEFINE_UTF16_TO_UTF16(E) defines the static function E##toUtf16, which
   copies UTF-16 code units from the byte range [*fromP, fromLim) into
   the unsigned short range [*toP, toLim), assembling each unit from
   GET_HI / GET_LO as defined where the macro is expanded.
   When the input holds more units than the output can take and the last
   input unit has a high byte in 0xD8..0xDF, fromLim is pulled back by
   one unit first.  Copying then runs until either range is exhausted;
   *fromP and *toP are advanced in place. */
555 #define DEFINE_UTF16_TO_UTF16(E) \
556 static \
557 void E ## toUtf16(const ENCODING *enc, \
558 const char **fromP, const char *fromLim, \
559 unsigned short **toP, const unsigned short *toLim) \
560 { \
561 /* Avoid copying first half only of surrogate */ \
562 if (fromLim - *fromP > ((toLim - *toP) << 1) \
563 && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
564 fromLim -= 2; \
565 for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \
566 *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
567 }
568
/* Little-endian instantiation: the low byte of a code unit is byte 0 and
   the high byte is byte 1.  Expanding the two generator macros defines
   little2_toUtf8 and little2_toUtf16; the accessors are removed again
   afterwards. */
569 #define SET2(ptr, ch) \
570 (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
571 #define GET_LO(ptr) ((unsigned char)(ptr)[0])
572 #define GET_HI(ptr) ((unsigned char)(ptr)[1])
573
574 DEFINE_UTF16_TO_UTF8(little2_)
575 DEFINE_UTF16_TO_UTF16(little2_)
576
577 #undef SET2
578 #undef GET_LO
579 #undef GET_HI
580
/* Big-endian instantiation: the high byte of a code unit is byte 0 and
   the low byte is byte 1.  Expanding the two generator macros defines
   big2_toUtf8 and big2_toUtf16; the accessors are removed again
   afterwards. */
581 #define SET2(ptr, ch) \
582 (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
583 #define GET_LO(ptr) ((unsigned char)(ptr)[1])
584 #define GET_HI(ptr) ((unsigned char)(ptr)[0])
585
586 DEFINE_UTF16_TO_UTF8(big2_)
587 DEFINE_UTF16_TO_UTF16(big2_)
588
589 #undef SET2
590 #undef GET_LO
591 #undef GET_HI
592
/* Character helpers for little-endian 16-bit input (p[0] is the low
   byte, p[1] the high byte):
     LITTLE2_BYTE_TYPE      - high byte zero: look the low byte up in the
                              encoding's type[] table; otherwise classify
                              with unicode_byte_type(high, low).
     LITTLE2_BYTE_TO_ASCII  - the low byte when the high byte is zero,
                              else -1.
     LITTLE2_CHAR_MATCHES   - high byte zero and low byte equal to c.
     LITTLE2_IS_NAME_CHAR_MINBPC / LITTLE2_IS_NMSTRT_CHAR_MINBPC
                            - naming-bitmap lookups on (high, low). */
593 #define LITTLE2_BYTE_TYPE(enc, p) \
594 ((p)[1] == 0 \
595 ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
596 : unicode_byte_type((p)[1], (p)[0]))
597 #define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
598 #define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c)
599 #define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
600 UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])
601 #define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
602 UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])
603
/* Two ways of providing the little-endian tokenizer.
   With XML_MIN_SIZE: the LITTLE2_* helpers are wrapped in small
   functions (for the function-pointer members filled in through
   STANDARD_VTABLE(little2_)), and VTABLE is redefined to VTABLE1 plus
   the little2_ converters; PREFIX is left as it was.
   Without XML_MIN_SIZE: the tokenizer template is included a second time
   with the "little2_" name prefix, a fixed MINBPC of 2 and the LITTLE2_*
   helpers substituted directly; the multi-byte name tests are the
   constant 0. */
604 #ifdef XML_MIN_SIZE
605
606 static
607 int little2_byteType(const ENCODING *enc, const char *p)
608 {
609 return LITTLE2_BYTE_TYPE(enc, p);
610 }
611
612 static
613 int little2_byteToAscii(const ENCODING *enc, const char *p)
614 {
615 return LITTLE2_BYTE_TO_ASCII(enc, p);
616 }
617
618 static
619 int little2_charMatches(const ENCODING *enc, const char *p, int c)
620 {
621 return LITTLE2_CHAR_MATCHES(enc, p, c);
622 }
623
624 static
625 int little2_isNameMin(const ENCODING *enc, const char *p)
626 {
627 return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
628 }
629
630 static
631 int little2_isNmstrtMin(const ENCODING *enc, const char *p)
632 {
633 return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
634 }
635
636 #undef VTABLE
637 #define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
638
639 #else /* not XML_MIN_SIZE */
640
641 #undef PREFIX
642 #define PREFIX(ident) little2_ ## ident
643 #define MINBPC(enc) 2
644 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
645 #define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
646 #define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
647 #define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
648 #define IS_NAME_CHAR(enc, p, n) 0
649 #define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
650 #define IS_NMSTRT_CHAR(enc, p, n) (0)
651 #define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)
652
653 #include "xmltok_impl_c.h"
654
655 #undef MINBPC
656 #undef BYTE_TYPE
657 #undef BYTE_TO_ASCII
658 #undef CHAR_MATCHES
659 #undef IS_NAME_CHAR
660 #undef IS_NAME_CHAR_MINBPC
661 #undef IS_NMSTRT_CHAR
662 #undef IS_NMSTRT_CHAR_MINBPC
663 #undef IS_INVALID_CHAR
664
665 #endif /* not XML_MIN_SIZE */
666
/* XML_NS-only little-endian 16-bit encoding object.  The last ENCODING
   field is 1 when XML_BYTE_ORDER == 12 and 0 otherwise (presumably the
   "is native UTF-16" flag - confirm against xmltok.h).  The type[] table
   is asciitab.h followed by latin1tab.h. */
667 #ifdef XML_NS
668
669 static const struct normal_encoding little2_encoding_ns = {
670 { VTABLE, 2, 0,
671 #if XML_BYTE_ORDER == 12
672 1
673 #else
674 0
675 #endif
676 },
677 {
678 #include "asciitab.h"
679 #include "latin1tab.h"
680 },
681 STANDARD_VTABLE(little2_)
682 };
683
684 #endif
685
/* Little-endian 16-bit encoding object, with BT_COLON defined as
   BT_NMSTRT while asciitab.h is included.  The last ENCODING field is 1
   when XML_BYTE_ORDER == 12 and 0 otherwise. */
686 static const struct normal_encoding little2_encoding = {
687 { VTABLE, 2, 0,
688 #if XML_BYTE_ORDER == 12
689 1
690 #else
691 0
692 #endif
693 },
694 {
695 #define BT_COLON BT_NMSTRT
696 #include "asciitab.h"
697 #undef BT_COLON
698 #include "latin1tab.h"
699 },
700 STANDARD_VTABLE(little2_)
701 };
702
/* Little-endian encoding objects built from iasciitab.h.  They are
   compiled unless XML_BYTE_ORDER is 21, and always carry 1 in the last
   ENCODING field.  The _ns variant exists only with XML_NS; the plain
   variant defines BT_COLON as BT_NMSTRT while iasciitab.h is
   included. */
703 #if XML_BYTE_ORDER != 21
704
705 #ifdef XML_NS
706
707 static const struct normal_encoding internal_little2_encoding_ns = {
708 { VTABLE, 2, 0, 1 },
709 {
710 #include "iasciitab.h"
711 #include "latin1tab.h"
712 },
713 STANDARD_VTABLE(little2_)
714 };
715
716 #endif
717
718 static const struct normal_encoding internal_little2_encoding = {
719 { VTABLE, 2, 0, 1 },
720 {
721 #define BT_COLON BT_NMSTRT
722 #include "iasciitab.h"
723 #undef BT_COLON
724 #include "latin1tab.h"
725 },
726 STANDARD_VTABLE(little2_)
727 };
728
729 #endif
730
731
/* Character helpers for big-endian 16-bit input (p[0] is the high byte,
   p[1] the low byte):
     BIG2_BYTE_TYPE      - high byte zero: look the low byte up in the
                           encoding's type[] table; otherwise classify
                           with unicode_byte_type(high, low).
     BIG2_BYTE_TO_ASCII  - the low byte when the high byte is zero,
                           else -1.
     BIG2_CHAR_MATCHES   - high byte zero and low byte equal to c.
     BIG2_IS_NAME_CHAR_MINBPC / BIG2_IS_NMSTRT_CHAR_MINBPC
                         - naming-bitmap lookups on (high, low). */
732 #define BIG2_BYTE_TYPE(enc, p) \
733 ((p)[0] == 0 \
734 ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
735 : unicode_byte_type((p)[0], (p)[1]))
736 #define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
737 #define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c)
738 #define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
739 UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])
740 #define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
741 UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])
742
/* Two ways of providing the big-endian tokenizer.
   With XML_MIN_SIZE: the BIG2_* helpers are wrapped in small functions
   (for the function-pointer members filled in through
   STANDARD_VTABLE(big2_)), and VTABLE is redefined to VTABLE1 plus the
   big2_ converters; PREFIX is left as it was.
   Without XML_MIN_SIZE: the tokenizer template is included again with
   the "big2_" name prefix, a fixed MINBPC of 2 and the BIG2_* helpers
   substituted directly; the multi-byte name tests are the constant 0. */
743 #ifdef XML_MIN_SIZE
744
745 static
746 int big2_byteType(const ENCODING *enc, const char *p)
747 {
748 return BIG2_BYTE_TYPE(enc, p);
749 }
750
751 static
752 int big2_byteToAscii(const ENCODING *enc, const char *p)
753 {
754 return BIG2_BYTE_TO_ASCII(enc, p);
755 }
756
757 static
758 int big2_charMatches(const ENCODING *enc, const char *p, int c)
759 {
760 return BIG2_CHAR_MATCHES(enc, p, c);
761 }
762
763 static
764 int big2_isNameMin(const ENCODING *enc, const char *p)
765 {
766 return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
767 }
768
769 static
770 int big2_isNmstrtMin(const ENCODING *enc, const char *p)
771 {
772 return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
773 }
774
775 #undef VTABLE
776 #define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
777
778 #else /* not XML_MIN_SIZE */
779
780 #undef PREFIX
781 #define PREFIX(ident) big2_ ## ident
782 #define MINBPC(enc) 2
783 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
784 #define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
785 #define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
786 #define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
787 #define IS_NAME_CHAR(enc, p, n) 0
788 #define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
789 #define IS_NMSTRT_CHAR(enc, p, n) (0)
790 #define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)
791
792 #include "xmltok_impl_c.h"
793
794 #undef MINBPC
795 #undef BYTE_TYPE
796 #undef BYTE_TO_ASCII
797 #undef CHAR_MATCHES
798 #undef IS_NAME_CHAR
799 #undef IS_NAME_CHAR_MINBPC
800 #undef IS_NMSTRT_CHAR
801 #undef IS_NMSTRT_CHAR_MINBPC
802 #undef IS_INVALID_CHAR
803
804 #endif /* not XML_MIN_SIZE */
805
/* XML_NS-only big-endian 16-bit encoding object.  The last ENCODING
   field is 1 when XML_BYTE_ORDER == 21 and 0 otherwise (presumably the
   "is native UTF-16" flag - confirm against xmltok.h).  The type[] table
   is asciitab.h followed by latin1tab.h. */
806 #ifdef XML_NS
807
808 static const struct normal_encoding big2_encoding_ns = {
809 { VTABLE, 2, 0,
810 #if XML_BYTE_ORDER == 21
811 1
812 #else
813 0
814 #endif
815 },
816 {
817 #include "asciitab.h"
818 #include "latin1tab.h"
819 },
820 STANDARD_VTABLE(big2_)
821 };
822
823 #endif
824
/* Big-endian 16-bit encoding object, with BT_COLON defined as BT_NMSTRT
   while asciitab.h is included.  The last ENCODING field is 1 when
   XML_BYTE_ORDER == 21 and 0 otherwise. */
825 static const struct normal_encoding big2_encoding = {
826 { VTABLE, 2, 0,
827 #if XML_BYTE_ORDER == 21
828 1
829 #else
830 0
831 #endif
832 },
833 {
834 #define BT_COLON BT_NMSTRT
835 #include "asciitab.h"
836 #undef BT_COLON
837 #include "latin1tab.h"
838 },
839 STANDARD_VTABLE(big2_)
840 };
841
/* Big-endian encoding objects built from iasciitab.h.  They are compiled
   unless XML_BYTE_ORDER is 12, and always carry 1 in the last ENCODING
   field.  The _ns variant exists only with XML_NS; the plain variant
   defines BT_COLON as BT_NMSTRT while iasciitab.h is included.
   PREFIX is undefined afterwards: no further template instantiation
   follows. */
842 #if XML_BYTE_ORDER != 12
843
844 #ifdef XML_NS
845
846 static const struct normal_encoding internal_big2_encoding_ns = {
847 { VTABLE, 2, 0, 1 },
848 {
849 #include "iasciitab.h"
850 #include "latin1tab.h"
851 },
852 STANDARD_VTABLE(big2_)
853 };
854
855 #endif
856
857 static const struct normal_encoding internal_big2_encoding = {
858 { VTABLE, 2, 0, 1 },
859 {
860 #define BT_COLON BT_NMSTRT
861 #include "iasciitab.h"
862 #undef BT_COLON
863 #include "latin1tab.h"
864 },
865 STANDARD_VTABLE(big2_)
866 };
867
868 #endif
869
870 #undef PREFIX
871
/* Compare two NUL-terminated strings for equality, ignoring the case of
   the letters a-z only (no locale is involved).
   Returns 1 when the strings are equal under that rule, 0 otherwise. */
static int
streqci(const char *s1, const char *s2)
{
  const int toUpper = 'A' - 'a';
  char a;
  char b;

  do {
    a = *s1++;
    b = *s2++;
    if (a >= 'a' && a <= 'z')
      a += toUpper;
    if (b >= 'a' && b <= 'z')
      b += toUpper;
    if (a != b)
      return 0;
  } while (a != '\0');
  return 1;
}
889
890 static
891 void initUpdatePosition(const ENCODING *enc, const char *ptr,
892 const char *end, POSITION *pos)
893 {
894 normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
895 }
896
897 static
898 int toAscii(const ENCODING *enc, const char *ptr, const char *end)
899 {
900 char buf[1];
901 char *p = buf;
902 XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
903 if (p == buf)
904 return -1;
905 else
906 return buf[0];
907 }
908
/* Return 1 when c is a space (0x20), carriage return (0xD), line feed
   (0xA) or tab (0x9); 0 for every other value, including -1. */
static int
isSpace(int c)
{
  return c == 0x20 || c == 0xD || c == 0xA || c == 0x9;
}
921
922 /* Return 1 if there's just optional white space
923 or there's an S followed by name=val. */
924 static
925 int parsePseudoAttribute(const ENCODING *enc,
926 const char *ptr,
927 const char *end,
928 const char **namePtr,
929 const char **valPtr,
930 const char **nextTokPtr)
931 {
932 int c;
933 char open;
934 if (ptr == end) {
935 *namePtr = 0;
936 return 1;
937 }
938 if (!isSpace(toAscii(enc, ptr, end))) {
939 *nextTokPtr = ptr;
940 return 0;
941 }
942 do {
943 ptr += enc->minBytesPerChar;
944 } while (isSpace(toAscii(enc, ptr, end)));
945 if (ptr == end) {
946 *namePtr = 0;
947 return 1;
948 }
949 *namePtr = ptr;
950 for (;;) {
951 c = toAscii(enc, ptr, end);
952 if (c == -1) {
953 *nextTokPtr = ptr;
954 return 0;
955 }
956 if (c == '=')
957 break;
958 if (isSpace(c)) {
959 do {
960 ptr += enc->minBytesPerChar;
961 } while (isSpace(c = toAscii(enc, ptr, end)));
962 if (c != '=') {
963 *nextTokPtr = ptr;
964 return 0;
965 }
966 break;
967 }
968 ptr += enc->minBytesPerChar;
969 }
970 if (ptr == *namePtr) {
971 *nextTokPtr = ptr;
972 return 0;
973 }
974 ptr += enc->minBytesPerChar;
975 c = toAscii(enc, ptr, end);
976 while (isSpace(c)) {
977 ptr += enc->minBytesPerChar;
978 c = toAscii(enc, ptr, end);
979 }
980 if (c != '"' && c != '\'') {
981 *nextTokPtr = ptr;
982 return 0;
983 }
984 open = c;
985 ptr += enc->minBytesPerChar;
986 *valPtr = ptr;
987 for (;; ptr += enc->minBytesPerChar) {
988 c = toAscii(enc, ptr, end);
989 if (c == open)
990 break;
991 if (!('a' <= c && c <= 'z')
992 && !('A' <= c && c <= 'Z')
993 && !('0' <= c && c <= '9')
994 && c != '.'
995 && c != '-'
996 && c != '_') {
997 *nextTokPtr = ptr;
998 return 0;
999 }
1000 }
1001 *nextTokPtr = ptr + enc->minBytesPerChar;
1002 return 1;
1003 }
1004
1005 static
1006 int doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *,
1007 const char *,
1008 const char *),
1009 int isGeneralTextEntity,
1010 const ENCODING *enc,
1011 const char *ptr,
1012 const char *end,
1013 const char **badPtr,
1014 const char **versionPtr,
1015 const char **encodingName,
1016 const ENCODING **encoding,
1017 int *standalone)
1018 {
1019 const char *val = 0;
1020 const char *name = 0;
1021 ptr += 5 * enc->minBytesPerChar;
1022 end -= 2 * enc->minBytesPerChar;
1023 if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr) || !name) {
1024 *badPtr = ptr;
1025 return 0;
1026 }
1027 if (!XmlNameMatchesAscii(enc, name, "version")) {
1028 if (!isGeneralTextEntity) {
1029 *badPtr = name;
1030 return 0;
1031 }
1032 }
1033 else {
1034 if (versionPtr)
1035 *versionPtr = val;
1036 if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr)) {
1037 *badPtr = ptr;
1038 return 0;
1039 }
1040 if (!name) {
1041 if (isGeneralTextEntity) {
1042 /* a TextDecl must have an EncodingDecl */
1043 *badPtr = ptr;
1044 return 0;
1045 }
1046 return 1;
1047 }
1048 }
1049 if (XmlNameMatchesAscii(enc, name, "encoding")) {
1050 int c = toAscii(enc, val, end);
1051 if (!('a' <= c && c <= 'z') && !('A' <= c && c <= 'Z')) {
1052 *badPtr = val;
1053 return 0;
1054 }
1055 if (encodingName)
1056 *encodingName = val;
1057 if (encoding)
1058 *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
1059 if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr)) {
1060 *badPtr = ptr;
1061 return 0;
1062 }
1063 if (!name)
1064 return 1;
1065 }
1066 if (!XmlNameMatchesAscii(enc, name, "standalone") || isGeneralTextEntity) {
1067 *badPtr = name;
1068 return 0;
1069 }
1070 if (XmlNameMatchesAscii(enc, val, "yes")) {
1071 if (standalone)
1072 *standalone = 1;
1073 }
1074 else if (XmlNameMatchesAscii(enc, val, "no")) {
1075 if (standalone)
1076 *standalone = 0;
1077 }
1078 else {
1079 *badPtr = val;
1080 return 0;
1081 }
1082 while (isSpace(toAscii(enc, ptr, end)))
1083 ptr += enc->minBytesPerChar;
1084 if (ptr != end) {
1085 *badPtr = ptr;
1086 return 0;
1087 }
1088 return 1;
1089 }
1090
1091 static
1092 int checkCharRefNumber(int result)
1093 {
1094 switch (result >> 8) {
1095 case 0xD8: case 0xD9: case 0xDA: case 0xDB:
1096 case 0xDC: case 0xDD: case 0xDE: case 0xDF:
1097 return -1;
1098 case 0:
1099 if (latin1_encoding.type[result] == BT_NONXML)
1100 return -1;
1101 break;
1102 case 0xFF:
1103 if (result == 0xFFFE || result == 0xFFFF)
1104 return -1;
1105 break;
1106 }
1107 return result;
1108 }
1109
1110 int XmlUtf8Encode(int c, char *buf)
1111 {
1112 enum {
1113 /* minN is minimum legal resulting value for N byte sequence */
1114 min2 = 0x80,
1115 min3 = 0x800,
1116 min4 = 0x10000
1117 };
1118
1119 if (c < 0)
1120 return 0;
1121 if (c < min2) {
1122 buf[0] = (c | UTF8_cval1);
1123 return 1;
1124 }
1125 if (c < min3) {
1126 buf[0] = ((c >> 6) | UTF8_cval2);
1127 buf[1] = ((c & 0x3f) | 0x80);
1128 return 2;
1129 }
1130 if (c < min4) {
1131 buf[0] = ((c >> 12) | UTF8_cval3);
1132 buf[1] = (((c >> 6) & 0x3f) | 0x80);
1133 buf[2] = ((c & 0x3f) | 0x80);
1134 return 3;
1135 }
1136 if (c < 0x110000) {
1137 buf[0] = ((c >> 18) | UTF8_cval4);
1138 buf[1] = (((c >> 12) & 0x3f) | 0x80);
1139 buf[2] = (((c >> 6) & 0x3f) | 0x80);
1140 buf[3] = ((c & 0x3f) | 0x80);
1141 return 4;
1142 }
1143 return 0;
1144 }
1145
/* Encode the character number charNum as UTF-16 into buf and return the
   number of code units written: 1 for values below 0x10000, 2 (a
   surrogate pair) for values below 0x110000.  Returns 0, writing
   nothing, when charNum is negative or not below 0x110000. */
int XmlUtf16Encode(int charNum, unsigned short *buf)
{
  if (charNum < 0 || charNum >= 0x110000)
    return 0;

  if (charNum >= 0x10000) {
    const int offset = charNum - 0x10000;
    buf[0] = (offset >> 10) + 0xD800;
    buf[1] = (offset & 0x3FF) + 0xDC00;
    return 2;
  }

  buf[0] = charNum;
  return 1;
}
1162
/* Encoding object for a caller-described encoding.
     normal   - the embedded struct normal_encoding; it is the first
                member, and the unknown_* functions cast an ENCODING
                pointer back to this struct.
     convert  - callback invoked with userData and a pointer to the input
                whenever the per-byte tables below hold no direct answer;
                its int result is used as a character number.
     userData - passed through to convert unchanged.
     utf16    - per-byte UTF-16 code unit; 0 means "call convert".
     utf8     - per-byte UTF-8 data: element [0] is the length (0 means
                "call convert") and the following elements are the
                bytes. */
1163 struct unknown_encoding {
1164 struct normal_encoding normal;
1165 int (*convert)(void *userData, const char *p);
1166 void *userData;
1167 unsigned short utf16[256];
1168 char utf8[256][4];
1169 };
1170
1171 int XmlSizeOfUnknownEncoding()
1172 {
1173 return sizeof(struct unknown_encoding);
1174 }
1175
1176 static
1177 int unknown_isName(const ENCODING *enc, const char *p)
1178 {
1179 int c = ((const struct unknown_encoding *)enc)
1180 ->convert(((const struct unknown_encoding *)enc)->userData, p);
1181 if (c & ~0xFFFF)
1182 return 0;
1183 return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1184 }
1185
1186 static
1187 int unknown_isNmstrt(const ENCODING *enc, const char *p)
1188 {
1189 int c = ((const struct unknown_encoding *)enc)
1190 ->convert(((const struct unknown_encoding *)enc)->userData, p);
1191 if (c & ~0xFFFF)
1192 return 0;
1193 return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1194 }
1195
1196 static
1197 int unknown_isInvalid(const ENCODING *enc, const char *p)
1198 {
1199 int c = ((const struct unknown_encoding *)enc)
1200 ->convert(((const struct unknown_encoding *)enc)->userData, p);
1201 return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1202 }
1203
1204 static
1205 void unknown_toUtf8(const ENCODING *enc,
1206 const char **fromP, const char *fromLim,
1207 char **toP, const char *toLim)
1208 {
1209 char buf[XML_UTF8_ENCODE_MAX];
1210 for (;;) {
1211 const char *utf8;
1212 int n;
1213 if (*fromP == fromLim)
1214 break;
1215 utf8 = ((const struct unknown_encoding *)enc)->utf8[(unsigned char)**fromP];
1216 n = *utf8++;
1217 if (n == 0) {
1218 int c = ((const struct unknown_encoding *)enc)
1219 ->convert(((const struct unknown_encoding *)enc)->userData, *fromP);
1220 n = XmlUtf8Encode(c, buf);
1221 if (n > toLim - *toP)
1222 break;
1223 utf8 = buf;
1224 *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP]
1225 - (BT_LEAD2 - 2);
1226 }
1227 else {
1228 if (n > toLim - *toP)
1229 break;
1230 (*fromP)++;
1231 }
1232 do {
1233 *(*toP)++ = *utf8++;
1234 } while (--n != 0);
1235 }
1236 }
1237
1238 static
1239 void unknown_toUtf16(const ENCODING *enc,
1240 const char **fromP, const char *fromLim,
1241 unsigned short **toP, const unsigned short *toLim)
1242 {
1243 while (*fromP != fromLim && *toP != toLim) {
1244 unsigned short c
1245 = ((const struct unknown_encoding *)enc)->utf16[(unsigned char)**fromP];
1246 if (c == 0) {
1247 c = (unsigned short)((const struct unknown_encoding *)enc)
1248 ->convert(((const struct unknown_encoding *)enc)->userData, *fromP);
1249 *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP]
1250 - (BT_LEAD2 - 2);
1251 }
1252 else
1253 (*fromP)++;
1254 *(*toP)++ = c;
1255 }
1256 }
1257
1258 ENCODING *
1259 XmlInitUnknownEncoding(void *mem,
1260 int *table,
1261 int (*convert)(void *userData, const char *p),
1262 void *userData)
1263 {
1264 int i;
1265 struct unknown_encoding *e = mem;
1266 for (i = 0; i < sizeof(struct normal_encoding); i++)
1267 ((char *)mem)[i] = ((char *)&latin1_encoding)[i];
1268 for (i = 0; i < 128; i++)
1269 if (latin1_encoding.type[i] != BT_OTHER
1270 && latin1_encoding.type[i] != BT_NONXML
1271 && table[i] != i)
1272 return 0;
1273 for (i = 0; i < 256; i++) {
1274 int c = table[i];
1275 if (c == -1) {
1276 e->normal.type[i] = BT_MALFORM;
1277 /* This shouldn't really get used. */
1278 e->utf16[i] = 0xFFFF;
1279 e->utf8[i][0] = 1;
1280 e->utf8[i][1] = 0;
1281 }
1282 else if (c < 0) {
1283 if (c < -4)
1284 return 0;
1285 e->normal.type[i] = BT_LEAD2 - (c + 2);
1286 e->utf8[i][0] = 0;
1287 e->utf16[i] = 0;
1288 }
1289 else if (c < 0x80) {
1290 if (latin1_encoding.type[c] != BT_OTHER
1291 && latin1_encoding.type[c] != BT_NONXML
1292 && c != i)
1293 return 0;
1294 e->normal.type[i] = latin1_encoding.type[c];
1295 e->utf8[i][0] = 1;
1296 e->utf8[i][1] = (char)c;
1297 e->utf16[i] = c == 0 ? 0xFFFF : c;
1298 }
1299 else if (checkCharRefNumber(c) < 0) {
1300 e->normal.type[i] = BT_NONXML;
1301 /* This shouldn't really get used. */
1302 e->utf16[i] = 0xFFFF;
1303 e->utf8[i][0] = 1;
1304 e->utf8[i][1] = 0;
1305 }
1306 else {
1307 if (c > 0xFFFF)
1308 return 0;
1309 if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1310 e->normal.type[i] = BT_NMSTRT;
1311 else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1312 e->normal.type[i] = BT_NAME;
1313 else
1314 e->normal.type[i] = BT_OTHER;
1315 e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1316 e->utf16[i] = c;
1317 }
1318 }
1319 e->userData = userData;
1320 e->convert = convert;
1321 if (convert) {
1322 e->normal.isName2 = unknown_isName;
1323 e->normal.isName3 = unknown_isName;
1324 e->normal.isName4 = unknown_isName;
1325 e->normal.isNmstrt2 = unknown_isNmstrt;
1326 e->normal.isNmstrt3 = unknown_isNmstrt;
1327 e->normal.isNmstrt4 = unknown_isNmstrt;
1328 e->normal.isInvalid2 = unknown_isInvalid;
1329 e->normal.isInvalid3 = unknown_isInvalid;
1330 e->normal.isInvalid4 = unknown_isInvalid;
1331 }
1332 e->normal.enc.utf8Convert = unknown_toUtf8;
1333 e->normal.enc.utf16Convert = unknown_toUtf16;
1334 return &(e->normal.enc);
1335 }
1336
/* Encoding indices.  If this enumeration is changed, getEncodingIndex and
   encodings must also be changed. */
enum {
  UNKNOWN_ENC    = -1,
  /* The values 0..5 double as positions in getEncodingIndex's
     encodingNames table and must match it. */
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC   = 1,
  UTF_8_ENC      = 2,
  UTF_16_ENC     = 3,
  UTF_16BE_ENC   = 4,
  UTF_16LE_ENC   = 5,
  /* No name in encodingNames corresponds to this one. */
  NO_ENC         = 6
};
1350
1351 static
1352 int getEncodingIndex(const char *name)
1353 {
1354 static const char *encodingNames[] = {
1355 "ISO-8859-1",
1356 "US-ASCII",
1357 "UTF-8",
1358 "UTF-16",
1359 "UTF-16BE"
1360 "UTF-16LE",
1361 };
1362 int i;
1363 if (name == 0)
1364 return NO_ENC;
1365 for (i = 0; i < sizeof(encodingNames)/sizeof(encodingNames[0]); i++)
1366 if (streqci(name, encodingNames[i]))
1367 return i;
1368 return UNKNOWN_ENC;
1369 }
1370
1371 /* For binary compatibility, we store the index of the encoding specified
1372 at initialization in the isUtf16 member. */
1373
1374 #define INIT_ENC_INDEX(enc) ((enc)->initEnc.isUtf16)
1375
1376 /* This is what detects the encoding.
1377 encodingTable maps from encoding indices to encodings;
1378 INIT_ENC_INDEX(enc) is the index of the external (protocol) specified encoding;
1379 state is XML_CONTENT_STATE if we're parsing an external text entity,
1380 and XML_PROLOG_STATE otherwise.
1381 */
1382
1383
1384 static
1385 int initScan(const ENCODING **encodingTable,
1386 const INIT_ENCODING *enc,
1387 int state,
1388 const char *ptr,
1389 const char *end,
1390 const char **nextTokPtr)
1391 {
1392 const ENCODING **encPtr;
1393
1394 if (ptr == end)
1395 return XML_TOK_NONE;
1396 encPtr = enc->encPtr;
1397 if (ptr + 1 == end) {
1398 /* only a single byte available for auto-detection */
1399 /* a well-formed document entity must have more than one byte */
1400 if (state != XML_CONTENT_STATE)
1401 return XML_TOK_PARTIAL;
1402 /* so we're parsing an external text entity... */
1403 /* if UTF-16 was externally specified, then we need at least 2 bytes */
1404 switch (INIT_ENC_INDEX(enc)) {
1405 case UTF_16_ENC:
1406 case UTF_16LE_ENC:
1407 case UTF_16BE_ENC:
1408 return XML_TOK_PARTIAL;
1409 }
1410 switch ((unsigned char)*ptr) {
1411 case 0xFE:
1412 case 0xFF:
1413 case 0xEF: /* possibly first byte of UTF-8 BOM */
1414 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1415 && state == XML_CONTENT_STATE)
1416 break;
1417 /* fall through */
1418 case 0x00:
1419 case 0x3C:
1420 return XML_TOK_PARTIAL;
1421 }
1422 }
1423 else {
1424 switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
1425 case 0xFEFF:
1426 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1427 && state == XML_CONTENT_STATE)
1428 break;
1429 *nextTokPtr = ptr + 2;
1430 *encPtr = encodingTable[UTF_16BE_ENC];
1431 return XML_TOK_BOM;
1432 /* 00 3C is handled in the default case */
1433 case 0x3C00:
1434 if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
1435 || INIT_ENC_INDEX(enc) == UTF_16_ENC)
1436 && state == XML_CONTENT_STATE)
1437 break;
1438 *encPtr = encodingTable[UTF_16LE_ENC];
1439 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1440 case 0xFFFE:
1441 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1442 && state == XML_CONTENT_STATE)
1443 break;
1444 *nextTokPtr = ptr + 2;
1445 *encPtr = encodingTable[UTF_16LE_ENC];
1446 return XML_TOK_BOM;
1447 case 0xEFBB:
1448 /* Maybe a UTF-8 BOM (EF BB BF) */
1449 /* If there's an explicitly specified (external) encoding
1450 of ISO-8859-1 or some flavour of UTF-16
1451 and this is an external text entity,
1452 don't look for the BOM,
1453 because it might be a legal data. */
1454 if (state == XML_CONTENT_STATE) {
1455 int e = INIT_ENC_INDEX(enc);
1456 if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC || e == UTF_16_ENC)
1457 break;
1458 }
1459 if (ptr + 2 == end)
1460 return XML_TOK_PARTIAL;
1461 if ((unsigned char)ptr[2] == 0xBF) {
1462 *encPtr = encodingTable[UTF_8_ENC];
1463 return XML_TOK_BOM;
1464 }
1465 break;
1466 default:
1467 if (ptr[0] == '\0') {
1468 /* 0 isn't a legal data character. Furthermore a document entity can only
1469 start with ASCII characters. So the only way this can fail to be big-endian
1470 UTF-16 if it it's an external parsed general entity that's labelled as
1471 UTF-16LE. */
1472 if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
1473 break;
1474 *encPtr = encodingTable[UTF_16BE_ENC];
1475 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1476 }
1477 else if (ptr[1] == '\0') {
1478 /* We could recover here in the case:
1479 - parsing an external entity
1480 - second byte is 0
1481 - no externally specified encoding
1482 - no encoding declaration
1483 by assuming UTF-16LE. But we don't, because this would mean when
1484 presented just with a single byte, we couldn't reliably determine
1485 whether we needed further bytes. */
1486 if (state == XML_CONTENT_STATE)
1487 break;
1488 *encPtr = encodingTable[UTF_16LE_ENC];
1489 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1490 }
1491 break;
1492 }
1493 }
1494 *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
1495 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1496 }
1497
1498
1499 #define NS(x) x
1500 #define ns(x) x
1501 #include "xmltok_ns_c.h"
1502 #undef NS
1503 #undef ns
1504
1505 #ifdef XML_NS
1506
1507 #define NS(x) x ## NS
1508 #define ns(x) x ## _ns
1509
1510 #include "xmltok_ns_c.h"
1511
1512 #undef NS
1513 #undef ns
1514
1515 ENCODING *
1516 XmlInitUnknownEncodingNS(void *mem,
1517 int *table,
1518 int (*convert)(void *userData, const char *p),
1519 void *userData)
1520 {
1521 ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1522 if (enc)
1523 ((struct normal_encoding *)enc)->type[':'] = BT_COLON;
1524 return enc;
1525 }
1526
1527 #endif /* XML_NS */