00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019 #include "CUnicode.h"
00020 #include "CArch.h"
00021 #include <cstring>
00022
00023
00024
00025
00026
00027 inline
00028 static
00029 UInt16
00030 decode16(const UInt8* n, bool byteSwapped)
00031 {
00032 union x16 {
00033 UInt8 n8[2];
00034 UInt16 n16;
00035 } c;
00036 if (byteSwapped) {
00037 c.n8[0] = n[1];
00038 c.n8[1] = n[0];
00039 }
00040 else {
00041 c.n8[0] = n[0];
00042 c.n8[1] = n[1];
00043 }
00044 return c.n16;
00045 }
00046
00047 inline
00048 static
00049 UInt32
00050 decode32(const UInt8* n, bool byteSwapped)
00051 {
00052 union x32 {
00053 UInt8 n8[4];
00054 UInt32 n32;
00055 } c;
00056 if (byteSwapped) {
00057 c.n8[0] = n[3];
00058 c.n8[1] = n[2];
00059 c.n8[2] = n[1];
00060 c.n8[3] = n[0];
00061 }
00062 else {
00063 c.n8[0] = n[0];
00064 c.n8[1] = n[1];
00065 c.n8[2] = n[2];
00066 c.n8[3] = n[3];
00067 }
00068 return c.n32;
00069 }
00070
// Clear the caller's error flag.  A NULL pointer means the caller is
// not interested in errors, so there is nothing to do.
inline
static
void
resetError(bool* errors)
{
    if (errors == NULL) {
        return;
    }
    *errors = false;
}
00080
// Raise the caller's error flag.  A NULL pointer means the caller is
// not interested in errors, so there is nothing to do.
inline
static
void
setError(bool* errors)
{
    if (errors == NULL) {
        return;
    }
    *errors = true;
}
00090
00091
00092
00093
00094
00095
00096 UInt32 CUnicode::s_invalid = 0x0000ffff;
00097 UInt32 CUnicode::s_replacement = 0x0000fffd;
00098
00099 bool
00100 CUnicode::isUTF8(const CString& src)
00101 {
00102
00103 const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
00104 for (UInt32 n = (UInt32)src.size(); n > 0; ) {
00105 if (fromUTF8(data, n) == s_invalid) {
00106 return false;
00107 }
00108 }
00109 return true;
00110 }
00111
00112 CString
00113 CUnicode::UTF8ToUCS2(const CString& src, bool* errors)
00114 {
00115
00116 resetError(errors);
00117
00118
00119 UInt32 n = (UInt32)src.size();
00120 CString dst;
00121 dst.reserve(2 * n);
00122
00123
00124 const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
00125 while (n > 0) {
00126 UInt32 c = fromUTF8(data, n);
00127 if (c == s_invalid) {
00128 c = s_replacement;
00129 }
00130 else if (c >= 0x00010000) {
00131 setError(errors);
00132 c = s_replacement;
00133 }
00134 UInt16 ucs2 = static_cast<UInt16>(c);
00135 dst.append(reinterpret_cast<const char*>(&ucs2), 2);
00136 }
00137
00138 return dst;
00139 }
00140
00141 CString
00142 CUnicode::UTF8ToUCS4(const CString& src, bool* errors)
00143 {
00144
00145 resetError(errors);
00146
00147
00148 UInt32 n = (UInt32)src.size();
00149 CString dst;
00150 dst.reserve(4 * n);
00151
00152
00153 const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
00154 while (n > 0) {
00155 UInt32 c = fromUTF8(data, n);
00156 if (c == s_invalid) {
00157 c = s_replacement;
00158 }
00159 dst.append(reinterpret_cast<const char*>(&c), 4);
00160 }
00161
00162 return dst;
00163 }
00164
00165 CString
00166 CUnicode::UTF8ToUTF16(const CString& src, bool* errors)
00167 {
00168
00169 resetError(errors);
00170
00171
00172 UInt32 n = (UInt32)src.size();
00173 CString dst;
00174 dst.reserve(2 * n);
00175
00176
00177 const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
00178 while (n > 0) {
00179 UInt32 c = fromUTF8(data, n);
00180 if (c == s_invalid) {
00181 c = s_replacement;
00182 }
00183 else if (c >= 0x00110000) {
00184 setError(errors);
00185 c = s_replacement;
00186 }
00187 if (c < 0x00010000) {
00188 UInt16 ucs2 = static_cast<UInt16>(c);
00189 dst.append(reinterpret_cast<const char*>(&ucs2), 2);
00190 }
00191 else {
00192 c -= 0x00010000;
00193 UInt16 utf16h = static_cast<UInt16>((c >> 10) + 0xd800);
00194 UInt16 utf16l = static_cast<UInt16>((c & 0x03ff) + 0xdc00);
00195 dst.append(reinterpret_cast<const char*>(&utf16h), 2);
00196 dst.append(reinterpret_cast<const char*>(&utf16l), 2);
00197 }
00198 }
00199
00200 return dst;
00201 }
00202
00203 CString
00204 CUnicode::UTF8ToUTF32(const CString& src, bool* errors)
00205 {
00206
00207 resetError(errors);
00208
00209
00210 UInt32 n = (UInt32)src.size();
00211 CString dst;
00212 dst.reserve(4 * n);
00213
00214
00215 const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
00216 while (n > 0) {
00217 UInt32 c = fromUTF8(data, n);
00218 if (c == s_invalid) {
00219 c = s_replacement;
00220 }
00221 else if (c >= 0x00110000) {
00222 setError(errors);
00223 c = s_replacement;
00224 }
00225 dst.append(reinterpret_cast<const char*>(&c), 4);
00226 }
00227
00228 return dst;
00229 }
00230
00231 CString
00232 CUnicode::UTF8ToText(const CString& src, bool* errors)
00233 {
00234
00235 resetError(errors);
00236
00237
00238 UInt32 size;
00239 wchar_t* tmp = UTF8ToWideChar(src, size, errors);
00240
00241
00242 int len = ARCH->convStringWCToMB(NULL, tmp, size, errors);
00243 char* mbs = new char[len + 1];
00244 ARCH->convStringWCToMB(mbs, tmp, size, errors);
00245 CString text(mbs, len);
00246
00247
00248 delete[] mbs;
00249 delete[] tmp;
00250
00251 return text;
00252 }
00253
00254 CString
00255 CUnicode::UCS2ToUTF8(const CString& src, bool* errors)
00256 {
00257
00258 resetError(errors);
00259
00260
00261 UInt32 n = (UInt32)src.size() >> 1;
00262 return doUCS2ToUTF8(reinterpret_cast<const UInt8*>(src.data()), n, errors);
00263 }
00264
00265 CString
00266 CUnicode::UCS4ToUTF8(const CString& src, bool* errors)
00267 {
00268
00269 resetError(errors);
00270
00271
00272 UInt32 n = (UInt32)src.size() >> 2;
00273 return doUCS4ToUTF8(reinterpret_cast<const UInt8*>(src.data()), n, errors);
00274 }
00275
00276 CString
00277 CUnicode::UTF16ToUTF8(const CString& src, bool* errors)
00278 {
00279
00280 resetError(errors);
00281
00282
00283 UInt32 n = (UInt32)src.size() >> 1;
00284 return doUTF16ToUTF8(reinterpret_cast<const UInt8*>(src.data()), n, errors);
00285 }
00286
00287 CString
00288 CUnicode::UTF32ToUTF8(const CString& src, bool* errors)
00289 {
00290
00291 resetError(errors);
00292
00293
00294 UInt32 n = (UInt32)src.size() >> 2;
00295 return doUTF32ToUTF8(reinterpret_cast<const UInt8*>(src.data()), n, errors);
00296 }
00297
00298 CString
00299 CUnicode::textToUTF8(const CString& src, bool* errors)
00300 {
00301
00302 resetError(errors);
00303
00304
00305 UInt32 n = (UInt32)src.size();
00306 int len = ARCH->convStringMBToWC(NULL, src.c_str(), n, errors);
00307 wchar_t* wcs = new wchar_t[len + 1];
00308 ARCH->convStringMBToWC(wcs, src.c_str(), n, errors);
00309
00310
00311 CString utf8 = wideCharToUTF8(wcs, len, errors);
00312
00313
00314 delete[] wcs;
00315
00316 return utf8;
00317 }
00318
00319 wchar_t*
00320 CUnicode::UTF8ToWideChar(const CString& src, UInt32& size, bool* errors)
00321 {
00322
00323 CString tmp;
00324 switch (ARCH->getWideCharEncoding()) {
00325 case IArchString::kUCS2:
00326 tmp = UTF8ToUCS2(src, errors);
00327 size = (UInt32)tmp.size() >> 1;
00328 break;
00329
00330 case IArchString::kUCS4:
00331 tmp = UTF8ToUCS4(src, errors);
00332 size = (UInt32)tmp.size() >> 2;
00333 break;
00334
00335 case IArchString::kUTF16:
00336 tmp = UTF8ToUTF16(src, errors);
00337 size = (UInt32)tmp.size() >> 1;
00338 break;
00339
00340 case IArchString::kUTF32:
00341 tmp = UTF8ToUTF32(src, errors);
00342 size = (UInt32)tmp.size() >> 2;
00343 break;
00344
00345 default:
00346 assert(0 && "unknown wide character encoding");
00347 }
00348
00349
00350 wchar_t* dst = new wchar_t[size];
00351 ::memcpy(dst, tmp.data(), sizeof(wchar_t) * size);
00352 return dst;
00353 }
00354
00355 CString
00356 CUnicode::wideCharToUTF8(const wchar_t* src, UInt32 size, bool* errors)
00357 {
00358
00359
00360
00361 switch (ARCH->getWideCharEncoding()) {
00362 case IArchString::kUCS2:
00363 return doUCS2ToUTF8(reinterpret_cast<const UInt8*>(src), size, errors);
00364
00365 case IArchString::kUCS4:
00366 return doUCS4ToUTF8(reinterpret_cast<const UInt8*>(src), size, errors);
00367
00368 case IArchString::kUTF16:
00369 return doUTF16ToUTF8(reinterpret_cast<const UInt8*>(src), size, errors);
00370
00371 case IArchString::kUTF32:
00372 return doUTF32ToUTF8(reinterpret_cast<const UInt8*>(src), size, errors);
00373
00374 default:
00375 assert(0 && "unknown wide character encoding");
00376 return CString();
00377 }
00378 }
00379
00380 CString
00381 CUnicode::doUCS2ToUTF8(const UInt8* data, UInt32 n, bool* errors)
00382 {
00383
00384 CString dst;
00385 dst.reserve(n);
00386
00387
00388 bool byteSwapped = false;
00389 if (n >= 1) {
00390 switch (decode16(data, false)) {
00391 case 0x0000feff:
00392 data += 2;
00393 --n;
00394 break;
00395
00396 case 0x0000fffe:
00397 byteSwapped = true;
00398 data += 2;
00399 --n;
00400 break;
00401
00402 default:
00403 break;
00404 }
00405 }
00406
00407
00408 for (; n > 0; data += 2, --n) {
00409 UInt32 c = decode16(data, byteSwapped);
00410 toUTF8(dst, c, errors);
00411 }
00412
00413 return dst;
00414 }
00415
00416 CString
00417 CUnicode::doUCS4ToUTF8(const UInt8* data, UInt32 n, bool* errors)
00418 {
00419
00420 CString dst;
00421 dst.reserve(n);
00422
00423
00424 bool byteSwapped = false;
00425 if (n >= 1) {
00426 switch (decode32(data, false)) {
00427 case 0x0000feff:
00428 data += 4;
00429 --n;
00430 break;
00431
00432 case 0x0000fffe:
00433 byteSwapped = true;
00434 data += 4;
00435 --n;
00436 break;
00437
00438 default:
00439 break;
00440 }
00441 }
00442
00443
00444 for (; n > 0; data += 4, --n) {
00445 UInt32 c = decode32(data, byteSwapped);
00446 toUTF8(dst, c, errors);
00447 }
00448
00449 return dst;
00450 }
00451
00452 CString
00453 CUnicode::doUTF16ToUTF8(const UInt8* data, UInt32 n, bool* errors)
00454 {
00455
00456 CString dst;
00457 dst.reserve(n);
00458
00459
00460 bool byteSwapped = false;
00461 if (n >= 1) {
00462 switch (decode16(data, false)) {
00463 case 0x0000feff:
00464 data += 2;
00465 --n;
00466 break;
00467
00468 case 0x0000fffe:
00469 byteSwapped = true;
00470 data += 2;
00471 --n;
00472 break;
00473
00474 default:
00475 break;
00476 }
00477 }
00478
00479
00480 for (; n > 0; data += 2, --n) {
00481 UInt32 c = decode16(data, byteSwapped);
00482 if (c < 0x0000d800 || c > 0x0000dfff) {
00483 toUTF8(dst, c, errors);
00484 }
00485 else if (n == 1) {
00486
00487 setError(errors);
00488 toUTF8(dst, s_replacement, NULL);
00489 }
00490 else if (c >= 0x0000d800 && c <= 0x0000dbff) {
00491 UInt32 c2 = decode16(data, byteSwapped);
00492 data += 2;
00493 --n;
00494 if (c2 < 0x0000dc00 || c2 > 0x0000dfff) {
00495
00496 setError(errors);
00497 toUTF8(dst, s_replacement, NULL);
00498 }
00499 else {
00500 c = (((c - 0x0000d800) << 10) | (c2 - 0x0000dc00)) + 0x00010000;
00501 toUTF8(dst, c, errors);
00502 }
00503 }
00504 else {
00505
00506 setError(errors);
00507 toUTF8(dst, s_replacement, NULL);
00508 }
00509 }
00510
00511 return dst;
00512 }
00513
00514 CString
00515 CUnicode::doUTF32ToUTF8(const UInt8* data, UInt32 n, bool* errors)
00516 {
00517
00518 CString dst;
00519 dst.reserve(n);
00520
00521
00522 bool byteSwapped = false;
00523 if (n >= 1) {
00524 switch (decode32(data, false)) {
00525 case 0x0000feff:
00526 data += 4;
00527 --n;
00528 break;
00529
00530 case 0x0000fffe:
00531 byteSwapped = true;
00532 data += 4;
00533 --n;
00534 break;
00535
00536 default:
00537 break;
00538 }
00539 }
00540
00541
00542 for (; n > 0; data += 4, --n) {
00543 UInt32 c = decode32(data, byteSwapped);
00544 if (c >= 0x00110000) {
00545 setError(errors);
00546 c = s_replacement;
00547 }
00548 toUTF8(dst, c, errors);
00549 }
00550
00551 return dst;
00552 }
00553
00554 UInt32
00555 CUnicode::fromUTF8(const UInt8*& data, UInt32& n)
00556 {
00557 assert(data != NULL);
00558 assert(n != 0);
00559
00560
00561
00562
00563 UInt32 size;
00564 if (data[0] < 0x80) {
00565
00566 size = 1;
00567 }
00568 else if (data[0] < 0xc0) {
00569
00570
00571 --n;
00572 ++data;
00573 return s_invalid;
00574 }
00575 else if (data[0] < 0xe0) {
00576
00577 size = 2;
00578 }
00579 else if (data[0] < 0xf0) {
00580
00581 size = 3;
00582 }
00583 else if (data[0] < 0xf8) {
00584
00585 size = 4;
00586 }
00587 else if (data[0] < 0xfc) {
00588
00589 size = 5;
00590 }
00591 else if (data[0] < 0xfe) {
00592
00593 size = 6;
00594 }
00595 else {
00596
00597 --n;
00598 ++data;
00599 return s_invalid;
00600 }
00601
00602
00603 if (size > n) {
00604 data += n;
00605 n = 0;
00606 return s_invalid;
00607 }
00608
00609
00610 UInt32 c;
00611 switch (size) {
00612 case 1:
00613 c = static_cast<UInt32>(data[0]);
00614 break;
00615
00616 case 2:
00617 c = ((static_cast<UInt32>(data[0]) & 0x1f) << 6) |
00618 ((static_cast<UInt32>(data[1]) & 0x3f) );
00619 break;
00620
00621 case 3:
00622 c = ((static_cast<UInt32>(data[0]) & 0x0f) << 12) |
00623 ((static_cast<UInt32>(data[1]) & 0x3f) << 6) |
00624 ((static_cast<UInt32>(data[2]) & 0x3f) );
00625 break;
00626
00627 case 4:
00628 c = ((static_cast<UInt32>(data[0]) & 0x07) << 18) |
00629 ((static_cast<UInt32>(data[1]) & 0x3f) << 12) |
00630 ((static_cast<UInt32>(data[1]) & 0x3f) << 6) |
00631 ((static_cast<UInt32>(data[1]) & 0x3f) );
00632 break;
00633
00634 case 5:
00635 c = ((static_cast<UInt32>(data[0]) & 0x03) << 24) |
00636 ((static_cast<UInt32>(data[1]) & 0x3f) << 18) |
00637 ((static_cast<UInt32>(data[1]) & 0x3f) << 12) |
00638 ((static_cast<UInt32>(data[1]) & 0x3f) << 6) |
00639 ((static_cast<UInt32>(data[1]) & 0x3f) );
00640 break;
00641
00642 case 6:
00643 c = ((static_cast<UInt32>(data[0]) & 0x01) << 30) |
00644 ((static_cast<UInt32>(data[1]) & 0x3f) << 24) |
00645 ((static_cast<UInt32>(data[1]) & 0x3f) << 18) |
00646 ((static_cast<UInt32>(data[1]) & 0x3f) << 12) |
00647 ((static_cast<UInt32>(data[1]) & 0x3f) << 6) |
00648 ((static_cast<UInt32>(data[1]) & 0x3f) );
00649 break;
00650
00651 default:
00652 assert(0 && "invalid size");
00653 return s_invalid;
00654 }
00655
00656
00657
00658 bool truncated = false;
00659 switch (size) {
00660 case 6:
00661 if ((data[5] & 0xc0) != 0x80) {
00662 truncated = true;
00663 size = 5;
00664 }
00665
00666
00667 case 5:
00668 if ((data[4] & 0xc0) != 0x80) {
00669 truncated = true;
00670 size = 4;
00671 }
00672
00673
00674 case 4:
00675 if ((data[3] & 0xc0) != 0x80) {
00676 truncated = true;
00677 size = 3;
00678 }
00679
00680
00681 case 3:
00682 if ((data[2] & 0xc0) != 0x80) {
00683 truncated = true;
00684 size = 2;
00685 }
00686
00687
00688 case 2:
00689 if ((data[1] & 0xc0) != 0x80) {
00690 truncated = true;
00691 size = 1;
00692 }
00693 }
00694
00695
00696 data += size;
00697 n -= size;
00698
00699
00700 if (truncated) {
00701 return s_invalid;
00702 }
00703
00704
00705 static UInt32 s_minChar[] = {
00706 0,
00707 0x00000000,
00708 0x00000080,
00709 0x00000800,
00710 0x00010000,
00711 0x00200000,
00712 0x04000000
00713 };
00714 if (c < s_minChar[size]) {
00715 return s_invalid;
00716 }
00717
00718
00719 if (c >= 0x0000d800 && c <= 0x0000dfff) {
00720 return s_invalid;
00721 }
00722 if (c >= 0x0000fffe && c <= 0x0000ffff) {
00723 return s_invalid;
00724 }
00725
00726 return c;
00727 }
00728
00729 void
00730 CUnicode::toUTF8(CString& dst, UInt32 c, bool* errors)
00731 {
00732 UInt8 data[6];
00733
00734
00735 if ((c >= 0x0000d800 && c <= 0x0000dfff) || c >= 0x80000000) {
00736 setError(errors);
00737 c = s_replacement;
00738 }
00739
00740
00741 if (c < 0x00000080) {
00742 data[0] = static_cast<UInt8>(c);
00743 dst.append(reinterpret_cast<char*>(data), 1);
00744 }
00745 else if (c < 0x00000800) {
00746 data[0] = static_cast<UInt8>(((c >> 6) & 0x0000001f) + 0xc0);
00747 data[1] = static_cast<UInt8>((c & 0x0000003f) + 0x80);
00748 dst.append(reinterpret_cast<char*>(data), 2);
00749 }
00750 else if (c < 0x00010000) {
00751 data[0] = static_cast<UInt8>(((c >> 12) & 0x0000000f) + 0xe0);
00752 data[1] = static_cast<UInt8>(((c >> 6) & 0x0000003f) + 0x80);
00753 data[2] = static_cast<UInt8>((c & 0x0000003f) + 0x80);
00754 dst.append(reinterpret_cast<char*>(data), 3);
00755 }
00756 else if (c < 0x00200000) {
00757 data[0] = static_cast<UInt8>(((c >> 18) & 0x00000007) + 0xf0);
00758 data[1] = static_cast<UInt8>(((c >> 12) & 0x0000003f) + 0x80);
00759 data[2] = static_cast<UInt8>(((c >> 6) & 0x0000003f) + 0x80);
00760 data[3] = static_cast<UInt8>((c & 0x0000003f) + 0x80);
00761 dst.append(reinterpret_cast<char*>(data), 4);
00762 }
00763 else if (c < 0x04000000) {
00764 data[0] = static_cast<UInt8>(((c >> 24) & 0x00000003) + 0xf8);
00765 data[1] = static_cast<UInt8>(((c >> 18) & 0x0000003f) + 0x80);
00766 data[2] = static_cast<UInt8>(((c >> 12) & 0x0000003f) + 0x80);
00767 data[3] = static_cast<UInt8>(((c >> 6) & 0x0000003f) + 0x80);
00768 data[4] = static_cast<UInt8>((c & 0x0000003f) + 0x80);
00769 dst.append(reinterpret_cast<char*>(data), 5);
00770 }
00771 else if (c < 0x80000000) {
00772 data[0] = static_cast<UInt8>(((c >> 30) & 0x00000001) + 0xfc);
00773 data[1] = static_cast<UInt8>(((c >> 24) & 0x0000003f) + 0x80);
00774 data[2] = static_cast<UInt8>(((c >> 18) & 0x0000003f) + 0x80);
00775 data[3] = static_cast<UInt8>(((c >> 12) & 0x0000003f) + 0x80);
00776 data[4] = static_cast<UInt8>(((c >> 6) & 0x0000003f) + 0x80);
00777 data[5] = static_cast<UInt8>((c & 0x0000003f) + 0x80);
00778 dst.append(reinterpret_cast<char*>(data), 6);
00779 }
00780 else {
00781 assert(0 && "character out of range");
00782 }
00783 }