// utf8.js
  1. var ucs2 = require('./ucs2');
  /**
   * Public API of the module: converts between JavaScript strings and UTF-8
   * "byte strings" (strings in which each character holds one byte value).
   */
  exports = {
    /**
     * Encodes a string as UTF-8.
     *
     * @param {string} str - Text to encode. It is handed to `ucs2.decode`, and
     *   every value of the returned list is passed to `encodeCodePoint`.
     * @returns {string} The encoded pieces concatenated in order.
     */
    encode: function(str) {
      var points = ucs2.decode(str);
      var total = points.length;
      var encoded = '';
      var idx = 0;
      while (idx < total) {
        encoded += encodeCodePoint(points[idx]);
        idx += 1;
      }
      return encoded;
    },

    /**
     * Decodes a UTF-8 byte string back into text.
     *
     * @param {string} str - Byte string to decode.
     * @param {boolean} [safe] - Forwarded to `decodeCodePoint`; when truthy,
     *   malformed input is handled through `goBack()` instead of throwing.
     * @returns {string} Result of `ucs2.encode` applied to the decoded values.
     */
    decode: function(str, safe) {
      // Reset the module-level decoder state before every run.
      byteArr = ucs2.decode(str);
      byteIdx = 0;
      byteCount = byteArr.length;
      bytesNeeded = 0;
      bytesSeen = 0;
      codePoint = 0;
      upperBoundary = 0xbf;
      lowerBoundary = 0x80;

      var decoded = [];
      for (;;) {
        // `false` is the end-of-input sentinel; 0 is a valid code point.
        var next = decodeCodePoint(safe);
        if (next === false) {
          break;
        }
        decoded.push(next);
      }
      return ucs2.encode(decoded);
    }
  };
  var fromCharCode = String.fromCharCode;

  /**
   * Encodes a single code point as a UTF-8 byte sequence.
   *
   * @param {number} codePoint - Value in the range 0..0x1fffff.
   * @returns {string} One to four characters, each holding one byte value.
   * @throws {RangeError} If the value does not fit in a four-byte sequence
   *   (negative or above 0x1fffff). Without this check `count` and `offset`
   *   stay undefined and the function would silently return "\u0000".
   */
  function encodeCodePoint(codePoint) {
    // 7-bit values are emitted unchanged as a single byte.
    if ((codePoint & 0xffffff80) === 0) {
      return fromCharCode(codePoint);
    }

    var count; // number of continuation bytes that follow the lead byte
    var offset; // marker bits of the lead byte
    if ((codePoint & 0xfffff800) === 0) {
      count = 1;
      offset = 0xc0;
    } else if ((codePoint & 0xffff0000) === 0) {
      count = 2;
      offset = 0xe0;
    } else if ((codePoint & 0xffe00000) === 0) {
      count = 3;
      offset = 0xf0;
    } else {
      throw new RangeError('Invalid code point: ' + codePoint);
    }

    // Lead byte: marker bits plus the most significant payload bits.
    var ret = fromCharCode((codePoint >> (6 * count)) + offset);
    // Continuation bytes: 0b10xxxxxx, six payload bits each, high to low.
    while (count > 0) {
      var tmp = codePoint >> (6 * (count - 1));
      ret += fromCharCode(0x80 | (tmp & 0x3f));
      count--;
    }
    return ret;
  }
  // Decoder state shared by decode(), decodeCodePoint() and goBack().
  var byteArr; // values being decoded; decode() fills it from ucs2.decode(str)
  var byteIdx; // index of the next element of byteArr to read
  var byteCount; // byteArr.length
  var codePoint; // code point accumulated for the current sequence
  var bytesSeen; // continuation bytes consumed for the current sequence
  var bytesNeeded; // continuation bytes expected; 0 means "between sequences"
  var lowerBoundary; // smallest value accepted for the next continuation byte
  var upperBoundary; // largest value accepted for the next continuation byte

  /**
   * Reads the next code point from the shared decoder state.
   *
   * Multi-byte lead bytes are matched by exact range (0xc2-0xdf, 0xe0-0xef,
   * 0xf0-0xf4). 0xc0/0xc1 could only begin an overlong two-byte form and
   * 0xf5-0xf7 could only produce values above 0x10ffff, so both are treated as
   * invalid, in line with the boundary checks applied to continuation bytes.
   *
   * @param {boolean} [safe] - When truthy, malformed input does not throw;
   *   the result of `goBack()` is returned instead.
   * @returns {number|false} The next value, or `false` once the input is
   *   exhausted.
   * @throws {Error} When `safe` is falsy and the input is truncated, starts a
   *   sequence with an invalid byte, or contains an invalid continuation byte.
   */
  function decodeCodePoint(safe) {
    while (true) {
      // Input ended in the middle of a multi-byte sequence.
      if (byteIdx >= byteCount && bytesNeeded) {
        if (safe) {
          return goBack();
        }
        throw new Error('Invalid byte index');
      }
      if (byteIdx === byteCount) {
        return false;
      }

      var current = byteArr[byteIdx];
      byteIdx++;

      if (!bytesNeeded) {
        // Values with bit 7 clear are returned unchanged.
        if ((current & 0x80) === 0) {
          return current;
        }
        if (current >= 0xc2 && current <= 0xdf) {
          bytesNeeded = 1;
          codePoint = current & 0x1f;
        } else if (current >= 0xe0 && current <= 0xef) {
          // 0xe0 must be followed by >= 0xa0 (no overlong three-byte forms);
          // 0xed must be followed by <= 0x9f (no surrogates).
          if (current === 0xe0) {
            lowerBoundary = 0xa0;
          }
          if (current === 0xed) {
            upperBoundary = 0x9f;
          }
          bytesNeeded = 2;
          codePoint = current & 0xf;
        } else if (current >= 0xf0 && current <= 0xf4) {
          // 0xf0 must be followed by >= 0x90 (no overlong four-byte forms);
          // 0xf4 must be followed by <= 0x8f (nothing above 0x10ffff).
          if (current === 0xf0) {
            lowerBoundary = 0x90;
          }
          if (current === 0xf4) {
            upperBoundary = 0x8f;
          }
          bytesNeeded = 3;
          codePoint = current & 0x7;
        } else {
          if (safe) {
            return goBack();
          }
          throw new Error('Invalid UTF-8 detected');
        }
        continue;
      }

      if (current < lowerBoundary || current > upperBoundary) {
        if (safe) {
          // Un-read the offending byte so goBack() computes the lead position.
          byteIdx--;
          return goBack();
        }
        throw new Error('Invalid continuation byte');
      }

      // Only the first continuation byte has a restricted range.
      lowerBoundary = 0x80;
      upperBoundary = 0xbf;
      codePoint = (codePoint << 6) | (current & 0x3f);
      bytesSeen++;
      if (bytesSeen !== bytesNeeded) {
        continue;
      }

      // Sequence complete: hand out the value and reset for the next one.
      var tmp = codePoint;
      codePoint = 0;
      bytesNeeded = 0;
      bytesSeen = 0;
      return tmp;
    }
  }
  /**
   * Abandons the sequence currently being decoded.
   *
   * Moves the read position to just after the byte that started the sequence
   * (computed from `byteIdx` and `bytesSeen`), restores the idle decoder
   * state, and returns that starting byte unchanged.
   *
   * @returns {number} The element of `byteArr` at the start of the sequence.
   */
  function goBack() {
    var leadIdx = byteIdx - bytesSeen - 1;

    // Resume reading right after the lead byte.
    byteIdx = leadIdx + 1;

    // Back to the "between sequences" state.
    bytesSeen = 0;
    bytesNeeded = 0;
    codePoint = 0;
    upperBoundary = 0xbf;
    lowerBoundary = 0x80;

    return byteArr[leadIdx];
  }
  123. module.exports = exports;