parseHtml.js 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105
  1. var last = require('./last');
  2. var arrToMap = require('./arrToMap');
  3. var startWith = require('./startWith');
  4. var lowerCase = require('./lowerCase');
  /**
   * Tokenize an HTML string and report each piece of markup to the callbacks
   * found on `handler`. Every callback is optional.
   *
   * Callbacks invoked by this function:
   *   - handler.start(tagName, attrs, unary)  for each start tag; `tagName` is
   *     lowercased, `attrs` maps attribute names to string values, `unary` is
   *     true when the tag was written self-closing (`<br/>`).
   *   - handler.end(tagName)                  for each element that is closed,
   *     including elements closed implicitly by an ancestor's end tag or by
   *     reaching the end of the input.
   *   - handler.text(text)                    for character data, for the raw
   *     content of script/style elements, and for a doctype declaration
   *     (which is passed through verbatim as text).
   *   - handler.comment(text)                 for the content of `<!-- -->`.
   *
   * @param {string} html Markup to parse.
   * @param {object} handler Object carrying the optional callbacks above.
   * @throws {Error} 'Parse Error: <remaining input>' when an iteration cannot
   *   consume any input (for example a stray `<` that starts no valid tag, or
   *   an unterminated comment).
   */
  exports = function(html, handler) {
    // Names (lowercased) of the elements that are currently open.
    var stack = [];
    // Input as it was at the end of the previous iteration; used to detect a
    // loop iteration that made no progress.
    var lastHtml = html;
    var isText;
    var openTag;
    var match;
    var idx;
    var chunk;

    while (html) {
      isText = true;
      openTag = last(stack);

      if (!openTag || !SPECIAL[openTag]) {
        if (startWith(html, '<!--')) {
          idx = html.indexOf('-->');
          if (idx >= 0) {
            if (handler.comment) handler.comment(html.substring(4, idx));
            html = html.substring(idx + 3);
            isText = false;
          }
        } else if (startWith(html, '<!')) {
          match = html.match(regDoctype);
          if (match) {
            // There is no dedicated doctype callback: the declaration is
            // forwarded unchanged through the text callback.
            if (handler.text) handler.text(match[0]);
            html = html.substring(match[0].length);
            isText = false;
          }
        } else if (startWith(html, '</')) {
          match = html.match(regEndTag);
          if (match) {
            html = html.substring(match[0].length);
            parseEndTag(match[0], match[1]);
            isText = false;
          }
        } else if (startWith(html, '<')) {
          match = html.match(regStartTag);
          if (match) {
            html = html.substring(match[0].length);
            parseStartTag(match[0], match[1], match[2], match[3]);
            isText = false;
          }
        }

        if (isText) {
          // Everything up to the next '<' (or the end of input) is text. When
          // the input starts with a '<' that matched nothing above, no input
          // is consumed here and the progress guard below throws.
          idx = html.indexOf('<');
          chunk = idx < 0 ? html : html.substring(0, idx);
          html = idx < 0 ? '' : html.substring(idx);
          if (handler.text) handler.text(chunk);
        }
      } else {
        // Inside a raw-text element (script/style): its content is not parsed
        // as markup; everything up to its own closing tag is one text chunk.
        // The 'i' flag is required because names on the stack are lowercased
        // while the source may spell the closing tag as e.g. </SCRIPT>.
        match = new RegExp('</' + openTag + '[^>]*>', 'i').exec(html);
        if (match) {
          chunk = html.substring(0, match.index);
          html = html.substring(match.index + match[0].length);
        } else {
          // No closing tag anywhere: the rest of the input belongs to this
          // element. Consuming it keeps the progress guard from rejecting an
          // unterminated <script>/<style>.
          chunk = html;
          html = '';
        }
        if (chunk && handler.text) handler.text(chunk);
        parseEndTag('', openTag);
      }

      if (lastHtml === html) {
        throw new Error('Parse Error: ' + html);
      }
      lastHtml = html;
    }

    // Close whatever is still open once the input is exhausted.
    parseEndTag();

    /**
     * Record a start tag and report it through handler.start.
     *
     * @param {string} tag Full matched tag text (unused).
     * @param {string} tagName Tag name as written in the source.
     * @param {string} rest Raw attribute text following the tag name.
     * @param {string} unary '/' when the tag is self-closing, otherwise ''.
     */
    function parseStartTag(tag, tagName, rest, unary) {
      var name = lowerCase(tagName);
      var isUnary = !!unary;
      var attrs;
      var attr;

      // Self-closing tags never become open elements.
      if (!isUnary) stack.push(name);

      if (handler.start) {
        attrs = {};
        // regAttr is a shared global-flag regex: start scanning from 0.
        regAttr.lastIndex = 0;
        while ((attr = regAttr.exec(rest))) {
          // Groups 2/3/4 hold the double-quoted, single-quoted and unquoted
          // value respectively; a bare attribute gets ''.
          attrs[attr[1]] = attr[2] || attr[3] || attr[4] || '';
        }
        handler.start(name, attrs, isUnary);
      }
    }

    /**
     * Close the innermost open element named `tagName` together with every
     * element opened after it, reporting each through handler.end. Without a
     * tag name, every open element is closed. An end tag that matches no open
     * element is ignored.
     *
     * @param {string} [tag] Full matched tag text (unused).
     * @param {string} [tagName] Name of the element to close.
     */
    function parseEndTag(tag, tagName) {
      // Guarded so the no-argument call does not depend on how lowerCase
      // treats undefined.
      var name = tagName ? lowerCase(tagName) : '';
      var pos = name ? stack.lastIndexOf(name) : 0;
      var i;

      if (pos >= 0) {
        for (i = stack.length - 1; i >= pos; i--) {
          if (handler.end) handler.end(stack[i]);
        }
        stack.length = pos;
      }
    }
  };
  // Elements whose content is taken as raw text rather than parsed as markup.
  var SPECIAL = arrToMap(['script', 'style']);

  // A leading `<!doctype ...>` declaration (case-insensitive), optionally
  // followed by attribute-like tokens and an optional trailing slash.
  var regDoctype = /^<!\s*doctype((?:\s+[\w:]+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)>/i;

  // A leading start tag. Groups: 1 = tag name, 2 = raw attribute text,
  // 3 = '/' when the tag is self-closing.
  var regStartTag = /^<([-A-Za-z0-9_]+)((?:\s+[-A-Za-z0-9_:@.]+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)>/i;

  // A leading end tag. Group 1 = tag name; anything else before '>' is skipped.
  var regEndTag = /^<\/([-A-Za-z0-9_]+)[^>]*>/;

  // One attribute at a time (global flag). Groups: 1 = name,
  // 2 = double-quoted value, 3 = single-quoted value, 4 = unquoted value.
  // Quoted values may contain backslash-escaped characters.
  var regAttr = /([-A-Za-z0-9_:@.]+)(?:\s*=\s*(?:(?:"((?:\\.|[^"])*)")|(?:'((?:\\.|[^'])*)')|([^>\s]+)))?/g;

  module.exports = exports;