htmlparser.js 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504
/**
 * Referred Sources
 * http://ckeditor.com/ htmlparser.js
 * http://ejohn.org/blog/pure-javascript-html-parser/ by John Resig (ejohn.org)
 * http://erik.eae.net/simplehtmlparser/simplehtmlparser.js by Erik Arvidsson
 */
  7. (function() {
  /**
   * <o:p>MSO</o:p>
   * <table><tr><td></td>Text<td></td></tr></table>
   * <embed></embed>, <embed>
   * area, param
   */
  /**
   * Copies every property that `for...in` enumerates on `org` (inherited
   * enumerable properties included) onto `dest`.
   *
   * @param {Object} dest Object that receives the properties; mutated in place.
   * @param {Object} org Object whose properties are copied.
   * @returns {Object} The same `dest` object.
   */
  function extend(dest, org) {
    var name;
    for (name in org) {
      dest[name] = org[name];
    }
    return dest;
  }
  /**
   * Builds a lookup table from a comma separated list of names.
   * Each name is registered twice: as written and in upper case.
   *
   * @param {string} str Comma separated names, e.g. "br,hr".
   * @returns {Object} Object whose keys are the names and whose values are `true`.
   */
  function makeMap(str) {
    var lookup = {};
    var names = str.split(",");
    var total = names.length;
    for (var idx = 0; idx < total; idx += 1) {
      var name = names[idx];
      lookup[name] = true;
      lookup[name.toUpperCase()] = true;
    }
    return lookup;
  }
  // Tokenizer used by HTMLParser. One match is either a closing tag, a comment
  // or an opening tag. Capture groups:
  //   1 : tag name of a closing tag
  //   2 : comment text
  //   3 : tag name of an opening tag
  //   4 : attribute text of an opening tag
  //   5 : "/" when it directly precedes the closing ">"
  // The regex is global (/g), so it carries `lastIndex` state between exec() calls.
  var htmlPartsRegex = /<(?:(?:\/([A-Za-z][-A-Za-z0-9_:]*)[^>]*>)|(?:!--([\S\s]*?)-->)|(?:([A-Za-z][-A-Za-z0-9_:]*)((?:\s+(?:\/(?!>)|[^>\s=])+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*?)\s*(\/?)>))/g;
  // Empty Elements - HTML 4.01
  var empty = makeMap("area,base,basefont,br,col,frame,hr,img,input,isindex,link,meta,param,embed");
  // Block Elements - HTML 4.01
  var block = makeMap("address,applet,blockquote,button,center,dd,del,dir,div,dl,dt,fieldset,form,frameset,hr,iframe,ins,isindex,li,map,menu,noframes,noscript,object,ol,p,pre,script,table,tbody,td,tfoot,th,thead,tr,ul");
  // Inline Elements - HTML 4.01
  var inline = makeMap("a,abbr,acronym,applet,b,basefont,bdo,big,br,button,cite,code,del,dfn,em,font,i,iframe,img,input,ins,kbd,label,map,object,q,s,samp,script,select,small,span,strike,strong,sub,sup,textarea,tt,u,var");
  // Elements that you can, intentionally, leave open
  // (and which close themselves).
  // "option" replaces the former "options" entry: the HTML element whose end
  // tag may be omitted is OPTION; there is no OPTIONS element.
  var closeSelf = makeMap("colgroup,dd,dt,li,option,p,td,tfoot,th,thead,tr");
  // Attributes that have their values filled in disabled="disabled"
  // var fillAttrs = makeMap("checked,compact,declare,defer,disabled,ismap,multiple,nohref,noresize,noshade,nowrap,readonly,selected");
  // Special Elements (can contain anything)
  var special = makeMap("script,style,textarea");
  // Set-like helpers over plain objects (keys are members, values are payloads).
  // Original author's note: "Would this be used elsewhere as well?"
  // All helpers iterate with for...in, so inherited enumerable keys take part.
  var Set = {
    /**
     * @param {Object} obj
     * @returns {boolean} true when for...in finds no key on `obj`.
     */
    isEmpty: function(obj) {
      var hasNone = true;
      for (var name in obj) {
        hasNone = false;
        break;
      }
      return hasNone;
    },
    /**
     * @returns {Object} New object holding the keys of `s1` that also exist
     *   in `s2` with a strictly equal value.
     */
    intersection: function(s1, s2) {
      var shared = {};
      for (var name in s1) {
        if (!(name in s2)) {
          continue;
        }
        if (s1[name] !== s2[name]) {
          continue;
        }
        shared[name] = s1[name];
      }
      return shared;
    },
    /**
     * @returns {Object} Copy of `s1` without any key enumerated on `s2`
     *   (values are not compared).
     */
    difference: function(s1, s2) {
      var remaining = extend({}, s1);
      for (var name in s2) {
        delete remaining[name];
      }
      return remaining;
    },
    /**
     * @returns {Object} New object with the entries of `s1` overlaid by the
     *   entries of `s2`.
     */
    union: function(s1, s2) {
      return extend(extend({}, s1), s2);
    },
    /**
     * @returns {boolean} true when every key enumerated on `s2` has a strictly
     *   equal value on `s1`.
     */
    isSubset: function(s1, s2) {
      var contained = true;
      for (var name in s2) {
        if (s2[name] !== s1[name]) {
          contained = false;
          break;
        }
      }
      return contained;
    }
  };
  /**
   * Tree of parsed HTML nodes. A new instance has no current node and a
   * depth / maximum depth of zero. The constructor is also published as
   * `HTMLTree` on the `this` of the enclosing function.
   */
  var HTMLTree = function() {
    this.current = null;
    this.maxDepth = 0;
    this.depth = 0;
  };
  this.HTMLTree = HTMLTree;
  /**
   * Factory: creates a tree and opens its "ROOT" element.
   *
   * @returns {HTMLTree} Tree whose current node is the freshly opened ROOT.
   */
  HTMLTree.create = function() {
    var created = new HTMLTree();
    // The trailing fifth argument is not declared by openTag; it is kept as is.
    created.openTag(1, "ROOT", "", false, false);
    return created;
  };
  /**
   * Parses the attribute portion of an opening tag into a name/value object.
   *
   * Names are lower-cased and "class" is stored under "className". Values may
   * be double quoted, single quoted or unquoted; an attribute without a value
   * (or with an empty one) is stored as "".
   *
   * @param {string} attrText Text following the tag name, e.g. ' id="a" checked'.
   * @returns {Object} Attribute map; empty when `attrText` is falsy.
   */
  HTMLTree.parseAttributes = function(attrText) {
    var parsed = {};
    if (!attrText) {
      return parsed;
    }
    // A fresh regex per call keeps the /g lastIndex state local to this call.
    var pattern = /\s*([\w\-:.]+)(?:(?:\s*=\s*(?:(?:"([^"]*)")|(?:'([^']*)')|([^\s>]+)))|(?=\s|$))/g;
    for (var hit = pattern.exec(attrText); hit; hit = pattern.exec(attrText)) {
      var key = hit[1].toLowerCase();
      if (key == "class") {
        key = "className"; // class is special. {class:1}, {}.class makes error.
      }
      // Groups 2/3/4 hold the double quoted, single quoted and bare value.
      parsed[key] = hit[2] || hit[3] || hit[4] || '';
    }
    return parsed;
  };
  /**
   * Creates a node, appends it to the current node and makes it the current
   * node. The first node ever opened becomes `this.root` and is marked
   * invalid so that it is never serialized by cleanHTML().
   *
   * @param {number} nodeType 1 for elements, 3 for text, 8 for comments.
   * @param {string} nodeData Tag name (elements) or text content (text/comment).
   * @param {string} [restText] Raw attribute text of an element.
   * @param {boolean} [unary] Whether an element has no closing tag; text and
   *   comment nodes are always stored as unary.
   */
  HTMLTree.prototype.openTag = function(nodeType, nodeData, restText, unary) {
    var tagName = nodeType == 1 ? nodeData.toUpperCase() : null;
    var data = {
      parent: this.current,
      nodeType: nodeType,
      tagName: tagName,
      nodeData: nodeData,
      restText: restText || "",
      children: [],
      inheritingFontStyle: {},
      fontStyle: {},
      // Comments always count as text; text nodes count unless they consist
      // only of CR/LF characters.
      hasText: nodeType == 8 || (nodeType == 3 && !/^(\r|\n)*$/.test(nodeData)),
      valid: true,
      unary: nodeType == 1 ? unary : true,
      hasKeyAttribute: false
    };
    // Handle attributes and font related css.
    if (nodeType == 1) {
      // Font properties defined by the ancestors.
      var inheritedFontStyle = data.parent ? data.parent.inheritingFontStyle : {};
      if (tagName == "TABLE" /* && quirks mode */) {
        inheritedFontStyle = {};
      }
      var attributes = HTMLTree.parseAttributes(restText);
      // Does the element carry an important attribute (id or class)?
      if (attributes.id || attributes.className) {
        // TODO : naming
        data.hasKeyAttribute = true;
      }
      // Font properties defined by this node itself.
      var currentFontStyle = FontCssProperty.create(tagName, attributes);
      data.fontStyle = currentFontStyle;
      // NOTE: an earlier check that invalidated presentational tags whose
      // style was already inherited is disabled; per the original comment it
      // misbehaved on <span style="font-size:12pt"><span style="font-size:12pt">Hello</span></span>.
      // Font properties handed down to descendants. They are merged into a
      // copy: merging into `inheritedFontStyle` itself would write this node's
      // style into the parent's object and leak it to later siblings.
      data.inheritingFontStyle = extend(extend({}, inheritedFontStyle), currentFontStyle);
    }
    if (this.current) {
      this.current.children.push(data);
    } else {
      this.root = data;
      data.valid = false;
    }
    this.depth += 1;
    this.maxDepth = Math.max(this.depth, this.maxDepth);
    this.current = data;
  };
  /**
   * Adds a node that has no closing tag: opens it as unary and closes it
   * right away, so the current node is the same before and after the call.
   *
   * @param {number} nodeType 1 for elements, 3 for text, 8 for comments.
   * @param {string} nodeData Tag name or text content.
   * @param {string} [restText] Raw attribute text of an element.
   */
  HTMLTree.prototype.unaryTag = function(nodeType, nodeData, restText) {
    var tree = this;
    tree.openTag(nodeType, nodeData, restText, true);
    tree.closeTag();
  };
  /**
   * Closes the current node: decrements the depth and moves the cursor back
   * to the parent of the current node.
   */
  HTMLTree.prototype.closeTag = function() {
    this.depth = this.depth - 1;
    var closing = this.current;
    this.current = closing.parent;
  };
  /**
   * Serializes the whole tree back to markup. The root node itself is not
   * written; every other node is written regardless of its `valid` flag.
   *
   * @returns {string} Markup of all descendants of the root.
   */
  HTMLTree.prototype.toString = function() {
    var top = this.root;
    var pieces = [];

    function emit(node) {
      var isTop = (top == node);
      if (!isTop) {
        if (node.nodeType == 1) {
          pieces.push("<", node.nodeData, node.restText, ">");
        } else if (node.nodeType == 3) {
          pieces.push(node.nodeData);
        } else if (node.nodeType == 8) {
          pieces.push("<!--", node.nodeData, "-->");
        }
      }
      for (var c = 0; c < node.children.length; c++) {
        emit(node.children[c]);
      }
      // Only non-unary nodes get a closing tag.
      if (!isTop && !node.unary) {
        pieces.push("</", node.nodeData, ">");
      }
    }

    emit(top);
    return pieces.join('');
  };
  /**
   * Serializes the tree while skipping the tags of nodes whose `valid` flag
   * is false; the children of such nodes are still written. removeUseless()
   * runs once, on the first call, guarded by `this.cleanedUp`.
   *
   * @returns {string} Cleaned markup.
   */
  HTMLTree.prototype.cleanHTML = function() {
    var pieces = [];

    function emit(node) {
      var keep = node.valid;
      if (keep) {
        if (node.nodeType == 1) {
          pieces.push("<", node.nodeData, node.restText, ">");
        } else if (node.nodeType == 3) {
          pieces.push(node.nodeData);
        } else if (node.nodeType == 8) {
          pieces.push("<!--", node.nodeData, "-->");
        }
      }
      for (var c = 0; c < node.children.length; c++) {
        emit(node.children[c]);
      }
      if (keep && !node.unary) {
        pieces.push("</", node.nodeData, ">");
      }
    }

    if (!this.cleanedUp) {
      this.removeUseless();
      this.cleanedUp = true;
    }
    emit(this.root);
    return pieces.join("");
  };
  /**
   * Walks the tree depth first and calls `callback` for every node after all
   * of its children have been visited (the root is reported last).
   *
   * @param {function(Object)} callback Receives each node object.
   */
  HTMLTree.prototype.postOrder = function(callback) {
    function descend(current) {
      var idx = 0;
      while (idx < current.children.length) {
        descend(current.children[idx]);
        idx++;
      }
      callback(current);
    }
    descend(this.root);
  };
  /**
   * Marks presentational elements that contribute nothing as invalid so that
   * cleanHTML() leaves their tags out.
   *
   * Nodes are visited in post-order. For every element:
   *  - the font style shared by all children is computed,
   *  - `hasText` is switched on as soon as a child has text,
   *  - the element's own font style minus the shared child style is its
   *    "effecting" style, and its `fontStyle` becomes the union of both,
   *  - an element without id/class that is a SPAN or listed in
   *    FontCssProperty.TAGS_FOR_PRESENTATION is invalidated when it has no
   *    effecting style or no text.
   * Text and comment nodes get an empty `fontStyle`.
   */
  HTMLTree.prototype.removeUseless = function() {
    this.postOrder(function(node) {
      switch (node.nodeType) {
        case 1:
          var tagName = node.tagName;
          var childrenCommonStyles = {};
          for (var i = 0; i < node.children.length; i++) {
            var child = node.children[i];
            if (i == 0) {
              childrenCommonStyles = extend({}, child.fontStyle);
            } else {
              childrenCommonStyles = Set.intersection(childrenCommonStyles, child.fontStyle);
            }
            if (!node.hasText) {
              node.hasText = child.hasText;
            }
          }
          // Style of this node that is not overridden by every child.
          var effectingStyle = Set.difference(node.fontStyle, childrenCommonStyles);
          node.fontStyle = Set.union(node.fontStyle, childrenCommonStyles);
          // TODO font related
          if (!node.hasKeyAttribute && (FontCssProperty.TAGS_FOR_PRESENTATION[tagName] || tagName == "SPAN")) {
            if (Set.isEmpty(effectingStyle) || !node.hasText) {
              node.valid = false;
            }
          }
          break;
        case 3:
        case 8:
          node.fontStyle = {};
          break;
      }
    });
  };
  /**
   * Parses an HTML string into an HTMLTree and returns the cleaned markup.
   *
   * The input is tokenized with htmlPartsRegex into closing tags, comments and
   * opening tags; the text between tokens becomes text nodes. A stack of open
   * elements is kept so that mismatched closing tags can be repaired; every
   * such repair, and every tag left open at the end, marks the input as not
   * well formed.
   *
   * @param {string} html Markup to parse; null/undefined is treated as "".
   * @returns {{wellFormed: boolean, maxDepth: number, cleanHTML: string}}
   *   wellFormed - false once an unmatched, mismatched or unclosed tag was seen;
   *   maxDepth - maximum depth recorded by the tree;
   *   cleanHTML - result of tree.cleanHTML().
   */
  this.HTMLParser = function(html) {
    var wellFormed = true,
      parts,
      tagName,
      nextIndex = 0,
      stack = [],
      tree = HTMLTree.create(),
      cdata; // Raw pieces collected while inside a special element.

    // Previously null/undefined crashed on html.substring().
    html = html == null ? "" : String(html);

    stack.empty = function() {
      return this.length === 0;
    };
    stack.last = function() {
      return this[this.length - 1];
    };

    // htmlPartsRegex is shared and global (/g). Start from index 0 so that a
    // previous parse aborted by an exception cannot leave a stale lastIndex.
    htmlPartsRegex.lastIndex = 0;

    while (( parts = htmlPartsRegex.exec(html) )) {
      // Visit the text between the previous token and this one.
      var tagIndex = parts.index;
      if (tagIndex > nextIndex) {
        var text = html.substring(nextIndex, tagIndex);
        if (cdata) {
          cdata.push(text);
        } else {
          onText(text);
        }
      }
      nextIndex = htmlPartsRegex.lastIndex;
      /*
       "parts" is an array with the following items:
       0 : The entire match for opening/closing tags and comments.
       1 : Group filled with the tag name for closing tags.
       2 : Group filled with the comment text.
       3 : Group filled with the tag name for opening tags.
       4 : Group filled with the attributes part of opening tags.
       5 : Group filled with "/" when it directly precedes ">".
       */
      // Closing tag
      if (( tagName = parts[ 1 ] )) {
        // A closing special tag ends CDATA mode and flushes the raw text.
        if (cdata && special[ tagName ]) {
          onCDATA(cdata.join(''));
          cdata = null;
        }
        if (!cdata) {
          onEndTag(tagName);
          continue;
        }
      }
      // If CDATA is enabled, just save the raw match.
      if (cdata) {
        cdata.push(parts[ 0 ]);
        continue;
      }
      // Opening tag
      if (( tagName = parts[ 3 ] )) {
        // There are some tag names that can break things, so let's
        // simply ignore them when parsing. (#5224)
        if (/="/.test(tagName)) {
          continue;
        }
        // NOTE(review): a "/" directly before ">" is captured in parts[5], so
        // this check only sees a slash absorbed into the attribute text
        // (e.g. <a b/>). Confirm whether parts[5] should also mark the tag unary.
        var unary = !!( parts[ 4 ] && parts[ 4 ].charAt(parts[ 4 ].length - 1) == '/' );
        onStartTag(tagName, parts[ 4 ], unary);
        // Open CDATA mode when finding the appropriate tags.
        if (!cdata && special[ tagName ]) {
          cdata = [];
        }
        continue;
      }
      // Comment
      if (( tagName = parts[ 2 ] )) {
        onComment(tagName);
      }
    }

    // Trailing text. When a special element was never closed, the raw pieces
    // collected so far are flushed together with the tail instead of being lost.
    var tail = html.substring(nextIndex);
    if (cdata) {
      cdata.push(tail);
      onCDATA(cdata.join(''));
      cdata = null;
    } else {
      onText(tail);
    }
    cleanUnclosedUp();
    return {
      wellFormed: wellFormed,
      maxDepth: tree.maxDepth,
      cleanHTML: tree.cleanHTML()
    };

    // Opens an element; a self-closing element of the same name that is still
    // on top of the stack is closed first.
    function onStartTag(tagName, rest, unary) {
      if (closeSelf[ tagName ] && !stack.empty() && stack.last().tagName == tagName) {
        onEndTag(tagName);
      }
      var repair = [];
      /** Code meant to resolve p > block nesting; disabled because it makes the tree excessively large.
       while (!stack.empty() && tagName.toLowerCase() == 'p') {
       var last = stack.last();
       if (inline[ last.tagName ]) {
       onEndTag(last.tagName);
       repair.push(last);
       } else if (last.tagName.toLowerCase() == "p") {
       onEndTag(last.tagName);
       break;
       } else {
       break;
       }
       }
       */
      unary = empty[ tagName ] || !!unary;
      if (!unary) {
        stack.push({
          tagName: tagName,
          rest: rest,
          unary: unary
        });
        tree.openTag(1, tagName, rest, unary);
        // `repair` stays empty while the block above is disabled.
        for (var i = repair.length - 1; i >= 0; i--) {
          onStartTag(repair[i].tagName, repair[i].rest, repair[i].unary);
        }
      } else {
        tree.unaryTag(1, tagName, rest);
      }
    }

    // Closes the nearest open element named `tagName` together with everything
    // opened after it, then re-opens the skipped elements that are not
    // self-closing. NOTE(review): names are compared case-sensitively.
    function onEndTag(tagName) {
      if (stack.empty()) {
        // Closing tag while nothing is open.
        wellFormed = false;
        return;
      }
      var repair = [],
        found = -1,
        i;
      for (i = stack.length - 1; i >= 0; i--) {
        var visit = stack[i];
        if (visit.tagName == tagName) {
          found = i;
          break;
        } else {
          wellFormed = false;
          // Self-closing elements are silently closed; others are re-opened below.
          if (!closeSelf[visit.tagName]) {
            repair.push(visit);
          }
        }
      }
      if (found == -1) {
        // The tag was never opened: ignore the closing tag.
        wellFormed = false;
        return;
      }
      for (i = stack.length - 1; i >= found; i--) {
        stack.pop();
        tree.closeTag();
      }
      // `repair` was filled top-down, so iterate backwards to restore nesting order.
      for (i = repair.length - 1; i >= 0; i--) {
        onStartTag(repair[i].tagName, repair[i].rest, repair[i].unary);
      }
    }

    function onText(text) {
      tree.unaryTag(3, text);
    }

    // Raw content of a special element is stored as a single text node.
    function onCDATA(cdata) {
      tree.unaryTag(3, cdata);
    }

    function onComment(comment) {
      tree.unaryTag(8, comment);
    }

    // Closes, in the tree, every element still open at the end of the input.
    function cleanUnclosedUp() {
      if (stack.length > 0) {
        wellFormed = false;
        for (var i = stack.length - 1; i >= 0; i--) {
          tree.closeTag();
        }
      }
    }
  };
  453. })();