serialize.cc 8.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287
  1. // Copyright 2015 Kevin B. Hendricks, Stratford, Ontario, All Rights Reserved.
  2. // loosely based on a greatly simplified version of BeautifulSoup4 decode() routine
  3. //
  4. // Licensed under the Apache License, Version 2.0 (the "License");
  5. // you may not use this file except in compliance with the License.
  6. // You may obtain a copy of the License at
  7. //
  8. // http://www.apache.org/licenses/LICENSE-2.0
  9. //
  10. // Unless required by applicable law or agreed to in writing, software
  11. // distributed under the License is distributed on an "AS IS" BASIS,
  12. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. // See the License for the specific language governing permissions and
  14. // limitations under the License.
  15. //
  16. // Author: Kevin Hendricks
  17. //
  18. // Serialize back to html / xhtml making as few changes as possible (even in whitespace)
  19. #include <fstream>
  20. #include <iostream>
  21. #include <stdlib.h>
  22. #include <string>
  23. #include "gumbo.h"
  24. static std::string nonbreaking_inline = "|a|abbr|acronym|b|bdo|big|cite|code|dfn|em|font|i|img|kbd|nobr|s|small|span|strike|strong|sub|sup|tt|";
  25. static std::string empty_tags = "|area|base|basefont|bgsound|br|command|col|embed|event-source|frame|hr|image|img|input|keygen|link|menuitem|meta|param|source|spacer|track|wbr|";
  26. static std::string preserve_whitespace = "|pre|textarea|script|style|";
  27. static std::string special_handling = "|html|body|";
  28. static std::string no_entity_sub = "|script|style|";
  29. static inline void rtrim(std::string &s)
  30. {
  31. s.erase(s.find_last_not_of(" \n\r\t")+1);
  32. }
  33. static inline void ltrim(std::string &s)
  34. {
  35. s.erase(0,s.find_first_not_of(" \n\r\t"));
  36. }
  37. static void replace_all(std::string &s, const char * s1, const char * s2)
  38. {
  39. std::string t1(s1);
  40. size_t len = t1.length();
  41. size_t pos = s.find(t1);
  42. while (pos != std::string::npos) {
  43. s.replace(pos, len, s2);
  44. pos = s.find(t1, pos + len);
  45. }
  46. }
  47. static std::string substitute_xml_entities_into_text(const std::string &text)
  48. {
  49. std::string result = text;
  50. // replacing & must come first
  51. replace_all(result, "&", "&amp;");
  52. replace_all(result, "<", "&lt;");
  53. replace_all(result, ">", "&gt;");
  54. return result;
  55. }
  56. static std::string substitute_xml_entities_into_attributes(char quote, const std::string &text)
  57. {
  58. std::string result = substitute_xml_entities_into_text(text);
  59. if (quote == '"') {
  60. replace_all(result,"\"","&quot;");
  61. }
  62. else if (quote == '\'') {
  63. replace_all(result,"'","&apos;");
  64. }
  65. return result;
  66. }
  67. static std::string handle_unknown_tag(GumboStringPiece *text)
  68. {
  69. std::string tagname = "";
  70. if (text->data == NULL) {
  71. return tagname;
  72. }
  73. // work with copy GumboStringPiece to prevent asserts
  74. // if try to read same unknown tag name more than once
  75. GumboStringPiece gsp = *text;
  76. gumbo_tag_from_original_text(&gsp);
  77. tagname = std::string(gsp.data, gsp.length);
  78. return tagname;
  79. }
  80. static std::string get_tag_name(GumboNode *node)
  81. {
  82. std::string tagname;
  83. // work around lack of proper name for document node
  84. if (node->type == GUMBO_NODE_DOCUMENT) {
  85. tagname = "document";
  86. } else {
  87. tagname = gumbo_normalized_tagname(node->v.element.tag);
  88. }
  89. if (tagname.empty()) {
  90. tagname = handle_unknown_tag(&node->v.element.original_tag);
  91. }
  92. return tagname;
  93. }
  94. static std::string build_doctype(GumboNode *node)
  95. {
  96. std::string results = "";
  97. if (node->v.document.has_doctype) {
  98. results.append("<!DOCTYPE ");
  99. results.append(node->v.document.name);
  100. std::string pi(node->v.document.public_identifier);
  101. if ((node->v.document.public_identifier != NULL) && !pi.empty() ) {
  102. results.append(" PUBLIC \"");
  103. results.append(node->v.document.public_identifier);
  104. results.append("\" \"");
  105. results.append(node->v.document.system_identifier);
  106. results.append("\"");
  107. }
  108. results.append(">\n");
  109. }
  110. return results;
  111. }
  112. static std::string build_attributes(GumboAttribute * at, bool no_entities)
  113. {
  114. std::string atts = " ";
  115. atts.append(at->name);
  116. // how do we want to handle attributes with empty values
  117. // <input type="checkbox" checked /> or <input type="checkbox" checked="" />
  118. if ( (!std::string(at->value).empty()) ||
  119. (at->original_value.data[0] == '"') ||
  120. (at->original_value.data[0] == '\'') ) {
  121. // determine original quote character used if it exists
  122. char quote = at->original_value.data[0];
  123. std::string qs = "";
  124. if (quote == '\'') qs = std::string("'");
  125. if (quote == '"') qs = std::string("\"");
  126. atts.append("=");
  127. atts.append(qs);
  128. if (no_entities) {
  129. atts.append(at->value);
  130. } else {
  131. atts.append(substitute_xml_entities_into_attributes(quote, std::string(at->value)));
  132. }
  133. atts.append(qs);
  134. }
  135. return atts;
  136. }
  137. // forward declaration
  138. static std::string serialize(GumboNode*);
  139. // serialize children of a node
  140. // may be invoked recursively
  141. static std::string serialize_contents(GumboNode* node) {
  142. std::string contents = "";
  143. std::string tagname = get_tag_name(node);
  144. std::string key = "|" + tagname + "|";
  145. bool no_entity_substitution = no_entity_sub.find(key) != std::string::npos;
  146. bool keep_whitespace = preserve_whitespace.find(key) != std::string::npos;
  147. bool is_inline = nonbreaking_inline.find(key) != std::string::npos;
  148. // build up result for each child, recursively if need be
  149. GumboVector* children = &node->v.element.children;
  150. for (unsigned int i = 0; i < children->length; ++i) {
  151. GumboNode* child = static_cast<GumboNode*> (children->data[i]);
  152. if (child->type == GUMBO_NODE_TEXT) {
  153. if (no_entity_substitution) {
  154. contents.append(std::string(child->v.text.text));
  155. } else {
  156. contents.append(substitute_xml_entities_into_text(std::string(child->v.text.text)));
  157. }
  158. } else if (child->type == GUMBO_NODE_ELEMENT || child->type == GUMBO_NODE_TEMPLATE) {
  159. contents.append(serialize(child));
  160. } else if (child->type == GUMBO_NODE_WHITESPACE) {
  161. // keep all whitespace to keep as close to original as possible
  162. contents.append(std::string(child->v.text.text));
  163. } else if (child->type != GUMBO_NODE_COMMENT) {
  164. // Does this actually exist: (child->type == GUMBO_NODE_CDATA)
  165. fprintf(stderr, "unknown element of type: %d\n", child->type);
  166. }
  167. }
  168. return contents;
  169. }
  170. // serialize a GumboNode back to html/xhtml
  171. // may be invoked recursively
  172. static std::string serialize(GumboNode* node) {
  173. // special case the document node
  174. if (node->type == GUMBO_NODE_DOCUMENT) {
  175. std::string results = build_doctype(node);
  176. results.append(serialize_contents(node));
  177. return results;
  178. }
  179. std::string close = "";
  180. std::string closeTag = "";
  181. std::string atts = "";
  182. std::string tagname = get_tag_name(node);
  183. std::string key = "|" + tagname + "|";
  184. bool need_special_handling = special_handling.find(key) != std::string::npos;
  185. bool is_empty_tag = empty_tags.find(key) != std::string::npos;
  186. bool no_entity_substitution = no_entity_sub.find(key) != std::string::npos;
  187. bool is_inline = nonbreaking_inline.find(key) != std::string::npos;
  188. // build attr string
  189. const GumboVector * attribs = &node->v.element.attributes;
  190. for (int i=0; i< attribs->length; ++i) {
  191. GumboAttribute* at = static_cast<GumboAttribute*>(attribs->data[i]);
  192. atts.append(build_attributes(at, no_entity_substitution));
  193. }
  194. // determine closing tag type
  195. if (is_empty_tag) {
  196. close = "/";
  197. } else {
  198. closeTag = "</" + tagname + ">";
  199. }
  200. // serialize your contents
  201. std::string contents = serialize_contents(node);
  202. if (need_special_handling) {
  203. ltrim(contents);
  204. rtrim(contents);
  205. contents.append("\n");
  206. }
  207. // build results
  208. std::string results;
  209. results.append("<"+tagname+atts+close+">");
  210. if (need_special_handling) results.append("\n");
  211. results.append(contents);
  212. results.append(closeTag);
  213. if (need_special_handling) results.append("\n");
  214. return results;
  215. }
  216. int main(int argc, char** argv) {
  217. if (argc != 2) {
  218. std::cout << "serialize <html filename>\n";
  219. exit(EXIT_FAILURE);
  220. }
  221. const char* filename = argv[1];
  222. std::ifstream in(filename, std::ios::in | std::ios::binary);
  223. if (!in) {
  224. std::cout << "File " << filename << " not found!\n";
  225. exit(EXIT_FAILURE);
  226. }
  227. std::string contents;
  228. in.seekg(0, std::ios::end);
  229. contents.resize(in.tellg());
  230. in.seekg(0, std::ios::beg);
  231. in.read(&contents[0], contents.size());
  232. in.close();
  233. GumboOptions options = kGumboDefaultOptions;
  234. GumboOutput* output = gumbo_parse_with_options(&options, contents.data(), contents.length());
  235. std::cout << serialize(output->document) << std::endl;
  236. gumbo_destroy_output(&kGumboDefaultOptions, output);
  237. }