// prettyprint.cc
  // Copyright 2015 Kevin B. Hendricks, Stratford, Ontario, All Rights Reserved.
  // loosely based on a greatly simplified version of BeautifulSoup4 decode() routine
  //
  // Licensed under the Apache License, Version 2.0 (the "License");
  // you may not use this file except in compliance with the License.
  // You may obtain a copy of the License at
  //
  // http://www.apache.org/licenses/LICENSE-2.0
  //
  // Unless required by applicable law or agreed to in writing, software
  // distributed under the License is distributed on an "AS IS" BASIS,
  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  // See the License for the specific language governing permissions and
  // limitations under the License.
  //
  // Author: Kevin Hendricks
  //
  // Prettyprint back to html / xhtml
  #include <cstdio>
  #include <fstream>
  #include <iostream>
  #include <stdlib.h>
  #include <string>
  #include "gumbo.h"
  // Tag-name lookup tables. Every name is wrapped in '|' so that membership is
  // tested with a substring search for "|" + tagname + "|". The tables are
  // only ever read, hence const.

  // Inline elements: printed without indentation or added line breaks, and an
  // inline child that is not the first piece of content has its leading
  // whitespace stripped.
  static const std::string nonbreaking_inline = "|a|abbr|acronym|b|bdo|big|cite|code|dfn|em|font|i|img|kbd|nobr|s|small|span|strike|strong|sub|sup|tt|";
  // Elements serialized as <tag ... /> with no closing tag.
  static const std::string empty_tags = "|area|base|basefont|bgsound|br|command|col|embed|event-source|frame|hr|image|img|input|keygen|link|menuitem|meta|param|source|spacer|track|wbr|";
  // Elements whose whitespace-only children are kept and whose text is neither
  // trimmed nor indented.
  static const std::string preserve_whitespace = "|pre|textarea|script|style|";
  // Elements whose serialized contents are right-trimmed and then terminated
  // with exactly one newline.
  static const std::string special_handling = "|html|body|";
  // Elements whose text and attribute values are emitted without entity escaping.
  static const std::string no_entity_sub = "|script|style|";
  // Block elements laid out on one line: no newline after the opening tag,
  // contents left-trimmed, closing tag not indented.
  static const std::string treat_like_inline = "|p|";
  // Remove trailing spaces, newlines, carriage returns and tabs from s in
  // place. A string consisting only of those characters becomes empty.
  static inline void rtrim(std::string &s)
  {
    const std::string::size_type last_kept = s.find_last_not_of(" \n\r\t");
    if (last_kept == std::string::npos) {
      s.clear();
    } else {
      s.resize(last_kept + 1);
    }
  }
  // Remove leading spaces, newlines, carriage returns and tabs from s in
  // place. A string consisting only of those characters becomes empty.
  static inline void ltrim(std::string &s)
  {
    const std::string::size_type first_kept = s.find_first_not_of(" \n\r\t");
    if (first_kept == std::string::npos) {
      s.clear();
      return;
    }
    s.erase(0, first_kept);
  }
  // Replace every occurrence of s1 in s with s2, in place.
  //
  // The scan resumes right after the text that was just inserted, so a
  // replacement that itself contains s1 (e.g. "a" -> "ba") is never rescanned.
  // A null or empty s1, or a null s2, leaves s untouched.
  static void replace_all(std::string &s, const char * s1, const char * s2)
  {
    if ((s1 == NULL) || (s2 == NULL)) {
      return;
    }
    const std::string needle(s1);
    if (needle.empty()) {
      // an empty pattern matches at every position and would never terminate
      return;
    }
    const std::string replacement(s2);
    std::string::size_type pos = s.find(needle);
    while (pos != std::string::npos) {
      s.replace(pos, needle.length(), replacement);
      // skip past the inserted text, not merely past the pattern length
      pos = s.find(needle, pos + replacement.length());
    }
  }
  // Return a copy of text with '&', '<' and '>' replaced by "&amp;", "&lt;"
  // and "&gt;". Each input character is looked at exactly once, so an '&'
  // produced by escaping is never escaped a second time.
  static std::string substitute_xml_entities_into_text(const std::string &text)
  {
    std::string escaped;
    escaped.reserve(text.size());
    for (std::string::size_type i = 0; i < text.size(); ++i) {
      const char ch = text[i];
      switch (ch) {
        case '&':
          escaped.append("&amp;");
          break;
        case '<':
          escaped.append("&lt;");
          break;
        case '>':
          escaped.append("&gt;");
          break;
        default:
          escaped.push_back(ch);
          break;
      }
    }
    return escaped;
  }
  57. static std::string substitute_xml_entities_into_attributes(char quote, const std::string &text)
  58. {
  59. std::string result = substitute_xml_entities_into_text(text);
  60. if (quote == '"') {
  61. replace_all(result,"\"","&quot;");
  62. }
  63. else if (quote == '\'') {
  64. replace_all(result,"'","&apos;");
  65. }
  66. return result;
  67. }
  68. static std::string handle_unknown_tag(GumboStringPiece *text)
  69. {
  70. std::string tagname = "";
  71. if (text->data == NULL) {
  72. return tagname;
  73. }
  74. // work with copy GumboStringPiece to prevent asserts
  75. // if try to read same unknown tag name more than once
  76. GumboStringPiece gsp = *text;
  77. gumbo_tag_from_original_text(&gsp);
  78. tagname = std::string(gsp.data, gsp.length);
  79. return tagname;
  80. }
  81. static std::string get_tag_name(GumboNode *node)
  82. {
  83. std::string tagname;
  84. // work around lack of proper name for document node
  85. if (node->type == GUMBO_NODE_DOCUMENT) {
  86. tagname = "document";
  87. } else {
  88. tagname = gumbo_normalized_tagname(node->v.element.tag);
  89. }
  90. if (tagname.empty()) {
  91. tagname = handle_unknown_tag(&node->v.element.original_tag);
  92. }
  93. return tagname;
  94. }
  95. static std::string build_doctype(GumboNode *node)
  96. {
  97. std::string results = "";
  98. if (node->v.document.has_doctype) {
  99. results.append("<!DOCTYPE ");
  100. results.append(node->v.document.name);
  101. std::string pi(node->v.document.public_identifier);
  102. if ((node->v.document.public_identifier != NULL) && !pi.empty() ) {
  103. results.append(" PUBLIC \"");
  104. results.append(node->v.document.public_identifier);
  105. results.append("\" \"");
  106. results.append(node->v.document.system_identifier);
  107. results.append("\"");
  108. }
  109. results.append(">\n");
  110. }
  111. return results;
  112. }
  113. static std::string build_attributes(GumboAttribute * at, bool no_entities)
  114. {
  115. std::string atts = "";
  116. atts.append(" ");
  117. atts.append(at->name);
  118. // how do we want to handle attributes with empty values
  119. // <input type="checkbox" checked /> or <input type="checkbox" checked="" />
  120. if ( (!std::string(at->value).empty()) ||
  121. (at->original_value.data[0] == '"') ||
  122. (at->original_value.data[0] == '\'') ) {
  123. // determine original quote character used if it exists
  124. char quote = at->original_value.data[0];
  125. std::string qs = "";
  126. if (quote == '\'') qs = std::string("'");
  127. if (quote == '"') qs = std::string("\"");
  128. atts.append("=");
  129. atts.append(qs);
  130. if (no_entities) {
  131. atts.append(at->value);
  132. } else {
  133. atts.append(substitute_xml_entities_into_attributes(quote, std::string(at->value)));
  134. }
  135. atts.append(qs);
  136. }
  137. return atts;
  138. }
  139. // forward declaration
  140. static std::string prettyprint(GumboNode*, int lvl, const std::string indent_chars);
  141. // prettyprint children of a node
  142. // may be invoked recursively
  143. static std::string prettyprint_contents(GumboNode* node, int lvl, const std::string indent_chars) {
  144. std::string contents = "";
  145. std::string tagname = get_tag_name(node);
  146. std::string key = "|" + tagname + "|";
  147. bool no_entity_substitution = no_entity_sub.find(key) != std::string::npos;
  148. bool keep_whitespace = preserve_whitespace.find(key) != std::string::npos;
  149. bool is_inline = nonbreaking_inline.find(key) != std::string::npos;
  150. bool pp_okay = !is_inline && !keep_whitespace;
  151. GumboVector* children = &node->v.element.children;
  152. for (unsigned int i = 0; i < children->length; ++i) {
  153. GumboNode* child = static_cast<GumboNode*> (children->data[i]);
  154. if (child->type == GUMBO_NODE_TEXT) {
  155. std::string val;
  156. if (no_entity_substitution) {
  157. val = std::string(child->v.text.text);
  158. } else {
  159. val = substitute_xml_entities_into_text(std::string(child->v.text.text));
  160. }
  161. if (pp_okay) rtrim(val);
  162. if (pp_okay && (contents.length() == 0)) {
  163. // add required indentation
  164. char c = indent_chars.at(0);
  165. int n = indent_chars.length();
  166. contents.append(std::string((lvl-1)*n,c));
  167. }
  168. contents.append(val);
  169. } else if ((child->type == GUMBO_NODE_ELEMENT) || (child->type == GUMBO_NODE_TEMPLATE)) {
  170. std::string val = prettyprint(child, lvl, indent_chars);
  171. // remove any indentation if this child is inline and not first child
  172. std::string childname = get_tag_name(child);
  173. std::string childkey = "|" + childname + "|";
  174. if ((nonbreaking_inline.find(childkey) != std::string::npos) && (contents.length() > 0)) {
  175. ltrim(val);
  176. }
  177. contents.append(val);
  178. } else if (child->type == GUMBO_NODE_WHITESPACE) {
  179. if (keep_whitespace || is_inline) {
  180. contents.append(std::string(child->v.text.text));
  181. }
  182. } else if (child->type != GUMBO_NODE_COMMENT) {
  183. // Does this actually exist: (child->type == GUMBO_NODE_CDATA)
  184. fprintf(stderr, "unknown element of type: %d\n", child->type);
  185. }
  186. }
  187. return contents;
  188. }
  189. // prettyprint a GumboNode back to html/xhtml
  190. // may be invoked recursively
  191. static std::string prettyprint(GumboNode* node, int lvl, const std::string indent_chars) {
  192. // special case the document node
  193. if (node->type == GUMBO_NODE_DOCUMENT) {
  194. std::string results = build_doctype(node);
  195. results.append(prettyprint_contents(node,lvl+1,indent_chars));
  196. return results;
  197. }
  198. std::string close = "";
  199. std::string closeTag = "";
  200. std::string atts = "";
  201. std::string tagname = get_tag_name(node);
  202. std::string key = "|" + tagname + "|";
  203. bool need_special_handling = special_handling.find(key) != std::string::npos;
  204. bool is_empty_tag = empty_tags.find(key) != std::string::npos;
  205. bool no_entity_substitution = no_entity_sub.find(key) != std::string::npos;
  206. bool keep_whitespace = preserve_whitespace.find(key) != std::string::npos;
  207. bool is_inline = nonbreaking_inline.find(key) != std::string::npos;
  208. bool inline_like = treat_like_inline.find(key) != std::string::npos;
  209. bool pp_okay = !is_inline && !keep_whitespace;
  210. char c = indent_chars.at(0);
  211. int n = indent_chars.length();
  212. // build attr string
  213. const GumboVector * attribs = &node->v.element.attributes;
  214. for (int i=0; i< attribs->length; ++i) {
  215. GumboAttribute* at = static_cast<GumboAttribute*>(attribs->data[i]);
  216. atts.append(build_attributes(at, no_entity_substitution));
  217. }
  218. // determine closing tag type
  219. if (is_empty_tag) {
  220. close = "/";
  221. } else {
  222. closeTag = "</" + tagname + ">";
  223. }
  224. std::string indent_space = std::string((lvl-1)*n,c);
  225. // prettyprint your contents
  226. std::string contents = prettyprint_contents(node, lvl+1, indent_chars);
  227. if (need_special_handling) {
  228. rtrim(contents);
  229. contents.append("\n");
  230. }
  231. char last_char = ' ';
  232. if (!contents.empty()) {
  233. last_char = contents.at(contents.length()-1);
  234. }
  235. // build results
  236. std::string results;
  237. if (pp_okay) {
  238. results.append(indent_space);
  239. }
  240. results.append("<"+tagname+atts+close+">");
  241. if (pp_okay && !inline_like) {
  242. results.append("\n");
  243. }
  244. if (inline_like) {
  245. ltrim(contents);
  246. }
  247. results.append(contents);
  248. if (pp_okay && !contents.empty() && (last_char != '\n') && (!inline_like)) {
  249. results.append("\n");
  250. }
  251. if (pp_okay && !inline_like && !closeTag.empty()) {
  252. results.append(indent_space);
  253. }
  254. results.append(closeTag);
  255. if (pp_okay && !closeTag.empty()) {
  256. results.append("\n");
  257. }
  258. return results;
  259. }
  260. int main(int argc, char** argv) {
  261. if (argc != 2) {
  262. std::cout << "prettyprint <html filename>\n";
  263. exit(EXIT_FAILURE);
  264. }
  265. const char* filename = argv[1];
  266. std::ifstream in(filename, std::ios::in | std::ios::binary);
  267. if (!in) {
  268. std::cout << "File " << filename << " not found!\n";
  269. exit(EXIT_FAILURE);
  270. }
  271. std::string contents;
  272. in.seekg(0, std::ios::end);
  273. contents.resize(in.tellg());
  274. in.seekg(0, std::ios::beg);
  275. in.read(&contents[0], contents.size());
  276. in.close();
  277. GumboOptions options = kGumboDefaultOptions;
  278. GumboOutput* output = gumbo_parse_with_options(&options, contents.data(), contents.length());
  279. std::string indent_chars = " ";
  280. std::cout << prettyprint(output->document, 0, indent_chars) << std::endl;
  281. gumbo_destroy_output(&kGumboDefaultOptions, output);
  282. }