md2html.awk 8.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427
  1. #!/usr/bin/mawk -f
  2. #
  3. # by: Jesus Galan (yiyus) 2009
  4. #
  5. # Usage: md2html.awk file.md > file.html
  6. # See: http://4l77.com/src/md2html.awk
  # eschtml(t): return t with "&" and "<" replaced by the HTML entities
  # "&amp;" and "&lt;".
  #
  # The ampersand pass runs first so that the "&" of the "&lt;" produced by
  # the second pass is not escaped again.
  function eschtml(t) {
      # In a gsub() replacement a bare "&" means "the matched text"; "\\&"
      # yields a literal ampersand, so "\\&amp;" inserts the text "&amp;".
      gsub("&", "\\&amp;", t);
      gsub("<", "\\&lt;", t);
      return t;
  }
  # oprint(t): emit one line of output.
  #
  # When the global counter nr is non-zero the line is appended to the
  # global buffer otext (preceded by a newline) instead of being printed;
  # when nr is zero the line goes straight to standard output.
  function oprint(t) {
      if (nr != 0) {
          otext = otext "\n" t;
          return;
      }
      print t;
  }
  # subref(id): resolve buffered forward references to link definition id.
  #
  # Placeholders are written into the output as "<<" id immediately followed
  # by the closing double quote of the src/href attribute.  Each placeholder
  # for this id found in the global buffer otext is replaced by ref[id] and
  # the global pending counter nr is decremented.  When nr reaches zero the
  # buffer is printed and emptied.
  #
  # The search is a literal one (index/substr) rather than sub():
  #   - the id is not interpreted as a regular expression;
  #   - "&" inside ref[id] (for instance an escaped "&amp;" in a query
  #     string) is not expanded to the matched text;
  #   - the closing quote is part of the match, so id "a" does not hit the
  #     placeholder of id "ab".
  #
  # mark, mlen, pos, head and tail are locals.
  function subref(id,    mark, mlen, pos, head, tail) {
      mark = "<<" id "\"";
      mlen = length(mark);
      head = "";
      tail = otext;
      # Scan forward only, so replacement text is never rescanned.
      while (nr > 0 && (pos = index(tail, mark)) > 0) {
          head = head substr(tail, 1, pos - 1) ref[id] "\"";
          tail = substr(tail, pos + mlen);
          nr--;
      }
      otext = head tail;
      if (nr == 0 && otext) {
          print otext;
          otext = "";
      }
  }
  # nextil(t): convert the inline Markdown in t to HTML and return it.
  #
  # The function finds the first special character in t, handles the
  # construct that starts there and recurses on the remainder.  Text with no
  # special character is returned unchanged.
  #
  # Shared (global) state, reset by the caller for every block of text:
  #   ilcode   - inside a `code` span
  #   ilcode2  - the current code span was opened with a double backtick
  #   ns, stag - stack of currently open emphasis markers
  # Other globals used: ref[] (link definitions) and nr (number of
  # references still waiting for their definition).
  #
  # Everything after t in the parameter list is a local scratch variable.
  # They must be local because the function is recursive: a nested call
  # (for example on the text of a link) would otherwise overwrite values
  # the outer call still needs.
  function nextil(t,    t1, tag, t2, url, linktext, alt, title, id, r, ntag, n, hasid) {
      if (!match(t, /[`<&\[*_\\-]|(\!\[)/))
          return t;
      t1 = substr(t, 1, RSTART - 1);      # text before the special token
      tag = substr(t, RSTART, RLENGTH);   # the token itself
      t2 = substr(t, RSTART + RLENGTH);   # everything after it

      # Inside a code span only a backtick is special; the rest is escaped.
      if (ilcode && tag != "`")
          return eschtml(t1 tag) nextil(t2);

      # Backslash escaping
      if (tag == "\\") {
          if (match(t2, /^[\\`*_{}\[\]()#+\-\.!]/)) {
              tag = substr(t2, 1, 1);
              t2 = substr(t2, 2);
          }
          return t1 tag nextil(t2);
      }

      # Dashes: "--" becomes an em dash, a single "-" is kept
      if (tag == "-") {
          if (sub(/^-/, "", t2))
              tag = "&#8212;";
          return t1 tag nextil(t2);
      }

      # Inline Code
      if (tag == "`") {
          if (sub(/^`/, "", t2)) {
              # A double backtick with no matching pair is emitted as a quote.
              if (!match(t2, /``/))
                  return t1 "&#8221;" nextil(t2);
              ilcode2 = !ilcode2;
          }
          else if (ilcode2)
              # Single backtick inside a double-backtick span is literal.
              return t1 tag nextil(t2);
          tag = "<code>";
          if (ilcode) {
              t1 = eschtml(t1);
              tag = "</code>";
          }
          ilcode = !ilcode;
          return t1 tag nextil(t2);
      }

      if (tag == "<") {
          # Autolinks
          if (match(t2, /^[^ ]+[\.@][^ ]+>/)) {
              url = eschtml(substr(t2, 1, RLENGTH - 1));
              t2 = substr(t2, RLENGTH + 1);
              linktext = url;
              if (match(url, /@/) && !match(url, /^mailto:/))
                  url = "mailto:" url;
              return t1 "<a href=\"" url "\">" linktext "</a>" nextil(t2);
          }
          # Html tags are passed through untouched
          if (match(t2, /^[A-Za-z\/!][^>]*>/)) {
              tag = tag substr(t2, RSTART, RLENGTH);
              t2 = substr(t2, RLENGTH + 1);
              return t1 tag nextil(t2);
          }
          return t1 "&lt;" nextil(t2);
      }

      # Html special entities are kept, a lone "&" is escaped
      if (tag == "&") {
          if (match(t2, /^#?[A-Za-z0-9]+;/)) {
              tag = tag substr(t2, RSTART, RLENGTH);
              t2 = substr(t2, RLENGTH + 1);
              return t1 tag nextil(t2);
          }
          return t1 "&amp;" nextil(t2);
      }

      # Images
      if (tag == "![") {
          if (!match(t2, /(\[.*\])|(\(.*\))/))
              return t1 tag nextil(t2);
          match(t2, /^[^\]]*/);
          alt = substr(t2, 1, RLENGTH);
          t2 = substr(t2, RLENGTH + 2);
          if (match(t2, /^\(/)) {
              # Inline
              sub(/^\(/, "", t2);
              match(t2, /^[^\)]+/);
              url = eschtml(substr(t2, 1, RLENGTH));
              t2 = substr(t2, RLENGTH + 2);
              title = "";
              if (match(url, /[ ]+\".*\"[ ]*$/)) {
                  title = substr(url, RSTART, RLENGTH);
                  url = substr(url, 1, RSTART - 1);
                  match(title, /\".*\"/);
                  title = " title=\"" substr(title, RSTART + 1, RLENGTH - 2) "\"";
              }
              if (match(url, /^<.*>$/))
                  url = substr(url, 2, RLENGTH - 2);
              return t1 "<img src=\"" url "\" alt=\"" alt "\"" title " />" nextil(t2);
          }
          # Referenced
          hasid = sub(/^ ?\[/, "", t2);
          id = alt;
          if (match(t2, /^[^\]]+/)) {
              id = substr(t2, 1, RLENGTH);
              t2 = substr(t2, RLENGTH + 2);
          }
          else if (hasid)
              # Implicit name "[]": the id stays the alt text; skip the "]".
              t2 = substr(t2, 2);
          if (ref[id])
              r = ref[id];
          else {
              # Not defined yet: leave a placeholder and count it as pending.
              r = "<<" id;
              nr++;
          }
          return t1 "<img src=\"" r "\" alt=\"" alt "\" />" nextil(t2);
      }

      # Links
      if (tag == "[") {
          if (!match(t2, /(\[.*\])|(\(.*\))/))
              return t1 tag nextil(t2);
          match(t2, /^[^\]]*(\[[^\]]*\][^\]]*)*/);
          linktext = substr(t2, 1, RLENGTH);
          t2 = substr(t2, RLENGTH + 2);
          if (match(t2, /^\(/)) {
              # Inline
              match(t2, /^[^\)]+(\([^\)]+\)[^\)]*)*/);
              url = substr(t2, 2, RLENGTH - 1);
              t2 = substr(t2, RLENGTH + 2);
              title = "";
              if (match(url, /[ ]+\".*\"[ ]*$/)) {
                  title = substr(url, RSTART, RLENGTH);
                  url = substr(url, 1, RSTART - 1);
                  match(title, /\".*\"/);
                  title = " title=\"" substr(title, RSTART + 1, RLENGTH - 2) "\"";
              }
              if (match(url, /^<.*>$/))
                  url = substr(url, 2, RLENGTH - 2);
              url = eschtml(url);
              return t1 "<a href=\"" url "\"" title ">" nextil(linktext) "</a>" nextil(t2);
          }
          # Referenced
          hasid = sub(/^ ?\[/, "", t2);
          id = linktext;
          if (match(t2, /^[^\]]+/)) {
              id = substr(t2, 1, RLENGTH);
              t2 = substr(t2, RLENGTH + 2);
          }
          else if (hasid)
              # Implicit name "[]": the id stays the link text; skip the "]".
              t2 = substr(t2, 2);
          if (ref[id])
              r = ref[id];
          else {
              # Not defined yet: leave a placeholder and count it as pending.
              r = "<<" id;
              nr++;
          }
          return t1 "<a href=\"" r "\">" nextil(linktext) "</a>" nextil(t2);
      }

      # Emphasis
      if (tag == "*" || tag == "_") {
          ntag = tag;
          # A doubled marker means strong emphasis, unless a single marker of
          # the same kind is open and a third one follows: then the first
          # closes the open emphasis and the second is pushed back.
          # Plain string comparison is used because "^*" is not a portable
          # regular expression.
          if (substr(t2, 1, 1) == tag) {
              t2 = substr(t2, 2);
              if (stag[ns] == tag && substr(t2, 1, 1) == tag)
                  t2 = tag t2;
              else
                  ntag = tag tag;
          }
          n = length(ntag);
          tag = (n == 2) ? "strong" : "em";
          # A marker with a space on both sides is literal text.
          if (match(t1, / $/) && match(t2, /^ /))
              return t1 ntag nextil(t2);
          if (stag[ns] == ntag) {
              tag = "/" tag;
              ns--;
          }
          else
              stag[++ns] = ntag;
          tag = "<" tag ">";
          return t1 tag nextil(t2);
      }
  }
  # inline(t): convert one block of text from inline Markdown to HTML.
  #
  # Clears the emphasis stack depth and both code-span flags, then hands the
  # text to nextil() and returns its result.
  function inline(t) {
      ns = 0;         # no emphasis markers open
      ilcode2 = 0;    # not inside a double-backtick span
      ilcode = 0;     # not inside a code span
      return nextil(t);
  }
  # printp(tag): flush the accumulated global text as one output element.
  #
  # Text made only of spaces (or empty) produces no output.  Otherwise the
  # text is run through inline() and emitted with oprint(), wrapped in
  # <tag>...</tag> unless tag is the empty string.  The global text is
  # always empty on return.
  function printp(tag) {
      if (match(text, /^[ ]*$/)) {
          text = "";
          return;
      }
      text = inline(text);
      oprint((tag == "") ? text : ("<" tag ">" text "</" tag ">"));
      text = "";
  }
  # Initial state of the converter.
  BEGIN {
      # Output buffering: pending forward references and the held-back text.
      nr = 0;
      otext = "";

      # Text being accumulated and the tag it will be wrapped in.
      text = "";
      par = "p";

      # Block state: nesting depth and per-line flags.
      nl = 0;
      blank = 0;
      code = 0;
      html = 0;
      hr = 0;
  }
  # References
  # A line of the form
  #     [id]: url "optional title"
  # defines a link target.  The definition is stored in ref[id] as the url,
  # followed when a title is present by the text that closes the attribute
  # and opens a title attribute.  subref() is then called so that
  # references already buffered for this id get resolved.
  !code && /^ *\[[^\]]*\]:[ ]+/ {
      sub(/^ *\[/, "");
      match($0, /\]/);
      id = substr($0, 1, RSTART - 1);
      # Drop the id and the "]: " separator by position.  The id is never
      # used as a regular expression, so ids containing characters such as
      # ".", "+" or "(" are handled like any other.
      $0 = substr($0, RSTART + 1);
      sub(/^:[ ]+/, "");
      title = "";
      if (match($0, /\".*\"$/))
          title = "\" title=\"" substr($0, RSTART + 1, RLENGTH - 2);
      sub(/[ ]+\".*\"$/, "");
      url = eschtml($0);
      ref[id] = url title;
      subref(id);
      next;
  }
  # html
  # Block-level HTML is copied to the output untouched.  The global flag
  # html stays set from the opening line until a line that closes a block
  # element (or ends a comment); every line in between is passed through.
  # Each regular expression is written on a single line: splitting a regex
  # literal with backslash-newline is not portable across awk
  # implementations.

  # Opening line of an HTML block.
  !html && /^<(address|blockquote|center|dir|div|dl|fieldset|form|h[1-6r]|isindex|menu|noframes|noscript|ol|p|pre|table|ul|!--)/ {
      if (code) {
          oprint("</pre></code>");
          # Mark the code block as closed so that the closing tags are not
          # emitted a second time by a later rule.
          code = 0;
      }
      for (; !text && block[nl] == "blockquote"; nl--)
          oprint("</blockquote>");
      match($0, /^<(address|blockquote|center|dir|div|dl|fieldset|form|h[1-6r]|isindex|menu|noframes|noscript|ol|p|pre|table|ul|!--)/);
      htag = substr($0, 2, RLENGTH - 1);
      # The block stays open unless this same line also closes it.
      if (!match($0, "(<\\/" htag ">)|((^<hr ?\\/?)|(--)>$)"))
          html = 1;
      # An unfinished <hr ...> tag ends at the next line ending in ">".
      if (html && match($0, /^<hr/))
          hr = 1;
      oprint($0);
      next;
  }
  # Closing line of an HTML block.
  html && (/(^<\/(address|blockquote|center|dir|div|dl|fieldset|form|h[1-6r]|isindex|menu|noframes|noscript|ol|p|pre|table|ul).*)|(--)>$/ ||
  (hr && />$/)) {
      html = 0;
      hr = 0;
      oprint($0);
      next;
  }
  # Any other line inside an HTML block.
  html {
      oprint($0);
      next;
  }
  # List and quote blocks
  # Remove indentation
  # For every block that is currently open, strip the prefix that keeps the
  # line inside it: one indentation level (four spaces or one tab) for a
  # list, a ">" marker for a blockquote.  nnl ends up as the number of open
  # blocks this line still belongs to.
  {
      for (nnl = 0; nnl < nl; nnl++)
          if ((match(block[nnl + 1], /[ou]l/) && !sub(/^(    |\t)/, "")) ||
              (block[nnl + 1] == "blockquote" && !sub(/^> ?/, "")))
              break;
  }
  # A line that merely continues the paragraph in progress (no blank line
  # before it, no list marker on it) stays at the current nesting level.
  nnl < nl && !blank && text && ! /^ ? ? ?([*+-]|([0-9]+\.)+)( +|\t)/ { nnl = nl; }
  # Quote blocks
  # Every remaining "> " prefix opens one more blockquote level.
  {
      while (sub(/^> /, ""))
          nblock[++nnl] = "blockquote";
  }
  # Horizontal rules
  # The hr flag is cleared for every line, then set again when the line is
  # made of three or more "-", "*" or "_" characters (optionally separated
  # by spaces) and either follows a blank line or arrives while neither a
  # paragraph nor a code block is in progress.
  { hr = 0; }
  (blank || !(text || code)) && /^ ? ? ?([-*_][ ]*)([-*_][ ]*)([-*_][ ]*)+$/ {
      hr = 1;
      nnl = 0;        # a rule is emitted outside of every open block
      blank = 0;
      if (code) {
          code = 0;
          oprint("</pre></code>");
      }
  }
  # List items
  # A blank line inside a list is only remembered: whether it separates two
  # paragraphs or two items is decided by the next line.
  block[nl] ~ /[ou]l/ && /^$/ {
      blank = 1;
      next;
  }
  { newli = 0; }
  # Bullet marker: "*", "+" or "-" followed by spaces or a tab.
  !hr && (nnl != nl || !text || block[nl] ~ /[ou]l/) && /^ ? ? ?[*+-]( +|\t)/ {
      sub(/^ ? ? ?[*+-]( +|\t)/, "");
      nnl++;
      nblock[nnl] = "ul";
      newli = 1;
  }
  # Numbered marker: digits and a period followed by spaces or a tab.
  (nnl != nl || !text || block[nl] ~ /[ou]l/) && /^ ? ? ?([0-9]+\.)+( +|\t)/ {
      sub(/^ ? ? ?([0-9]+\.)+( +|\t)/, "");
      nnl++;
      nblock[nnl] = "ol";
      newli = 1;
  }
  # A new item: flush the previous one and, when it is a sibling in the same
  # list, close it and open the next.
  newli {
      # Items separated by a blank line have their text wrapped in <p>.
      if (blank && nnl == nl && !par)
          par = "p";
      blank = 0;
      printp(par);
      if (nnl == nl && block[nl] == nblock[nl])
          oprint("</li><li>");
  }
  # First non-empty line after a blank one: the pending text is finished.
  blank && ! /^$/ {
      if (match(block[nnl], /[ou]l/) && !par)
          par = "p";
      printp(par);
      par = "p";
      blank = 0;
  }
  # Close old blocks and open new ones
  # nl is the current nesting depth and block[] the kind of block open at
  # each level; nnl and nblock[] describe what the current line asks for.

  # The nesting changes: finish pending code and text, then pick the
  # wrapper for the text that follows (none directly inside a list item).
  nnl != nl || nblock[nl] != block[nl] {
      if (code) {
          code = 0;
          oprint("</pre></code>");
      }
      printp(par);
      if (nnl > nl)
          b = nblock[nnl];
      else
          b = block[nnl];
      if (b ~ /[ou]l/)
          par = "";
      else
          par = "p";
  }
  # Close blocks.
  # NOTE(review): pblock is not assigned anywhere in the visible source, so
  # pblock[nl] compares as the empty string and the loop also closes the
  # block at level nnl itself whenever one is open there.
  nnl < nl || (nnl == nl && nblock[nl] != block[nl]) {
      while (nl > nnl || (nnl == nl && pblock[nl] != block[nl])) {
          if (block[nl] ~ /[ou]l/)
              oprint("</li>");
          oprint("</" block[nl] ">");
          nl--;
      }
  }
  # Open blocks up to the requested depth.
  nnl > nl {
      while (nl < nnl) {
          block[nl + 1] = nblock[nl + 1];
          oprint("<" block[nl + 1] ">");
          if (block[nl + 1] ~ /[ou]l/)
              oprint("<li>");
          nl++;
      }
  }
  # A horizontal rule line produces nothing but the rule itself.
  hr {
      oprint("<hr>");
      next;
  }
  # Code blocks
  # A line indented by four spaces or one tab, arriving while no paragraph
  # text is pending, belongs to a code block.
  #
  # Blank lines inside a code block are counted in codeblank and written
  # out only when another code line follows, so blank lines between code
  # lines are preserved and trailing ones are dropped.  A dedicated counter
  # is used because the blank flag is cleared by an earlier rule before the
  # next code line gets here.
  code && /^$/ {
      codeblank++;
      blank = 1;
      next;
  }
  !text && sub(/^(    |\t)/, "") {
      blank = 0;
      if (!code) {
          # New block: forget blank lines counted for a block that was
          # closed elsewhere.
          codeblank = 0;
          oprint("<code><pre>");
      }
      for (; codeblank > 0; codeblank--)
          oprint("");
      code = 1;
      $0 = eschtml($0);
      oprint($0);
      next;
  }
  # Any other line ends the code block.
  code {
      oprint("</pre></code>");
      code = 0;
      codeblank = 0;
  }
  # Setex-style Headers
  # A line of "=" or "-" under pending text turns that text into a header.
  text && /^=+$/ {printp("h1"); next;}
  text && /^-+$/ {printp("h2"); next;}
  # Atx-Style headers
  # One to six leading "#" give the header level.  An optional run of
  # closing "#" is removed whatever its length; it does not have to match
  # the number of opening ones.
  /^#+/ && (!newli || par=="p" || /^##/) {
      for (n = 0; n < 6 && sub(/^# */, ""); n++)
          ;
      sub(/[ ]*#+[ ]*$/, "");
      par = "h" n;
  }
  # Paragraph
  # An empty line ends the text accumulated so far.
  /^$/ {
      printp(par);
      par = "p";
      next;
  }
  # Add text
  # Lines of the same paragraph are joined with a single space.
  { text = (text ? text " " : "") $0; }
  # End of input: close whatever is still open and flush the buffer.
  END {
      if (code) {
          code = 0;
          oprint("</pre></code>");
      }
      printp(par);
      # Unwind every block that is still open, innermost first.
      while (nl > 0) {
          if (block[nl] ~ /[ou]l/)
              oprint("</li>");
          oprint("</" block[nl] ">");
          nl--;
      }
      # Placeholders of references that were never defined are removed.
      gsub(/<<[^\"]*/, "", otext);
      print(otext);
  }