123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427 |
- #!/usr/bin/mawk -f
- #
- # by: Jesus Galan (yiyus) 2009
- #
- # Usage: md2html.awk file.md > file.html
- # See: http://4l77.com/src/md2html.awk
- # eschtml(t): return a copy of t in which the HTML-significant characters
- # "&" and "<" are replaced by the entities "&amp;" and "&lt;".
- # "&" is handled first so that the ampersand of a freshly inserted "&lt;"
- # is not escaped a second time.
- function eschtml(t) {
-     # In a gsub() replacement a bare "&" stands for the matched text;
-     # "\\&" produces a literal ampersand.
-     gsub("&", "\\&amp;", t);
-     gsub("<", "\\&lt;", t);
-     return t;
- }
- # oprint(t): output one line of HTML.
- # While nr is non-zero (nr is incremented each time a "<<id" placeholder
- # is written for a reference that is not defined yet) the line is appended
- # to the buffer otext, separated by a newline; otherwise it is printed
- # immediately.
- function oprint(t){
-     if(nr != 0){
-         otext = otext "\n" t;
-         return;
-     }
-     print t;
- }
- # subref(id): called when the reference id has just been defined.
- # Every pending placeholder for it in the buffered output otext is
- # replaced by ref[id], and nr (the number of unresolved placeholders) is
- # decremented once per replacement.  When no placeholder is left the
- # buffer is printed and emptied.
- #
- # Placeholders are written as  <<id  immediately followed by the closing
- # double quote of the attribute, so the quote is made part of the search
- # string: "<<foo" must not match the placeholder of reference "foobar".
- # The replacement is done with index()/substr() rather than sub() so that
- # neither regex metacharacters in id nor "&" or "\" in the URL are
- # interpreted.
- # marker and pos are local scratch variables.
- function subref(id,    marker, pos){
-     marker = "<<" id "\"";
-     while(nr > 0 && (pos = index(otext, marker)) > 0){
-         otext = substr(otext, 1, pos - 1) ref[id] "\"" \
-             substr(otext, pos + length(marker));
-         nr--;
-     }
-     if(nr == 0 && otext) {
-         print otext;
-         otext = "";
-     }
- }
- # nextil(t): convert the inline Markdown in t to HTML and return it.
- #
- # The function looks for the first special character in t, copies the
- # text in front of it, translates the construct that starts there and
- # recurses on the remainder.  State that must survive the recursion lives
- # in globals:
- #   ilcode, ilcode2  inside a `code` span / inside a ``code`` span
- #   stag[1..ns]      stack of the emphasis markers that are still open
- #   ref[]            link references defined so far
- #   nr               number of "<<id" placeholders written for references
- #                    that are not defined yet
- # Everything after t in the parameter list is a local scratch variable,
- # so a nested call (for instance on the text of a link) cannot overwrite
- # the values the caller still needs.
- function nextil(t,    t1, tag, t2, url, linktext, title, alt, id, r, pt2, ntag, n) {
-     # No special character left: the rest is plain text.
-     if(!match(t, /[`<&\[*_\\-]|(\!\[)/))
-         return t;
-     t1 = substr(t, 1, RSTART - 1);
-     tag = substr(t, RSTART, RLENGTH);
-     t2 = substr(t, RSTART + RLENGTH);
-     # Inside a code span only a backquote is special; anything else is
-     # escaped and copied.
-     if(ilcode && tag != "`")
-         return eschtml(t1 tag) nextil(t2);
-     # Backslash escaping
-     if(tag == "\\"){
-         # A backslash in front of an escapable character yields that
-         # character; any other backslash is kept as it is.
-         if(match(t2, /^[\\`*_{}\[\]()#+\-\.!]/)){
-             tag = substr(t2, 1, 1);
-             t2 = substr(t2, 2);
-         }
-         return t1 tag nextil(t2);
-     }
-     # Dashes
-     if(tag == "-"){
-         # "--" becomes an em dash, a single "-" stays.
-         if(sub(/^-/, "", t2))
-             tag = "—";
-         return t1 tag nextil(t2);
-     }
-     # Inline Code
-     if(tag == "`"){
-         if(sub(/^`/, "", t2)){
-             # "``" with no second "``" later on the line is written as a
-             # closing double quotation mark; otherwise it opens or closes
-             # a double-backquote code span.
-             if(!match(t2, /``/))
-                 return t1 "”" nextil(t2);
-             ilcode2 = !ilcode2;
-         }
-         else if(ilcode2)
-             # a single backquote inside a ``...`` span is literal
-             return t1 tag nextil(t2);
-         tag = "<code>";
-         if(ilcode){
-             t1 = eschtml(t1);
-             tag = "</code>";
-         }
-         ilcode = !ilcode;
-         return t1 tag nextil(t2);
-     }
-     if(tag == "<"){
-         # Autolinks
-         if(match(t2, /^[^ ]+[\.@][^ ]+>/)){
-             url = eschtml(substr(t2, 1, RLENGTH - 1));
-             t2 = substr(t2, RLENGTH + 1);
-             linktext = url;
-             if(match(url, /@/) && !match(url, /^mailto:/))
-                 url = "mailto:" url;
-             return t1 "<a href=\"" url "\">" linktext "</a>" nextil(t2);
-         }
-         # Html tags
-         if(match(t2, /^[A-Za-z\/!][^>]*>/)){
-             tag = tag substr(t2, RSTART, RLENGTH);
-             t2 = substr(t2, RLENGTH + 1);
-             return t1 tag nextil(t2);
-         }
-         # A "<" that starts neither an autolink nor a tag is text and has
-         # to be written as an entity.
-         return t1 "&lt;" nextil(t2);
-     }
-     # Html special entities
-     if(tag == "&"){
-         # An existing entity such as &copy; or &#169; is passed through.
-         if(match(t2, /^#?[A-Za-z0-9]+;/)){
-             tag = tag substr(t2, RSTART, RLENGTH);
-             t2 = substr(t2, RLENGTH + 1);
-             return t1 tag nextil(t2);
-         }
-         # Any other ampersand is text and has to be written as an entity.
-         return t1 "&amp;" nextil(t2);
-     }
-     # Images
-     if(tag == "!["){
-         # Without a "[...]" or "(...)" part further on this is not an
-         # image; keep the characters as they are.
-         if(!match(t2, /(\[.*\])|(\(.*\))/))
-             return t1 tag nextil(t2);
-         match(t2, /^[^\]]*/);
-         alt = substr(t2, 1, RLENGTH);
-         # + 2 also skips the closing "]"
-         t2 = substr(t2, RLENGTH + 2);
-         if(match(t2, /^\(/)){
-             # Inline
-             sub(/^\(/, "", t2);
-             match(t2, /^[^\)]+/);
-             url = eschtml(substr(t2, 1, RLENGTH));
-             t2 = substr(t2, RLENGTH + 2);
-             title = "";
-             # optional  "title"  after the url
-             if(match(url, /[ ]+\".*\"[ ]*$/)) {
-                 title = substr(url, RSTART, RLENGTH);
-                 url = substr(url, 1, RSTART - 1);
-                 match(title, /\".*\"/);
-                 title = " title=\"" substr(title, RSTART + 1, RLENGTH - 2) "\"";
-             }
-             # url written as <url>: drop the angle brackets
-             if(match(url, /^<.*>$/))
-                 url = substr(url, 2, RLENGTH - 2);
-             return t1 "<img src=\"" url "\" alt=\"" alt "\"" title " />" nextil(t2);
-         }
-         else{
-             # Referenced
-             sub(/^ ?\[/, "", t2);
-             # the alt text doubles as the id when none is given
-             id = alt;
-             if(match(t2, /^[^\]]+/))
-                 id = substr(t2, 1, RLENGTH);
-             t2 = substr(t2, RLENGTH + 2);
-             if(ref[id])
-                 r = ref[id];
-             else{
-                 # not defined yet: leave a placeholder for subref()
-                 r = "<<" id;
-                 nr++;
-             }
-             return t1 "<img src=\"" r "\" alt=\"" alt "\" />" nextil(t2);
-         }
-     }
-     # Links
-     if(tag == "["){
-         if(!match(t2, /(\[.*\])|(\(.*\))/))
-             return t1 tag nextil(t2);
-         # the link text may itself contain balanced [...] pairs
-         match(t2, /^[^\]]*(\[[^\]]*\][^\]]*)*/);
-         linktext = substr(t2, 1, RLENGTH);
-         t2 = substr(t2, RLENGTH + 2);
-         if(match(t2, /^\(/)){
-             # Inline
-             # the url may contain balanced (...) pairs
-             match(t2, /^[^\)]+(\([^\)]+\)[^\)]*)*/);
-             url = substr(t2, 2, RLENGTH - 1);
-             pt2 = substr(t2, RLENGTH + 2);
-             title = "";
-             if(match(url, /[ ]+\".*\"[ ]*$/)) {
-                 title = substr(url, RSTART, RLENGTH);
-                 url = substr(url, 1, RSTART - 1);
-                 match(title, /\".*\"/);
-                 title = " title=\"" substr(title, RSTART + 1, RLENGTH - 2) "\"";
-             }
-             if(match(url, /^<.*>$/))
-                 url = substr(url, 2, RLENGTH - 2);
-             url = eschtml(url);
-             return t1 "<a href=\"" url "\"" title ">" nextil(linktext) "</a>" nextil(pt2);
-         }
-         else{
-             # Referenced
-             sub(/^ ?\[/, "", t2);
-             # the link text doubles as the id when none is given
-             id = linktext;
-             if(match(t2, /^[^\]]+/))
-                 id = substr(t2, 1, RLENGTH);
-             t2 = substr(t2, RLENGTH + 2);
-             if(ref[id])
-                 r = ref[id];
-             else{
-                 # not defined yet: leave a placeholder for subref()
-                 r = "<<" id;
-                 nr++;
-             }
-             pt2 = t2;
-             # The opening tag ends in a plain ">": <a> is not an empty
-             # element and is closed by the </a> after the link text.
-             return t1 "<a href=\"" r "\">" nextil(linktext) "</a>" nextil(pt2);
-         }
-     }
-     # Emphasis
-     if(match(tag, /[*_]/)){
-         ntag = tag;
-         if(sub("^" tag, "", t2)){
-             # Two markers in a row are one strong marker, unless a single
-             # marker is open and a third one follows: then the first one
-             # closes the open emphasis and the second is put back.
-             if(stag[ns] == tag && match(t2, "^" tag))
-                 t2 = tag t2;
-             else
-                 ntag = tag tag;
-         }
-         n = length(ntag);
-         # A marker with a space on both sides is a literal character, so
-         # the marker itself is written out, not an element name.
-         if(match(t1, / $/) && match(t2, /^ /))
-             return t1 ntag nextil(t2);
-         tag = (n == 2) ? "strong" : "em";
-         if(stag[ns] == ntag){
-             # same marker as the innermost open one: closing tag
-             tag = "/" tag;
-             ns--;
-         }
-         else
-             stag[++ns] = ntag;
-         tag = "<" tag ">";
-         return t1 tag nextil(t2);
-     }
- }
- # inline(t): convert the inline Markdown of one complete text block.
- # The code-span flags and the emphasis stack depth are reset first, so
- # nothing left open in an earlier block carries over into this one.
- function inline(t) {
-     ns = 0;
-     ilcode = ilcode2 = 0;
-     return nextil(t);
- }
- # printp(tag): flush the text collected in the global variable text.
- # Text that is empty or consists of spaces only is discarded.  Anything
- # else is passed through inline() and handed to oprint(), wrapped in
- # <tag>...</tag> unless tag is the empty string.  text is empty afterwards
- # in either case.
- function printp(tag) {
-     if(match(text, /^[ ]*$/)){
-         text = "";
-         return;
-     }
-     text = inline(text);
-     oprint((tag == "") ? text : "<" tag ">" text "</" tag ">");
-     text = "";
- }
- # Initial parser state.
- BEGIN {
-     # flags: blank line seen, inside code block, horizontal rule,
-     # inside raw HTML block
-     blank = code = hr = html = 0;
-     # depth of open list/quote blocks, unresolved reference placeholders
-     nl = nr = 0;
-     # buffered output, pending paragraph text
-     otext = text = "";
-     # element used to wrap the next paragraph
-     par = "p";
- }
- # References
- # A line of the form   [id]: url "optional title"   defines a link target.
- # It produces no output of its own: the target is stored in ref[id] and
- # subref() fills in the placeholders written for earlier uses of id.
- !code && /^ *\[[^\]]*\]:[ ]+/ {
-     sub(/^ *\[/, "");
-     match($0, /\]/);
-     id = substr($0, 1, RSTART - 1);
-     # Drop "id]" by position rather than with a regex built from id, so
-     # that ids containing regex metacharacters are handled as well.
-     $0 = substr($0, RSTART + 1);
-     sub(/^:[ ]+/, "");
-     title = "";
-     # The stored value is inserted between the quotes of an attribute
-     # (href="..." or src="..."), so a title is kept as
-     #   " title="text
-     # and the closing quote of the attribute ends up closing the title.
-     if(match($0, /\".*\"$/))
-         title = "\" title=\"" substr($0, RSTART + 1, RLENGTH - 2);
-     sub(/[ ]+\".*\"$/, "");
-     url = eschtml($0);
-     ref[id] = url title;
-     subref(id);
-     next;
- }
- # Raw HTML
- # Outside an HTML block, a line that starts with a block-level tag (or
- # with "<!--") is copied to the output unprocessed.  If the element is not
- # closed on the same line the html flag is set and the following lines
- # are copied as well until the block ends.
- !html && /^<(address|blockquote|center|dir|div|dl|fieldset|form|h[1-6r]|\
- isindex|menu|noframes|noscript|ol|p|pre|table|ul|!--)/ {
-     if(code){
-         oprint("</pre></code>");
-         # The code block is closed now.  Leaving the flag set would make
-         # a later line close it a second time or continue it without
-         # reopening it.
-         code = 0;
-     }
-     # with no paragraph text pending, leave any open blockquotes
-     for(; !text && block[nl] == "blockquote"; nl--)
-         oprint("</blockquote>");
-     match($0, /^<(address|blockquote|center|dir|div|dl|fieldset|form|h[1-6r]|\
- isindex|menu|noframes|noscript|ol|p|pre|table|ul|!--)/);
-     # name of the tag that opened the block
-     htag = substr($0, 2, RLENGTH - 1);
-     # Not finished on this line (no closing tag, not an <hr>, no "-->" at
-     # the end): switch to pass-through mode.
-     if(!match($0, "(<\\/" htag ">)|((^<hr ?\\/?)|(--)>$)"))
-         html = 1;
-     # an unfinished <hr ...> tag ends at the next line ending in ">"
-     if(html && match($0, /^<hr/))
-         hr = 1;
-     oprint($0);
-     next;
- }
- # Inside a multi-line HTML block (html set) the block ends with a line
- # that starts with the closing tag of a block-level element, with a line
- # that ends in "-->", or - while an <hr tag is still open (hr set) - with
- # a line that ends in ">".  The line itself is copied unprocessed.
- html && (/(^<\/(address|blockquote|center|dir|div|dl|fieldset|form|h[1-6r]|\
- isindex|menu|noframes|noscript|ol|p|pre|table|ul).*)|(--)>$/ ||
- (hr && />$/)) {
- html = 0;
- hr = 0;
- oprint($0);
- next;
- }
- # Every other line inside an HTML block is copied unprocessed.
- html {
- oprint($0);
- next;
- }
- # List and quote blocks
- # block[1..nl] holds the blocks ("ul", "ol" or "blockquote") that are open
- # before this line; nnl and nblock[] describe the nesting of this line.
- # Remove indentation
- # For every open level, outermost first, strip what a line needs in order
- # to stay inside that level: a match of /^( | )/ for a list, a ">" marker
- # for a blockquote.  The loop stops at the first level whose prefix is
- # missing, leaving in nnl the number of levels the line still belongs to.
- # NOTE(review): both branches of ( | ) are a single space; this looks like
- # a whitespace-collapsed "four spaces or tab" - confirm against upstream.
- {
- for(nnl = 0; nnl < nl; nnl++)
- if((match(block[nnl + 1], /[ou]l/) && !sub(/^( | )/, "")) || \
- (block[nnl + 1] == "blockquote" && !sub(/^> ?/, "")))
- break;
- }
- # A line that dropped out of some levels but directly continues pending
- # text (no blank line in between) and does not start a list item stays at
- # the current nesting depth.
- # NOTE(review): ( +| ) here and in the list item patterns below presumably
- # was "spaces or tab" before whitespace was collapsed - confirm.
- nnl < nl && !blank && text && ! /^ ? ? ?([*+-]|([0-9]+\.)+)( +| )/ { nnl = nl; }
- # Quote blocks
- # Each remaining "> " prefix adds one blockquote level for this line.
- {
- while(sub(/^> /, ""))
- nblock[++nnl] = "blockquote";
- }
- # Horizontal rules
- # hr is recomputed for every line.
- { hr = 0; }
- # Three or more "-", "*" or "_", optionally separated by spaces and
- # preceded by at most three spaces, form a rule when the line follows a
- # blank line or when neither paragraph text nor a code block is pending.
- # An open code block is closed and nnl is set to 0 so that the rules
- # further down close all open blocks.
- (blank || (!text && !code)) && /^ ? ? ?([-*_][ ]*)([-*_][ ]*)([-*_][ ]*)+$/ {
- if(code){
- oprint("</pre></code>");
- code = 0;
- }
- blank = 0;
- nnl = 0;
- hr = 1;
- }
- # List items
- # An empty line while the innermost open block is a list is only
- # remembered in blank; what it means is decided by the next line.
- block[nl] ~ /[ou]l/ && /^$/ {
- blank = 1;
- next;
- }
- # newli is set when this line starts a new list item.
- { newli = 0; }
- # Unordered item: "*", "+" or "-" followed by whitespace.  The marker is
- # removed and one "ul" level is added for this line.
- !hr && (nnl != nl || !text || block[nl] ~ /[ou]l/) && /^ ? ? ?[*+-]( +| )/ {
- sub(/^ ? ? ?[*+-]( +| )/, "");
- nnl++;
- nblock[nnl] = "ul";
- newli = 1;
- }
- # Ordered item: one or more "digits." groups followed by whitespace.  The
- # marker is removed and one "ol" level is added for this line.
- (nnl != nl || !text || block[nl] ~ /[ou]l/) && /^ ? ? ?([0-9]+\.)+( +| )/ {
- sub(/^ ? ? ?([0-9]+\.)+( +| )/, "");
- nnl++;
- nblock[nnl] = "ol";
- newli = 1;
- }
- # A new item first flushes the text of the previous one.  If a blank line
- # separated the two items at the same depth and par is empty, the text is
- # wrapped in <p>.  An item that continues the same list at the same depth
- # is separated from the previous one with "</li><li>".
- newli {
- if(blank && nnl == nl && !par)
- par = "p";
- blank = 0;
- printp(par);
- if(nnl == nl && block[nl] == nblock[nl])
- oprint("</li><li>");
- }
- # First non-empty line after a remembered blank line: the pending text is
- # flushed as its own paragraph (wrapped in <p> inside a list when par is
- # empty) and par is reset to "p".
- blank && ! /^$/ {
- if(match(block[nnl], /[ou]l/) && !par)
- par = "p";
- printp(par);
- par = "p";
- blank = 0;
- }
-
- # Close old blocks and open new ones
- # When the nesting depth or the type of the innermost block changes, an
- # open code block is closed and the pending text is flushed.  par is then
- # chosen for the block the line is in: empty inside a list (item text is
- # not wrapped), "p" otherwise.
- nnl != nl || nblock[nl] != block[nl] {
- if(code){
- oprint("</pre></code>");
- code = 0;
- }
- printp(par);
- b = (nnl > nl) ? nblock[nnl] : block[nnl];
- par = (match(b, /[ou]l/)) ? "" : "p";
- }
- # Close blocks, innermost first; a list also closes its last item.
- # NOTE(review): pblock is not assigned anywhere in this file, so
- # pblock[nl] is always empty and the second loop condition holds for every
- # non-empty block[nl] once nl equals nnl - possibly nblock was meant;
- # confirm before changing.
- nnl < nl || (nnl == nl && nblock[nl] != block[nl]) {
- for(; nl > nnl || (nnl == nl && pblock[nl] != block[nl]); nl--){
- if(match(block[nl], /[ou]l/))
- oprint("</li>");
- oprint("</" block[nl] ">");
- }
- }
- # Open the blocks this line is nested in but that are not open yet; a
- # list also opens its first item.
- nnl > nl {
- for(; nl < nnl; nl++){
- block[nl + 1] = nblock[nl + 1];
- oprint("<" block[nl + 1] ">");
- if(match(block[nl + 1], /[ou]l/))
- oprint("<li>");
- }
- }
- # A horizontal rule detected earlier is written after all blocks have
- # been closed; the line has no other content.
- hr {
- oprint("<hr>");
- next;
- }
- # Code blocks
- # An empty line inside a code block is not written right away.  It is
- # counted in codeblanks and written only if more code follows, so that
- # blank lines at the end of a block are dropped.  (The general blank flag
- # cannot be used for this: it is cleared by an earlier rule before the
- # next code line gets here.)
- code && /^$/ {
-     codeblanks++;
-     blank = 1;
-     next;
- }
- # With no paragraph text pending, a line indented by four spaces or one
- # tab is a line of code; the indentation is removed.
- !text && sub(/^(    |\t)/, "") {
-     blank = 0;
-     if(code){
-         # same block continues: write the empty lines counted above
-         for(; codeblanks > 0; codeblanks--)
-             oprint("");
-     }
-     else{
-         # new block: forget empty lines counted for an earlier block
-         codeblanks = 0;
-         oprint("<code><pre>");
-     }
-     code = 1;
-     $0 = eschtml($0);
-     oprint($0);
-     next;
- }
- # Any other line ends an open code block.
- code {
-     oprint("</pre></code>");
-     code = 0;
- }
- # Setext-style headers
- # A line made only of "=" (or only of "-") below pending text turns that
- # text into an <h1> (or <h2>).
- text && /^=+$/ {printp("h1"); next;}
- text && /^-+$/ {printp("h2"); next;}
- # Atx-style headers
- # Up to six leading "#" are removed (together with the spaces after
- # them); for each one a single trailing "#" is removed too.  The number
- # removed selects the element used when the text is flushed.
- /^#+/ && (!newli || par=="p" || /^##/) {
- for(n = 0; n < 6 && sub(/^# */, ""); n++)
- sub(/#$/, "");
- par = "h" n;
- }
- # Paragraph
- # An empty line ends the current paragraph or header.
- /^$/ {
- printp(par);
- par = "p";
- next;
- }
- # Add text
- # Anything else is appended to the pending text, joined with one space.
- { text = (text ? text " " : "") $0; }
- # End of input: close whatever is still open and write out the buffer.
- END {
-     if(code){
-         code = 0;
-         oprint("</pre></code>");
-     }
-     printp(par);
-     # close the remaining list and quote blocks, innermost first
-     while(nl > 0){
-         if(match(block[nl], /[ou]l/))
-             oprint("</li>");
-         oprint("</" block[nl] ">");
-         nl--;
-     }
-     # references that were never defined leave "<<id" placeholders behind;
-     # remove them up to the closing quote of the attribute
-     gsub(/<<[^\"]*/, "", otext);
-     print(otext);
- }
|