From 8386ee633abe6597903cc6439027ce1518847de2 Mon Sep 17 00:00:00 2001 From: Nicolas 'Pixel' Noble Date: Thu, 23 Jul 2020 09:51:52 -0700 Subject: [PATCH] Conversion script. --- conversion/psx-spx-pass1.awk | 137 +++++++++++++++++++++++++++++++++++ conversion/run.sh | 18 +++++ 2 files changed, 155 insertions(+) create mode 100644 conversion/psx-spx-pass1.awk create mode 100755 conversion/run.sh diff --git a/conversion/psx-spx-pass1.awk b/conversion/psx-spx-pass1.awk new file mode 100644 index 0000000..19529e7 --- /dev/null +++ b/conversion/psx-spx-pass1.awk @@ -0,0 +1,137 @@ +# Removes extra spaces off a string +function ltrim(s) { sub(/^[ \t\r\n]+/, "", s); return s; } +function rtrim(s) { sub(/[ \t\r\n]+$/, "", s); return s; } +function trim(s) { return rtrim(ltrim(s)); } + +# Escape slashes and amps in a string for building links +function linkescape(s) { + gsub(/\//, "\\/", s); + gsub(/\&/, "\\\\&", s); + return s; +} + +# Creates a section link using normal replacement rules +function nametolink(s) { + s = tolower(s); + gsub(/[^a-z0-9 -]/, "", s); + gsub(/ /, "-", s); + return s; +} + +BEGIN { + sectionCount = 0; + WAIT_FOR_START = 1; + IN_PRE = 0; +} + +# PARSE_HEADERS will be true only for the TOC list of HREFs +# so we can build the list of "chapters". +PARSE_HEADERS && /(.*)<.A/, m); + sections[sectionCount] = m[2]; + sectionsLink[sectionCount] = m[1]; + sectionCount++; + next; +} + +# This marks the beginning of the TOC really. +WAIT_FOR_START && /Nocash PSXSPX Playstation Specifications<.B>
/ { + PARSE_HEADERS = 1; + next; +} + +# These markers are to split off sections and subsections. +/^/ { + if (PARSE_HEADERS) WAIT_FOR_START = 0; + PARSE_HEADERS = 0; + WAIT_SECTION = 1; + next; +} + +# Trash anything that's in the TOC. We'll build a new one later. +WAIT_FOR_START { next; } + +# We need to do slighly different parsing if we're in or out a PRE section. +// { IN_PRE = 1; } +/<.TD><.TR><.TABLE>/ { IN_PRE = 0; } + +# Generic sed-like line replacements. +{ + # The input file isn't valid html to begin with, + # so we're not going to do a true html parser. + # Replace the typical html escapes right off the bat. + gsub(/ /, " "); + if (IN_PRE) { + gsub(/</, "<"); + gsub(/>/, ">"); + } else { + # outside of
 blocks, we want to escape these for the md format.
+        gsub(/</, "\\<");
+        gsub(/>/, "\\>");
+    }
+    gsub(/&/, "\\&");
+
+    # These are fairly straightforward replacements
+    sub(//, "```");
+    sub(/<.TD><.TR><.TABLE>/, "```");
+    $0 = gensub(/(.*)<.B>
/, "#### \\1", "g"); + sub(/
/, "
"); +} + +# Skip the remaining cruft. +/^/, m); + link = m[1]; + found = 0; + # Checking if it was in our TOC to distingish between section or subsection. + for (i = 0; i < sectionCount; i++) { + if (sectionsLink[i] == link) { + GOT_MAJOR_SECTION = 1; + sectionFile = link; + found = 1; + subsectionsCount = 0; + printf("SWITCHED TO MAJOR SECTION: %s\n", link); + } + } + if (!found) { + subsection = link; + printf("SWITCHED TO MINOR SECTION: %s\n", link); + } + next; +} + +# Creating the proper section header, and the second pass sed script. +WAIT_HEADER { + WAIT_HEADER = 0; + if (GOT_MAJOR_SECTION) { + print("s/[^<]*<.A>/[" linkescape(trim($0)) "](" link ".md)/g") > "psx-spx-pass2.sed"; + $0 = "# " $0; + } else { + print("s/[^<]*<.A>/[" linkescape(trim($0)) "](" sectionFile ".md#" nametolink(trim($0)) ")/g") > "psx-spx-pass2.sed"; + $0 = "## " $0; + } + + GOT_MAJOR_SECTION = 0; +} + +# If we get here, output our current line to the current section md file. +{ print > sectionFile ".md" } + +END { + # Emit the TOC and finish the second pass sed script, as well as + # the second pass runner script. + for (i = 0; i < sectionCount; i++) { + link = sectionsLink[i]; + name = sections[i]; + print("sed -f psx-spx-pass2.sed -i " link ".md") > "run-pass2.sh"; + print("[" name "](" link ".md)
") > "index.md"; + } + + print("s/^$//") > "psx-spx-pass2.sed"; +} diff --git a/conversion/run.sh b/conversion/run.sh new file mode 100755 index 0000000..25c2b07 --- /dev/null +++ b/conversion/run.sh @@ -0,0 +1,18 @@ +#!/bin/sh + +# Download and slightly adjust the input file. +# There's one occurence where there's a newline missing, which throws our awk script off. +# Might as well fix it here rather than making the awk script more complex. +curl https://problemkaputt.de/psx-spx.htm | + dos2unix | + sed 's|
|
\n
|' > psx-spx.html
+
+# Run the awk script to generate all of the .md files, as well as the second pass scripts.
+gawk -f psx-spx-pass1.awk psx-spx.html
+
+# Invoke the second pass generated by the first pass script, to adjust all the cross references.
+. ./run-pass2.sh
+
+# Cleanup
+rm psx-spx.html run-pass2.sh psx-spx-pass2.sed
+mv *.md ..