Conversion script.
This commit is contained in:
parent
008022be40
commit
8386ee633a
|
@ -0,0 +1,137 @@
|
||||||
|
# Removes extra spaces off a string
|
||||||
|
function ltrim(s) { sub(/^[ \t\r\n]+/, "", s); return s; }
|
||||||
|
function rtrim(s) { sub(/[ \t\r\n]+$/, "", s); return s; }
|
||||||
|
function trim(s) { return rtrim(ltrim(s)); }
|
||||||
|
|
||||||
|
# Escape slashes and amps in a string for building links
|
||||||
|
function linkescape(s) {
|
||||||
|
gsub(/\//, "\\/", s);
|
||||||
|
gsub(/\&/, "\\\\&", s);
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
|
||||||
|
# Creates a section link using normal replacement rules
|
||||||
|
function nametolink(s) {
|
||||||
|
s = tolower(s);
|
||||||
|
gsub(/[^a-z0-9 -]/, "", s);
|
||||||
|
gsub(/ /, "-", s);
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
|
||||||
|
BEGIN {
|
||||||
|
sectionCount = 0;
|
||||||
|
WAIT_FOR_START = 1;
|
||||||
|
IN_PRE = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
# PARSE_HEADERS will be true only for the TOC list of HREFs
|
||||||
|
# so we can build the list of "chapters".
|
||||||
|
PARSE_HEADERS && /<A HREF=/ {
|
||||||
|
match($0, /"#(.*)">(.*)<.A/, m);
|
||||||
|
sections[sectionCount] = m[2];
|
||||||
|
sectionsLink[sectionCount] = m[1];
|
||||||
|
sectionCount++;
|
||||||
|
next;
|
||||||
|
}
|
||||||
|
|
||||||
|
# This marks the beginning of the TOC really.
|
||||||
|
WAIT_FOR_START && /<B>Nocash PSXSPX Playstation Specifications<.B><BR>/ {
|
||||||
|
PARSE_HEADERS = 1;
|
||||||
|
next;
|
||||||
|
}
|
||||||
|
|
||||||
|
# These markers are to split off sections and subsections.
|
||||||
|
/^<!--------------------------------------->/ {
|
||||||
|
if (PARSE_HEADERS) WAIT_FOR_START = 0;
|
||||||
|
PARSE_HEADERS = 0;
|
||||||
|
WAIT_SECTION = 1;
|
||||||
|
next;
|
||||||
|
}
|
||||||
|
|
||||||
|
# Trash anything that's in the TOC. We'll build a new one later.
|
||||||
|
WAIT_FOR_START { next; }
|
||||||
|
|
||||||
|
# We need to do slighly different parsing if we're in or out a PRE section.
|
||||||
|
/<TABLE.*PRE>/ { IN_PRE = 1; }
|
||||||
|
/<.TD><.TR><.TABLE>/ { IN_PRE = 0; }
|
||||||
|
|
||||||
|
# Generic sed-like line replacements.
|
||||||
|
{
|
||||||
|
# The input file isn't valid html to begin with,
|
||||||
|
# so we're not going to do a true html parser.
|
||||||
|
# Replace the typical html escapes right off the bat.
|
||||||
|
gsub(/ /, " ");
|
||||||
|
if (IN_PRE) {
|
||||||
|
gsub(/</, "<");
|
||||||
|
gsub(/>/, ">");
|
||||||
|
} else {
|
||||||
|
# outside of <PRE> blocks, we want to escape these for the md format.
|
||||||
|
gsub(/</, "\\<");
|
||||||
|
gsub(/>/, "\\>");
|
||||||
|
}
|
||||||
|
gsub(/&/, "\\&");
|
||||||
|
|
||||||
|
# These are fairly straightforward replacements
|
||||||
|
sub(/<TABLE.*PRE>/, "```");
|
||||||
|
sub(/<.TD><.TR><.TABLE>/, "```");
|
||||||
|
$0 = gensub(/<B>(.*)<.B><BR>/, "#### \\1", "g");
|
||||||
|
sub(/<BR>/, "<br/>");
|
||||||
|
}
|
||||||
|
|
||||||
|
# Skip the remaining cruft.
|
||||||
|
/^<TABLE/ { next; }
|
||||||
|
/^<.FONT/ { next; }
|
||||||
|
|
||||||
|
# We need to parse the header of a major or minor section.
|
||||||
|
WAIT_SECTION {
|
||||||
|
WAIT_SECTION = 0;
|
||||||
|
WAIT_HEADER = 1;
|
||||||
|
match($0, /<A NAME="(.*)">/, m);
|
||||||
|
link = m[1];
|
||||||
|
found = 0;
|
||||||
|
# Checking if it was in our TOC to distingish between section or subsection.
|
||||||
|
for (i = 0; i < sectionCount; i++) {
|
||||||
|
if (sectionsLink[i] == link) {
|
||||||
|
GOT_MAJOR_SECTION = 1;
|
||||||
|
sectionFile = link;
|
||||||
|
found = 1;
|
||||||
|
subsectionsCount = 0;
|
||||||
|
printf("SWITCHED TO MAJOR SECTION: %s\n", link);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!found) {
|
||||||
|
subsection = link;
|
||||||
|
printf("SWITCHED TO MINOR SECTION: %s\n", link);
|
||||||
|
}
|
||||||
|
next;
|
||||||
|
}
|
||||||
|
|
||||||
|
# Creating the proper section header, and the second pass sed script.
|
||||||
|
WAIT_HEADER {
|
||||||
|
WAIT_HEADER = 0;
|
||||||
|
if (GOT_MAJOR_SECTION) {
|
||||||
|
print("s/<A HREF=\"#" link "\">[^<]*<.A>/[" linkescape(trim($0)) "](" link ".md)/g") > "psx-spx-pass2.sed";
|
||||||
|
$0 = "# " $0;
|
||||||
|
} else {
|
||||||
|
print("s/<A HREF=\"#" link "\">[^<]*<.A>/[" linkescape(trim($0)) "](" sectionFile ".md#" nametolink(trim($0)) ")/g") > "psx-spx-pass2.sed";
|
||||||
|
$0 = "## " $0;
|
||||||
|
}
|
||||||
|
|
||||||
|
GOT_MAJOR_SECTION = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
# If we get here, output our current line to the current section md file.
|
||||||
|
{ print > sectionFile ".md" }
|
||||||
|
|
||||||
|
END {
|
||||||
|
# Emit the TOC and finish the second pass sed script, as well as
|
||||||
|
# the second pass runner script.
|
||||||
|
for (i = 0; i < sectionCount; i++) {
|
||||||
|
link = sectionsLink[i];
|
||||||
|
name = sections[i];
|
||||||
|
print("sed -f psx-spx-pass2.sed -i " link ".md") > "run-pass2.sh";
|
||||||
|
print("[" name "](" link ".md)<br/>") > "index.md";
|
||||||
|
}
|
||||||
|
|
||||||
|
print("s/^<br\\/>$//") > "psx-spx-pass2.sed";
|
||||||
|
}
|
|
@ -0,0 +1,18 @@
|
||||||
|
#!/bin/sh
|
||||||
|
|
||||||
|
# Download and slightly adjust the input file.
|
||||||
|
# There's one occurence where there's a newline missing, which throws our awk script off.
|
||||||
|
# Might as well fix it here rather than making the awk script more complex.
|
||||||
|
curl https://problemkaputt.de/psx-spx.htm |
|
||||||
|
dos2unix |
|
||||||
|
sed 's|</FONT></TD></TR></TABLE><TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0><TR><TD><PRE>|</FONT></TD></TR></TABLE>\n<TABLE BORDER=0 CELLSPACING=0 CELLPADDING=0><TR><TD><PRE>|' > psx-spx.html
|
||||||
|
|
||||||
|
# Run the awk script to generate all of the .md files, as well as the second pass scripts.
|
||||||
|
gawk -f psx-spx-pass1.awk psx-spx.html
|
||||||
|
|
||||||
|
# Invoke the second pass generated by the first pass script, to adjust all the cross references.
|
||||||
|
. ./run-pass2.sh
|
||||||
|
|
||||||
|
# Cleanup
|
||||||
|
rm psx-spx.html run-pass2.sh psx-spx-pass2.sed
|
||||||
|
mv *.md ..
|
Loading…
Reference in New Issue