kvisc/as/k-as.py

#!/usr/bin/python3

# The OS/K Team licenses this file to you under the MIT license.
# See the LICENSE file in the project root for more information.

import os
import sys
import subprocess
from array import array
from tempfile import TemporaryFile

# Debug switch checked by gentext().
WANT_DISASM = False

# Positional arguments, as they are consumed further down in this file:
#   argv[1]  source file  (opened for reading)
#   argv[2]  memory entry point (parsed with int(..., base=0))
#   argv[3]  output file  (opened with mode "wb")
# The usage text must list them in that order; advertising the output file
# first would make a user who follows it truncate their own source file.
if len(sys.argv) != 4:
    print("Usage: {} (source file) (memory entry point) (output file)"
                    .format(sys.argv[0]))
    sys.exit(1)

# Scratch files shared by the assembler passes:
#   source - include-expanded source text (written by do_includes, read by parse)
#   instrs - intermediate token stream (written by parse_instr, read by gentext)
#   b_data - encoded contents of the .data section
#   b_text - encoded contents of the .text section
source = TemporaryFile(mode="w+")
instrs = TemporaryFile(mode="w+")
b_data = TemporaryFile(mode="w+b")
b_text = TemporaryFile(mode="w+b")

# Register / instruction name tables, looked up under sys.path[0]
lst_regs = open(os.path.join(sys.path[0], "regs.lst"))
lst_instrs = open(os.path.join(sys.path[0], "instrs.lst"))

# argv[1] is read as the main source file, argv[3] is the binary output
main_src = open(sys.argv[1])
b_out = open(sys.argv[3], "wb")

# base=0 lets the entry point be given as decimal, 0x..., 0o... or 0b...
# NOTE(review): start_addr is not referenced anywhere else in the visible
# code (gentext uses a fixed text start address) - confirm whether intended.
start_addr = int(sys.argv[2], base=0)

def leave():
    """Close every file handle opened at start-up (scratch files, tables, I/O)."""
    handles = (
        source,
        instrs,
        b_out,
        b_data,
        b_text,
        main_src,
        lst_regs,
        lst_instrs,
    )
    for handle in handles:
        handle.close()

#-------------------------------------------------------------------------------

# Preprocessor definitions: name -> replacement text
pdefs = {}

# Register names, in the order they are read from the register table
pregs = []

# Instruction names, in the order they are read from the instruction table
pinstrs = []

# Label name -> offset inside its own section
plabels_text = {}
plabels_data = {}

# Running sizes of the .data and .text sections.
# Once parse() is done, neither of them is modified again.
pdata = 0
ptext = 0

# Last non-local label seen; local labels (leading '.') are prefixed with it
plastlabel = ''

# Padding bytes between .text and .data
pdata_pad = 0

#-------------------------------------------------------------------------------

def name_valid(name):
    """Return True when name, lower-cased, uses only characters allowed in names.

    An empty name is considered valid.
    """
    allowed = 'abcdefghijklmnopqrstuvwxyz0123456789[$._+]=,'
    return all(ch in allowed for ch in name.lower())

def is_number(s):
    """Return True when s is an integer literal accepted by int(s, base=0).

    That covers decimal as well as 0x / 0o / 0b prefixed literals, with an
    optional sign.
    """
    try:
        int(s, base=0)
    except ValueError:
        return False
    else:
        return True
#-------------------------------------------------------------------------------

def parse_lst_regs():
    """Append every line of lst_regs, stripped of whitespace, to pregs."""
    pregs.extend(entry.strip() for entry in lst_regs)

def parse_lst_instrs():
    """Append every line of lst_instrs, stripped of whitespace, to pinstrs."""
    pinstrs.extend(entry.strip() for entry in lst_instrs)

#-------------------------------------------------------------------------------

# Current include nesting level, and the level at which we give up
inc_depth = 0
inc_depth_max = 16

def do_includes(fi):
    """Copy fi into the scratch file `source`, expanding "include" directives ONLY.

    A directive is a line whose first space-separated token is exactly
    `include` (so it must start in column 0), followed by a file name wrapped
    in matching single or double quotes.  The named file is expanded
    recursively in place of the directive.  Every other line is copied with
    trailing whitespace removed; leading whitespace is preserved.

    fi: an iterable of text lines (typically an open file).

    Any error (missing or malformed parameter, unreadable file, nesting too
    deep) prints a message, closes all files via leave() and exits with
    status 1.
    """
    global inc_depth

    for line in fi:
        line = line.rstrip()
        tok = line.split(' ', 1)

        if tok[0] != "include":
            source.write("{}\n".format(line))
            continue

        if len(tok) == 1:
            print("Missing parameter for include directive")
            leave()
            sys.exit(1)

        arg = tok[1]
        if len(arg) < 2 or arg[0] not in "'\"" or arg[-1] != arg[0]:
            print("Invalid format for include directive: {}".format(line))
            leave()
            sys.exit(1)

        # Checked before opening so that no handle is left dangling on error.
        if inc_depth + 1 >= inc_depth_max:
            print("Maximal include depth reached: {}".format(line))
            leave()
            sys.exit(1)

        try:
            new_fi = open(arg[1:-1], "r")

        except OSError:
            print("Couldn't open file: {}".format(line))
            leave()
            sys.exit(1)

        # inc_depth tracks *nesting*: it is restored once the included file
        # has been fully expanded, so sibling includes do not accumulate.
        with new_fi:
            inc_depth += 1
            do_includes(new_fi)
            inc_depth -= 1


#-------------------------------------------------------------------------------

def parse():
    """Walk the include-expanded source and dispatch every meaningful line.

    For each line, after removing trailing whitespace and comments:
      * a line starting with a space or tab is an instruction: it is handed to
        parse_instr(), its size is added to ptext, and a newline is appended
        to the intermediate stream `instrs`;
      * a line ending in ':' is a text label, recorded in plabels_text at the
        current ptext offset (a leading '.' makes it local to the last
        non-local label);
      * anything else goes to parse_preproc().

    A comment starts at the first '#', ';', '@', '!' or '/' found outside a
    quoted string.  A string is closed only by the same quote character that
    opened it, so an apostrophe inside a double-quoted string is plain text.

    Errors print a message, close all files via leave() and exit with status 1.
    """
    global ptext
    global plastlabel

    source.seek(0)

    for line in source:
        line = line.rstrip()

        if not line:
            continue

        # Quote character of the string we are currently inside, if any
        opening = None
        for i, ch in enumerate(line):
            if opening is not None:
                if ch == opening:
                    opening = None

            elif ch in "'\"":
                opening = ch

            elif ch in '#;@!/':
                line = line[:i].rstrip()
                break

        if opening is not None:
            print("Unterminated string in line: {}".format(line))
            leave()
            sys.exit(1)

        if not line:
            continue

        if line[0] == ' ' or line[0] == '\t':
            ptext += parse_instr(line.lstrip())
            instrs.write("\n")
            continue

        # Preprocessor or label?
        if line[-1] == ':':
            label = line[:-1]

            # An empty name would otherwise crash on label[0] below
            if not label or not name_valid(label):
                print("Bad label name: {}".format(label))
                leave()
                sys.exit(1)

            if label[0] == '.':
                label = plastlabel + label
            else:
                plastlabel = label

            plabels_text[label] = ptext
            continue

        # Preprocessor, .data, or invalid
        parse_preproc(line)

#-------------------------------------------------------------------------------

def parse_preproc(line):
    """Handle a non-instruction, non-label line: a define or a .data entry.

    Recognized forms (whitespace-separated):
      NAME := TEXT      preprocessor define, stored in pdefs
      NAME = NUMBER     64-bit little-endian integer in .data
      NAME = [SIZE]     zero-filled buffer; also defines NAME_len as SIZE
      NAME = "STRING"   string bytes (single or double quotes, escapes \\n and
                        \\t), followed by 1 to 8 NUL bytes so that the entry
                        ends on an 8-byte boundary; also defines NAME_len as
                        the number of bytes emitted before the padding

    Data labels are recorded in plabels_data at the current pdata offset and
    pdata is advanced by the number of bytes written to b_data.  A leading
    '.' makes the label local to the last non-local label.

    Malformed data entries print a message, close all files via leave() and
    exit with status 1.  A line matching none of the forms is reported as an
    unrecognized directive but is not fatal.
    """
    global pdata

    def fail(message):
        print(message.format(line))
        leave()
        sys.exit(1)

    tok = line.split(None, 2)

    # preprocessor
    if len(tok) > 1 and tok[1] == ':=':
        if len(tok) < 3:
            fail("Invalid format: {}")
        pdefs[tok[0]] = tok[2]
        return

    # .data
    if len(tok) > 1 and tok[1] == '=':
        if len(tok) < 3:
            fail("Invalid format: {}")

        label = tok[0]
        if label[0] == '.':
            label = plastlabel + label

        plabels_data[label] = pdata
        value = tok[2]

        # number data
        if is_number(value):
            number = int(value, base=0)
            try:
                # Negative values are stored in two's complement
                raw = number.to_bytes(8, byteorder='little', signed=number < 0)
            except OverflowError:
                fail("Value does not fit in 64 bits: {}")
            pdata += b_data.write(raw)

        # buffer / bss
        elif value[0] == '[':
            if value[-1] != ']':
                fail("Invalid bss format: {}")

            s = value[1:-1].strip()
            if not is_number(s):
                fail("Invalid bss format: {}")

            size = int(s, base=0)
            if size < 0:
                fail("Invalid bss format: {}")

            # Round up to the next multiple of 8; an already aligned size
            # still gets 8 extra bytes, as it always has.
            size += 8 - size % 8
            pdata += b_data.write(bytearray(size))
            pdefs[label + "_len"] = s

        # string data
        elif value[0] in "'\"":
            s = value.strip()
            if len(s) < 2 or s[-1] != s[0]:
                fail("Invalid format: {}")

            payload = bytearray()
            escaping = False

            for c in s[1:-1]:
                # escape sequences
                if c == '\\':
                    escaping = True
                    continue

                if escaping:
                    escaping = False

                    if c == 'n':
                        c = '\n'
                    elif c == 't':
                        c = '\t'
                    else:
                        fail("Unrecognized escape sequence: {}")

                if ord(c) > 0xFF:
                    fail("Character does not fit in one byte: {}")

                payload.append(ord(c))

            # A backslash right before the closing quote escapes nothing
            if escaping:
                fail("Unrecognized escape sequence: {}")

            real_len = len(payload)

            # align: pad from the number of bytes actually emitted (not the
            # raw literal length, which also counts the backslashes)
            payload.extend(bytes(8 - real_len % 8))
            pdata += b_data.write(payload)

            pdefs[label + "_len"] = str(real_len)

        else:
            fail("Invalid format: {}")

        return

    print("Unrecognized directive: {}".format(line))

#-------------------------------------------------------------------------------

# Condition suffix -> 5-bit condition code ('z' and 'e' share a code)
pconds = {
    'c':    0b00001,
    'o':    0b00010,
    'z':    0b00011,
    'e':    0b00011,
    's':    0b00100,
    'p':    0b00101,
    'a':    0b00110,
    'ae':   0b00111,
    'b':    0b01000,
    'be':   0b01001,
    'g':    0b01010,
    'ge':   0b01011,
    'l':    0b01100,
    'le':   0b01101,
    'cxz':  0b01110,
    'cxnz': 0b11110,
}

def get_cond_mask(cond, line):
    """Return the condition bits for suffix cond, shifted into place (<< 10).

    A leading 'n' negates the condition by setting bit 0b10000 on top of the
    code looked up for the rest of the suffix.

    cond: the text following the '.' in the mnemonic (may be empty).
    line: the source line, used only for the error message.

    An unknown or empty suffix prints a message, closes all files via
    leave() and exits with status 1.
    """
    mask = 0

    # startswith() rather than cond[0]: an empty suffix must reach the
    # error below instead of raising IndexError.
    if cond.startswith('n'):
        cond = cond[1:]
        mask = 0b10000

    if cond not in pconds:
        print("Invalid condition suffix: {}".format(line))
        leave()
        sys.exit(1)

    return (mask | pconds[cond]) << 10

#-------------------------------------------------------------------------------

# Operand format name -> 5-bit format code.
# Memory formats are prefixed with the access length (b, w, l or q).
pfts = {
    "reg":      0b00001,
    "imm64":    0b00010,

    "bimm64":   0b00100,
    "brr":      0b00101,
    "brri":     0b00110,
    "brrii":    0b00111,

    "wimm64":   0b01000,
    "wrr":      0b01001,
    "wrri":     0b01010,
    "wrrii":    0b01011,

    "limm64":   0b01100,
    "lrr":      0b01101,
    "lrri":     0b01110,
    "lrrii":    0b01111,

    "qimm64":   0b10000,
    "qrr":      0b10001,
    "qrri":     0b10010,
    "qrrii":    0b10011,
}

def get_fts_mask(ft, line):
    """Return the format code for operand format ft.

    ft:   a key of pfts.
    line: the source line, used only for the error message.

    An unknown format prints a message, closes all files via leave() and
    exits with status 1, like every other assembly error.  (Returning None
    here would only make the caller crash on the following bit operation.)
    """
    if ft not in pfts:
        print("Invalid operand format ({}): {}".format(ft, line))
        leave()
        sys.exit(1)

    return pfts[ft]
    
#-------------------------------------------------------------------------------

def parse_instr(line):
    """Translate one instruction line into the intermediate stream `instrs`.

    line: the instruction text with leading whitespace already removed, e.g.
          "mov rax, b[rbx+8]", "jmp.nz .loop" or "rep movsb".

    What gets written (without a trailing newline) is:
        <name> %%imm16 <w2> <operand tokens...>
    where <name> is the mnemonic plus one suffix per operand (_r register,
    _i immediate or label, _m memory), and <w2> packs the rep bit (0x8000),
    the condition bits and the two operand format codes (first operand << 5).

    Operand tokens:
      register           the register name
      number / define    %%imm64 <value>
      label              %%imm64 <label> (local labels get plastlabel prefixed)
      memory offset form <reg2>:<reg1> followed by up to two %%imm16 values

    Memory operands need an access length prefix (b, w, l or q) and may be
    [imm64], [reg], [reg*imm16], [reg+reg], [reg+imm16], [reg*imm16+imm16],
    [reg+reg*imm16], [reg+reg+imm16] or [reg+reg*imm16+imm16].  A missing
    register in an offset form is written as "inv".

    Returns the encoded size of the instruction in bytes (0 for an empty
    line).  Malformed input prints a message, closes all files via leave()
    and exits with status 1.
    """
    if line is None or len(line) == 0:
        return 0

    def fail(message):
        print("{}: {}".format(message, line))
        leave()
        sys.exit(1)

    def split_scaled(term):
        # "reg*imm16" -> [reg, imm16]
        parts = term.split('*')
        if len(parts) != 2:
            fail("Bad memory operand")
        return parts

    tok = line.split(' ', 1)

    instr = tok[0].strip()
    params = tok[1].strip() if len(tok) > 1 else None

    fellthrough = False

    # 2 bytes for the instruction index, 2 bytes for word 2
    size = 4

    # Word 2 (rep|cond|ft1|ft2)
    w2 = 0

    if '.' in instr:
        instr, cond = instr.split('.', 1)
        w2 |= get_cond_mask(cond, line)

    if instr == "rep":
        if params is None:
            fail("Missing instruction after rep prefix")

        w2 |= 0x8000    # 16th bit

        rep_tok = params.split(' ', 1)
        instr = rep_tok[0]
        params = rep_tok[1] if len(rep_tok) == 2 else None

    instr_name = instr
    instr_args = ''

    if params is None or len(params) == 0:
        instrs.write("{} %%imm16 {}".format(instr_name, w2))
        return size

    # FT1 and FT2
    fts = ''

    #
    # Parse operands, generating fts along the way
    #
    for word in params.split(','):
        word = word.strip()

        if not word:
            fail("Empty operand")

        instr_args += ' '

        gotPref = False
        if len(fts) != 0:
            fts += ' '

        # memory length prefixes
        if len(word) > 2 and '[' in word:
            if word[0] in 'bwlq':
                fts += word[0]
                gotPref = True
            else:
                fail("Bad memory length prefix")

            word = word[1:].strip()
            if not word.startswith('['):
                fail("Bad memory operand")

        #
        # Determine memory format and save it into fts
        #
        if word[0] == '[':
            if word[-1] != ']':
                fail("Bad memory operand")
            word = word[1:-1]

            #
            # Make sure we got an access length prefix
            #
            if not gotPref:
                fail("Missing access length modifier")

            if not word:
                fail("Bad memory operand")

            instr_name += "_m"

            #
            # Offsets
            #
            if '+' in word or '*' in word:

                reg1 = "inv"
                reg2 = "inv"
                imm1 = '1'
                imm2 = '0'

                wtok = word.split('+')

                #
                # [reg*imm16]
                # (no '+', so the '*' is what brought us here)
                #
                if len(wtok) == 1:
                    reg2, imm1 = split_scaled(wtok[0])

                #
                # [reg+reg], [reg+imm16], [reg*imm16+imm16], [reg+reg*imm16]
                #
                elif len(wtok) == 2:
                    # Must be [reg*imm16+imm16]
                    if '*' in wtok[0]:
                        if not is_number(wtok[1].strip()):
                            fail("Bad memory operand")

                        reg2, imm1 = split_scaled(wtok[0])
                        imm2 = wtok[1]

                    # Must be [reg+reg*imm16]
                    elif '*' in wtok[1]:
                        reg1 = wtok[0]
                        reg2, imm1 = split_scaled(wtok[1])

                    elif is_number(wtok[1].strip()):
                        reg1 = wtok[0]
                        imm2 = wtok[1]

                    # Must be [reg+reg]
                    else:
                        reg1 = wtok[0]
                        reg2 = wtok[1]

                #
                # [reg+reg+imm16], [reg+reg*imm16+imm16]
                #
                elif len(wtok) == 3:
                    reg1 = wtok[0]
                    imm2 = wtok[2]

                    if '*' in wtok[1]:
                        reg2, imm1 = split_scaled(wtok[1])
                    else:
                        reg2 = wtok[1]

                else:
                    fail("Bad memory operand")

                #
                # Update fts and instr_args
                #

                # +2 for the register pair
                instr_args += "{}:{} ".format(reg2.strip(), reg1.strip())
                size += 2

                if imm1 == '1':
                    if imm2 == '0':
                        fts += 'rr'
                    else:
                        # +2 for the offset
                        fts += 'rri'
                        size += 2
                        instr_args += "%%imm16 {}".format(imm2)

                else:
                    # +2 for the scale, +2 for the offset
                    size += 4
                    fts += 'rrii'
                    instr_args += "%%imm16 {} %%imm16 {}".format(imm1, imm2)

                continue

            #
            # [imm64] or [reg]
            #
            else:
                fellthrough = True
                # FALLTHROUGH

        # preprocessor
        if word in pdefs:
            word = pdefs[word]
            # Fall through

        # for now every immediate is 64-bit
        if is_number(word):
            # +8 for immediate
            size += 8

            if not fellthrough:
                instr_name += "_i"

            fts += "imm64"

            instr_args += "%%imm64 "
            instr_args += word

            fellthrough = False
            continue

        # register
        if word in pregs:
            size += 2

            if not fellthrough:
                instr_name += "_r"
                fts += "reg"

            else:
                fts += "rr"

            instr_args += word
            fellthrough = False
            continue

        # it's a label (a 64-bit immediate)
        # +8 for immediate
        size += 8

        if not fellthrough:
            instr_name += "_i"

        fts += "imm64"
        instr_args += "%%imm64 "

        if word[0] == '.':
            instr_args += plastlabel

        instr_args += word
        fellthrough = False

    #
    # Compute FT1 and FT2
    #
    formats = fts.split(' ')

    if len(formats) > 2:
        fail("Too many operands")

    w2 |= get_fts_mask(formats[0], line) << 5
    if len(formats) == 2:
        w2 |= get_fts_mask(formats[1], line)

    instrs.write("{} %%imm16 {}{}".format(instr_name, w2, instr_args))
    return size

#-------------------------------------------------------------------------------

# Markers in the intermediate stream; they emit nothing themselves but
# select how the next numeric token is encoded.
special_syms = {
    "%%imm16",
    "%%imm32",
    "%%imm64",
    "%%signed"
}

def gentext():
    """Encode the intermediate stream `instrs` into the binary .text in b_text.

    Each space-separated token is encoded as follows (first match wins):
      register name       2-byte index into pregs
      "reg2:reg1" pair    2 bytes: (index of reg1 << 8) | index of reg2
      instruction name    2-byte index into pinstrs
      text label          8-byte address: text start + label offset
      data label          8-byte address: data start + label offset + pdata_pad
      size marker         sets the width of the following number
                          (%%imm16 and %%signed: 2, %%imm32: 4, %%imm64: 8)
      number              little-endian, width from the last size marker,
                          signed when written with a leading '-'

    The data start is the end of .text rounded up to the next multiple of 8
    (a full 8 bytes when already aligned), matching the padding genout()
    inserts between the two sections.

    Any token that cannot be encoded prints a message, closes all files via
    leave() and exits with status 1.
    """
    instrs.seek(0)

    if WANT_DISASM:
        print(instrs.read())
        instrs.seek(0)

    # NOTE(review): the text start is fixed here; the entry point parsed from
    # the command line (start_addr) is not consulted - confirm whether intended.
    text_start = 0x100000
    data_start = text_start + ptext
    data_start += (8 - data_start % 8)

    def fail(message, word, line):
        print(message.format(word, line))
        leave()
        sys.exit(1)

    def index_of(names):
        # name -> first position, i.e. what list.index() would return
        table = {}
        for pos, name in enumerate(names):
            table.setdefault(name, pos)
        return table

    reg_index = index_of(pregs)
    instr_index = index_of(pinstrs)

    imm_sizes = {
        "%%imm16": 2,
        "%%imm32": 4,
        "%%imm64": 8,
        "%%signed": 2,
    }

    # Width in bytes of the next numeric token; unknown until a marker is seen
    lastimm = None

    for line in instrs:
        for word in line.strip().split(' '):
            if len(word) == 0:
                continue

            if word in reg_index:
                b_text.write(reg_index[word].to_bytes(2, byteorder='little', signed=False))
                continue

            if ':' in word:
                reg2, reg1 = word.split(':', 1)
                if reg1 not in reg_index or reg2 not in reg_index:
                    fail("Assembly error, unknown token '{}' in line: {}", word, line)

                pair = (reg_index[reg1] << 8) | reg_index[reg2]
                b_text.write(pair.to_bytes(2, byteorder='little', signed=False))
                continue

            if word in instr_index:
                b_text.write(instr_index[word].to_bytes(2, byteorder='little', signed=False))
                continue

            if word in plabels_text:
                addr = text_start + plabels_text[word]
                b_text.write(addr.to_bytes(8, byteorder='little', signed=False))
                continue

            if word in plabels_data:
                addr = data_start + plabels_data[word] + pdata_pad
                b_text.write(addr.to_bytes(8, byteorder='little', signed=False))
                continue

            if word in special_syms:
                lastimm = imm_sizes[word]
                continue

            if is_number(word):
                if lastimm is None:
                    fail("Assembly error, immediate '{}' has no size marker in line: {}", word, line)

                # Signedness follows the literal itself, not the marker
                try:
                    raw = int(word, base=0).to_bytes(lastimm, byteorder='little', signed=word[0] == '-')
                except OverflowError:
                    fail("Assembly error, immediate '{}' out of range in line: {}", word, line)

                b_text.write(raw)
                continue

            fail("Assembly error, unknown token '{}' in line: {}", word, line)

#-------------------------------------------------------------------------------

def genout():
    """Write the final image to b_out: .text, zero padding, then .data.

    The padding is 8 - ptext % 8 bytes long, i.e. between 1 and 8 bytes.
    """
    b_text.seek(0)
    b_data.seek(0)

    text_image = b_text.read()
    padding = bytes(8 - ptext % 8)
    data_image = b_data.read()

    b_out.write(text_image)
    b_out.write(padding)
    b_out.write(data_image)

#-------------------------------------------------------------------------------

# Driver: run the assembler passes in order.
parse_lst_instrs()      # load instruction names into pinstrs
parse_lst_regs()        # load register names into pregs
do_includes(main_src)   # expand "include" directives into the scratch source
parse()                 # build labels, .data and the intermediate stream
gentext()               # encode the intermediate stream into .text
genout()                # write .text, padding and .data to the output file
leave()                 # close every open file
sys.exit(0)