#!/usr/bin/env python3

# Copyright 2020 Viacheslav Slavinsky
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
#    this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
#    this list of conditions and the following disclaimer in the documentation
#    and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
#    contributors may be used to endorse or promote products derived from this
#    software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
# OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
# OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
# OF THE POSSIBILITY OF SUCH DAMAGE.

import sys
import os

ENCODING = 'utf-8'   # encoding of the plain-text side; try 'utf-8' or 'cp1251'
ENCODINGS = ['utf-8', 'cp1251']


# states of the .bas reader state machine
class State:
    DUMMY0 = 0
    DUMMY1 = 1
    LINENUM0 = 2
    LINENUM1 = 3
    TOKENS = 4
    END = 5


# tokenizer modes; QUOTE and VERBATIM suppress keyword matching
class Mode:
    INITIAL = 0x00
    TOKENIZE = 0x01
    QUOTE = 0x20
    VERBATIM = 0x40


class Tokens:
    QUOTE = '"'
    # characters for token codes 0x20..0x7f
    Chars = ' !"#$%&\'()*+,-./0123456789:;<=>?@ABC' + \
            'DEFGHIJKLMNOPQRSTUVWXYZ[\\]^_ЮАБЦДЕФГ' + \
            'ХИЙКЛМНОПЯРСТУЖВЬЫЗШЭЩЧ' + chr(127)
    # keywords for token codes 0x80 and up
    Words = ['CLS', 'FOR', 'NEXT', 'DATA', 'INPUT', 'DIM', 'READ', 'CUR', 'GOTO',
             'RUN', 'IF', 'RESTORE', 'GOSUB', 'RETURN', 'REM', 'STOP', 'OUT', 'ON',
             'PLOT', 'LINE', 'POKE', 'PRINT', 'DEF', 'CONT', 'LIST', 'CLEAR',
             'CLOAD', 'CSAVE', 'NEW', 'TAB(', 'TO', 'SPC(', 'FN', 'THEN', 'NOT',
             'STEP', '+', '-', '*', '/', '^', 'AND', 'OR', '>', '=', '<', 'SGN', 'INT',
             'ABS', 'USR', 'FRE', 'INP', 'POS', 'SQR', 'RND', 'LOG', 'EXP', 'COS',
             'SIN', 'TAN', 'ATN', 'PEEK', 'LEN', 'STR$', 'VAL', 'ASC', 'CHR$',
             'LEFT$', 'RIGHT$', 'MID$', 'POINT', 'INKEY$', 'AT', '&', 'BEEP',
             'PAUSE', 'VERIFY', 'HOME', 'EDIT', 'DELETE', 'MERGE', 'AUTO', 'HIMEM',
             '@', 'ASN', 'ADDR', 'PI', 'RENUM', 'ACS', 'LG', 'LPRINT', 'LLIST',
             'SCREEN', 'COLOR', 'GET', 'PUT', 'BSAVE', 'BLOAD', 'PLAY', 'PAINT',
             'CIRCLE']

    # keywords grouped by their first character for quick lookup
    by_initial = {key: [] for key in "ABCDEFGHIJKLMNOPQRSTUVWXYZ*/^>=<&@+-"}
    for w in Words:
        by_initial[w[0]].append(w)

    # decode one token byte: control codes pass through as ints,
    # 0x20..0x7f map to Chars (as ints), 0x80 and up map to Words (as strings)
    @staticmethod
    def gettext(c):
        if c < 0x20:
            return c
        elif c < 0x80:
            return ord(Tokens.Chars[c - 0x20])
        else:
            return Tokens.Words[c - 0x80]

    # encode a string as a list of character codes using the Chars table
    @staticmethod
    def chars(text):
        result = []
        for c in text:
            index = Tokens.Chars.find(c)
            if index != -1:
                result.append(0x20 + index)
            else:
                result.append(ord(c))
        return result


def format_token(t):
    if isinstance(t, int):
        return chr(t)
    if isinstance(t, str):
        return t


def process_line(line):
    return str(line[0]) + ' ' + ''.join([format_token(x) for x in line[1:]])
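# Layout of a tokenized .bas record, as implied by the reader below and by
# tokenize2() further down (a best-effort description, not an authoritative
# format spec):
#
#   [next_lo, next_hi]   address of the following record (readbas only uses
#                        it to detect the end of the program)
#   [line_lo, line_hi]   BASIC line number, little-endian
#   token bytes...       0x01..0x1f raw, 0x20..0x7f index Chars, 0x80+ index Words
#   0x00                 line terminator
#
# A terminator followed by a zero link word (three zero bytes in a row)
# marks the end of the file.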
def readbas(path):
    result = []
    with open(path, 'rb') as fi:
        mv = memoryview(fi.read())
        state = State.DUMMY0
        fin = 0
        for i in range(len(mv)):
            c = mv[i]
            if state == State.DUMMY0:
                if c == 0:
                    fin = fin + 1
                state = State.DUMMY1
            elif state == State.DUMMY1:
                if c == 0:
                    fin = fin + 1
                if fin == 3:
                    state = State.END
                else:
                    state = State.LINENUM0
                    line = []
            elif state == State.LINENUM0:
                line.append(c)
                state = State.LINENUM1
            elif state == State.LINENUM1:
                line[0] = line[0] + c * 256
                state = State.TOKENS
            elif state == State.TOKENS:
                if c == 0:
                    fin = 1
                    state = State.DUMMY0
                    result.append(process_line(line))
                elif c > 0 and c <= 31:
                    line.append(c)
                elif c <= 228:
                    line.append(Tokens.gettext(c))
                else:
                    line.append(c)
            elif state == State.END:
                break
    return result


def isnum(c):
    return c >= '0' and c <= '9'


# For a given initial letter in position i, add to the tracking lists the
# keywords that start with this letter.
# Each entry is ["keyword", count=0, position=i].
# If the initial cannot start a keyword and the mode is INITIAL, clear both
# the tracked and the completed word lists.
# If the mode is VERBATIM or QUOTE, do nothing.
def pick_keywords(initial, i, mode, words, complete):
    try:
        if (mode & (Mode.VERBATIM | Mode.QUOTE)) != 0:
            return
        kws = [[x, 0, i] for x in Tokens.by_initial[initial][:]]
        for w in kws:
            if len(w[0]) == 1:
                complete.append(w)  # single-char keywords are complete at once
            else:
                words.append(w)
    except KeyError:
        if mode == Mode.INITIAL:
            words.clear()
            complete.clear()


# Parse the number at the beginning of the line and skip initial spaces
def get_linenumber(s):
    linenum = 0
    text = 0
    for text, c in enumerate(s):
        if isnum(c):
            linenum = linenum * 10 + int(c)
        else:
            if c == ' ':
                pass
            else:
                break
    return linenum, text


# Update word match counts, move fully matched words to complete,
# delete mismatches. See pick_keywords.
def trackwords(c, words, complete):
    todelete = []
    for j, wc in enumerate(words):
        # track keywords
        k = wc[1] + 1
        if wc[0][k] == c:
            wc[1] = k
            if len(wc[0]) - 1 == k:
                complete.append(wc)
                todelete.append(j)
        else:
            todelete.append(j)  # char mismatch, this word is out
    for j in reversed(todelete):
        del words[j]  # purge untracked words


# input: sorted by start position [[tok, x, pos]]
# output: only one token per pos, the longest
def suppress_nonmax(complete):
    pos = -1
    result = []
    for t in complete:
        if t[2] == pos:
            if len(t[0]) > len(result[-1][0]):
                result[-1] = t
        else:
            result.append(t)
            pos = t[2]
    return result


# for overlapping tokens, pick the longest even if it starts later:
#   IFK=ATHEN3
#       AT
#        THEN  <-- winner
def suppress_overlaps(complete):
    current_end = -1
    current_len = -1
    result = []
    for t in complete:
        if t[2] < current_end:
            if len(t[0]) > current_len:
                result[-1] = t
                current_len = len(t[0])
                current_end = t[2] + current_len
        else:
            result.append(t)
            current_len = len(t[0])
            current_end = t[2] + current_len
    return result
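# A worked example of the two filters above (values illustrative): in the
# fragment 'IFK=ATHEN3', matching completes IF at position 0, AT at 4 and
# THEN at 5. suppress_nonmax() keeps the longest candidate per start
# position, and suppress_overlaps() then drops AT because the longer THEN
# overlaps it:
#
#   suppress_overlaps([['IF', 1, 0], ['AT', 1, 4], ['THEN', 3, 5]])
#   -> [['IF', 1, 0], ['THEN', 3, 5]]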
def tokenize2(s, addr):
    tokens = []
    words = []
    complete = []
    linenum, i = get_linenumber(s)
    seq_start = i
    pick_keywords(s[i], i, Mode.INITIAL, words, complete)
    mode = Mode.TOKENIZE
    while i < len(s):
        trackwords(s[i], words, complete)
        breakchar = s[i]
        if breakchar == Tokens.QUOTE:
            mode ^= Mode.QUOTE
        # add keywords that start at the current position to tracking
        pick_keywords(s[i], i, mode, words, complete)
        # all tracked words ended, or end of line
        if len(words) == 0 or i + 1 == len(s):
            words = []
            if len(complete) > 0:
                # make sure that the tokens are in order of occurrence
                complete.sort(key=lambda x: x[2], reverse=False)
                complete = suppress_nonmax(complete)    # INPUT vs INP..
                complete = suppress_overlaps(complete)  # THEN over AT in ATHEN
                # flush dangling character tokens
                tokens = tokens + Tokens.chars(s[seq_start:complete[0][2]])
                for j, b in enumerate(complete):
                    if j > 0 and b[2] != i:  # tokens overlap, only keep 1st
                        break
                    tokens.append(Tokens.Words.index(b[0]) + 0x80)
                    i = b[2] + len(b[0])
                    seq_start = i
                    if b[0] in ['DATA', 'REM']:
                        mode = Mode.VERBATIM
                        break  # mode switch, cancel following tokens
                if breakchar != Tokens.QUOTE:
                    i = i - 1
                complete = []
            else:
                pass
        i = i + 1
    tokens = tokens + Tokens.chars(s[seq_start:]) + [0]
    recsize = len(tokens) + 4
    addr += recsize
    tokens = [addr & 255, addr >> 8] + [linenum & 255, (linenum >> 8) & 255] + tokens
    return tokens, addr


def enbas(path):
    result = []
    for enc in ENCODINGS:
        # start over on each attempt so a failed decode leaves no partial output
        result = []
        addr = 0x4301
        try:
            print(f'Trying encoding {enc}...')
            with open(path, 'r', encoding=enc) as fi:
                for text in fi:
                    print("Input=[" + text.strip("\n") + "]")
                    tokens, addr = tokenize2(text.strip("\n"), addr)
                    result = result + tokens
            break
        except UnicodeDecodeError:
            print('Failed...')
    result.append(0)
    result.append(0)
    # add padding for perfect file match
    padding = [0] * (256 - (len(result) % 256))
    return bytearray(result + padding)


#print(suppress_overlaps([["AT",0,3],["TO",0,4]]))
#print(suppress_overlaps([["AT",0,3],["THEN",0,4]]))
#print(suppress_nonmax([["INP",0,3],["INPUT",0,3],["PUT",0,2]]))
#print([chr(x) for x in tokenize2('5 FORI=ATOB', 0x4301)[0]])  # AT wins
#print([chr(x) for x in tokenize2('5 IFK=ATHEN3', 0x4301)[0]])  # THEN wins
#print([chr(x) for x in tokenize2('5 INPUT"ABC",D', 0x4301)[0]])
#print([chr(x) for x in tokenize2('5 REM*LOL*', 0x4301)[0]])
#print([chr(x) for x in tokenize2('5 IFN=1', 0x4301)[0]])
#print([chr(x) for x in tokenize2('5 PRINT"ABC";K:CUR0,24', 0x4301)[0]])
#print([chr(x) for x in tokenize2('5 PLAY "A","B","C"', 0x4301)[0]])
#print([chr(x) for x in tokenize2('5 COLOR10,128,0', 0x4301)[0]])
#print([chr(x) for x in tokenize2('18 IFZ=0THENC(I,J)=14:C0=C0+1:IFC0>3THEN 17', 0x4301)[0]])
#print([chr(x) for x in tokenize2('1 ""THEN', 0x4301)[0]])  # evil yet unreal
#print([(x) for x in tokenize2('1 IFA$="+"THEN1', 0x4301)[0]])
#print([chr(x) for x in tokenize2('1 PRINT"ABC";3', 0x4301)[0]])
#print(tokenize2('40 A$=INKEY$',0x4301))
#print(tokenize2("1 THENPLOT",0x4301))
#print(tokenize2("185 IFX<0ANDY<0THENPLOTABS(X),ABS(Y),1:GOTO 187",0x4301))
#print(tokenize2("1 DATA -123",0x4301))
#print("t2=", tokenize2("1 CLS",0x4301))
#print("t2=", tokenize2("10 RESTORE6",0x4300))
#print("t2=", tokenize2("10 RESTOR6",0x4300))
#print("t2=", tokenize2("10 QTOA",0x4300))
#print("t2=", tokenize2("10 FORI=ATOB",0x4300))
#print("t2=", tokenize2("10 FORJ=0TO5",0x4300))
#print(get_linenumber("1234 PRINTA"))
#exit()


def usagi():
    n = sys.argv[0].split(os.path.sep)[-1]
    usagi = [
        "Vector-06c BASIC->ASC and ASC->BASIC converter by Viacheslav Slavinsky, svofski@gmail.com",
        "Usage:",
        f"  {n} source.bas debas.asc rebas.bas   complete round trip for testing",
        f"  {n} source.bas debas.asc             convert a tokenized BASIC file into plain text",
        f"  {n} source.asc rebas.bas             tokenize plain text and produce a .bas file"]
    print("\n".join(usagi))


if len(sys.argv) == 1:
    usagi()
    sys.exit(1)


def filenames():
    ext1 = os.path.splitext(sys.argv[1])
    ext2 = (ext1[0], False)
    try:
        ext2 = os.path.splitext(sys.argv[2])
    except IndexError:
        pass
    if ext1[1].lower() == ".bas":
        output_ext = ".asc"
    else:
        output_ext = ".bas"
    fin = ext1[0] + ext1[1]
    fon = ext2[0] + (output_ext if ext2[1] == 0 else ext2[1])
    return fin, fon, ext1[1].lower()
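# filenames() derives the output name from the input extension when only one
# argument is given; e.g. (illustrative) 'game.bas' yields
# ('game.bas', 'game.asc', '.bas'). An explicit second argument with its own
# extension overrides the generated output name.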
if len(sys.argv) == 4:
    # test mode: bas -> asc -> bas round trip
    debas = readbas(sys.argv[1])
    with open(sys.argv[2], 'w', encoding=ENCODING) as fo:
        for line in debas:
            fo.write(line + '\n')
    bas = enbas(sys.argv[2])
    with open(sys.argv[3], 'wb') as fob:
        fob.write(bas)
elif len(sys.argv) in [2, 3]:
    fin, fon, input_ext = filenames()
    if input_ext == ".bas":
        print("Converting BAS->ASC: ", fin, fon)
        debas = readbas(fin)
        with open(fon, 'w', encoding=ENCODING) as fo:
            for line in debas:
                fo.write(line + '\n')
    elif input_ext in [".asc", ".txt"]:
        print("Converting ASC->BAS: ", fin, fon)
        bas = enbas(fin)
        with open(fon, 'wb') as fob:
            fob.write(bas)
    else:
        usagi()
        sys.exit(1)
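# Because enbas() zero-pads its output to a 256-byte boundary ("perfect file
# match"), a full round trip can be compared byte-for-byte, e.g. (script and
# file names illustrative):
#
#   python3 debas.py GAME.BAS game.asc game2.bas && cmp GAME.BAS game2.bas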