# file format notes:
# https://web.archive.org/web/20130904192356/http://rewiki.regengedanken.de/wiki/.TXT_(Discworld)
# https://web.archive.org/web/20130903115000/http://rewiki.regengedanken.de/wiki/.SCN
# https://web.archive.org/web/20080331031010/http://rewiki.regengedanken.de/wiki/.IDX
# https://web.archive.org/web/20160319050238/http://rewiki.regengedanken.de/wiki/.SMP

import os
import pprint
from struct import unpack, calcsize

# Number of length-prefixed entries dialogue_chunk() reads from one chunk.
ENTRIES_PER_DIALOGUE_CHUNK = 64


def read(f, format):
    """Read some bytes from the file `f` and unpack them according to `format`.

    Exactly ``calcsize(format)`` bytes are requested from `f`.  The return
    value is the tuple produced by ``struct.unpack``.  If `f` yields fewer
    bytes than requested (e.g. at end of file), ``struct.unpack`` raises
    ``struct.error``; that exception is deliberately left to propagate.
    """
    buf = f.read(calcsize(format))
    return unpack(format, buf)


def dialogue_chunk(f, size):
    """Decode one dialogue chunk from the binary file `f`.

    Reads ENTRIES_PER_DIALOGUE_CHUNK length-prefixed entries starting at the
    current position of `f` and decodes each one as latin-1.

    `size` is accepted so that every chunk decoder shares one signature; this
    decoder does not use it.

    Returns a list whose first element is the tag ``"dialogue"``, followed by
    the decoded entries in file order.

    Raises:
        ValueError: a two-byte length prefix has any of its low four bits set,
            which this decoder does not know how to interpret.
        EOFError: the file ends in the middle of an entry's text.
        struct.error: the file ends where a length byte was expected.
    """
    result = ["dialogue"]
    for _ in range(ENTRIES_PER_DIALOGUE_CHUNK):
        # looks like there is a variable-length encoding for dialogue length,
        # not documented on the wiki.
        # if the high bit is set, the length is encoded in two bytes.
        # most of the time, this ends up looking like `80 CF` - a one-byte
        # value encoded in two bytes.
        # AFAICT there is only one line in english.txt that is longer than
        # 255 bytes - it looks like `90 02` and refers to a piece of text
        # 0x102 bytes long.
        # Therefore, if the length byte is >= 0x80, we assume the bottom four
        # bits are 0, the top bit is ignored, and the 3 bits in between are
        # used as the high portion of an 11-bit unsigned integer, with the
        # lower 8 bits following.
        length = read(f, "B")[0]
        if length >= 0x80:
            # An explicit raise (rather than `assert`) keeps this check alive
            # under `python -O`; without it a bad prefix would silently
            # decode to a garbage length.
            if length & 0x0F:
                raise ValueError(
                    "unsupported dialogue length prefix 0x%02x" % length
                )
            length = ((length & 0x7F) << 4) | read(f, "B")[0]
        entry = f.read(length)
        if len(entry) != length:
            # A short read means the file is truncated; returning the partial
            # text would hide the corruption.
            raise EOFError(
                "dialogue entry truncated: expected %d bytes, got %d"
                % (length, len(entry))
            )
        # the dialogue in english.txt appears to be in latin-1 (the accented
        # 'e' in "Café Ankh" is encoded as 0xe9)
        result.append(entry.decode("latin-1"))
    return result


# Maps a chunk type id to the function that decodes that chunk's payload.
chunk_decoders = {
    0x0001: dialogue_chunk,
}


# NOTE(review): decode_chunks() is corrupted in this copy of the file.
# Everything between `read(f, "` and `= filesize: break` is missing (it looks
# as if text from a `<` up to a following `>` was stripped).  The surviving
# tokens are preserved below, with the layout inferred; restore the function
# from an intact copy instead of guessing at the lost format string and the
# chunk-dispatch logic.
#
# def decode_chunks(f):
#     chunks = []
#     filesize = os.fstat(f.fileno()).st_size
#     while True:
#         (chunktype, magic, next_offset) = read(f, "= filesize: break
#         f.seek(next_offset, os.SEEK_SET)
#     return chunks


def read_dialogue(filename):
    """Open `filename` in binary mode and return decode_chunks() of its contents."""
    with open(filename, "rb") as f:
        return decode_chunks(f)


# NOTE(review): read_speechindex() is cut off in this copy of the file - the
# text ends inside the format string passed to read().  Surviving fragment
# (layout inferred); restore the remainder from an intact copy.
#
# def read_speechindex(filename):
#     index = []
#     with open(filename, "rb") as f:
#         while True:
#             try:
#                 index.append(read(f, "