# discworldnoiraudio/extract.py

# file format notes:
# https://web.archive.org/web/20130904192356/http://rewiki.regengedanken.de/wiki/.TXT_(Discworld)
# https://web.archive.org/web/20130903115000/http://rewiki.regengedanken.de/wiki/.SCN
# https://web.archive.org/web/20080331031010/http://rewiki.regengedanken.de/wiki/.IDX
# https://web.archive.org/web/20160319050238/http://rewiki.regengedanken.de/wiki/.SMP

import os
import pprint
from struct import unpack, calcsize

def read(f, format):
    """Read one packed record from `f` and return its fields as a tuple.

    `format` is a `struct` format string.  Exactly `calcsize(format)` bytes
    are requested from `f` and passed to `unpack`, which raises
    `struct.error` when fewer bytes come back (for example at end of file).
    """
    record_size = calcsize(format)
    raw = f.read(record_size)
    return unpack(format, raw)

def dialogue_chunk(f, size):
    """Decode one dialogue chunk from `f` into a list of strings.

    Reads 64 length-prefixed entries starting at the current position of `f`
    and returns `["dialogue", text_0, ..., text_63]`.

    `size` is accepted because chunk decoders are invoked as
    `decoder(f, size)`; it is not used here, since every entry carries its
    own length.

    Raises `ValueError` when a two-byte length prefix has a non-zero low
    nibble, a form this decoder does not know how to interpret.
    """
    entries_per_chunk = 64
    result = ["dialogue"]
    for _ in range(entries_per_chunk):
        # There appears to be a variable-length encoding for the entry length that is not documented on the wiki.
        # If the high bit is set, the length is encoded in two bytes.
        # Most of the time this looks like `80 CF` - a one-byte value encoded in two bytes.
        # AFAICT only one line in english.txt is longer than 255 bytes - it looks like `90 02`
        # and refers to a piece of text 0x102 bytes long.
        # Therefore, if the length byte is >= 0x80, we assume the bottom four bits are 0, the top bit is ignored,
        # and the 3 bits in between are the high portion of an 11-bit unsigned integer, with the lower 8 bits
        # following.
        length = read(f, "B")[0]
        if length >= 0x80:
            # Raise explicitly rather than `assert`: asserts are stripped under
            # `python -O`, which would let an unexpected prefix be mis-decoded.
            if length & 0x0f:
                raise ValueError(
                    f"unsupported dialogue length prefix 0x{length:02x}"
                )
            length = ((length & 0x7f) << 4) | read(f, "B")[0]
        entry = f.read(length)

        # The dialogue in english.txt appears to be latin-1 (the accented 'e' in "Café Ankh" is encoded as 0xe9).
        result.append(entry.decode("latin-1"))

    return result

# Dispatch table: chunk type id -> function that decodes that chunk's body.
# Types without an entry here have no dedicated decoder.
chunk_decoders = {
    0x0001: dialogue_chunk,
}

def decode_chunks(f):
    """Walk the chunk list in the open binary file `f` and decode each chunk.

    Every chunk starts with an 8-byte little-endian header: a 16-bit chunk
    type, a 16-bit magic value that must equal 0x3334, and a 32-bit absolute
    file offset of the next chunk.  `f` must be backed by a real file
    descriptor, because the file size is obtained through `os.fstat`.

    Returns a list with one item per chunk: the result of the decoder
    registered in `chunk_decoders`, or `["unknown", chunktype, raw_bytes]`
    when no decoder is registered for the chunk type.

    Walking stops at a header whose next offset is 0 (that chunk is not
    decoded), or after a chunk whose next offset lies before the current
    position or at/after the end of the file.

    Raises `ValueError` if a chunk header does not carry the expected magic.
    """
    chunks = []
    filesize = os.fstat(f.fileno()).st_size
    while True:
        chunktype, magic, next_offset = read(f, "<HHI")
        # Raise explicitly rather than `assert`: asserts are stripped under
        # `python -O`, and a bad magic means we are no longer on a chunk header.
        if magic != 0x3334:
            raise ValueError(
                f"bad chunk magic 0x{magic:04x} (expected 0x3334)"
            )
        if next_offset == 0:
            break

        # The body runs from just after the header to the next chunk.  Clamp
        # at zero: a next offset that points backwards would otherwise give a
        # negative size, and read() treats a negative size as "read to EOF".
        chunksize = max(next_offset - f.tell(), 0)
        decoder = chunk_decoders.get(chunktype)
        if decoder:
            chunks.append(decoder(f, chunksize))
        else:
            chunks.append(["unknown", chunktype, f.read(chunksize)])

        # Stop if the next offset would move backwards or past the file.
        if next_offset < f.tell() or next_offset >= filesize:
            break
        f.seek(next_offset, os.SEEK_SET)
    return chunks

def read_dialogue(filename):
    """Open `filename` in binary mode and return its decoded chunk list."""
    with open(filename, "rb") as stream:
        decoded = decode_chunks(stream)
    return decoded

def read_speechindex(filename):
    """Load a speech index file as a list of unsigned 32-bit integers.

    The file is treated as a flat array of little-endian 32-bit values, which
    are returned in file order.  Trailing bytes that do not make up a complete
    value are ignored.  Errors from opening or reading the file propagate to
    the caller.
    """
    with open(filename, "rb") as f:
        data = f.read()

    # Decode every complete entry in a single unpack call.  Computing the
    # entry count up front avoids relying on an exception to detect the end
    # of the file, so real I/O errors are no longer silently swallowed.
    entry_size = calcsize("<I")
    count = len(data) // entry_size
    return list(unpack(f"<{count}I", data[:count * entry_size]))

def link_speech(chunks, index):
    """Pair dialogue lines with their entries in the speech index.

    Lines are taken, in order, from every chunk whose first element is
    `"dialogue"`; the n-th such line is matched with `index[n]`.  A pair is
    dropped only when the line is empty and its index entry is 0.

    Returns a list of `(line, index_entry)` tuples.  Raises `IndexError` when
    `index` has fewer entries than there are dialogue lines.
    """
    dialogue_lines = (
        text
        for chunk in chunks
        if chunk[0] == "dialogue"
        for text in chunk[1:]
    )

    linked = []
    for position, text in enumerate(dialogue_lines):
        entry = index[position]
        if text == '' and entry == 0:
            continue
        linked.append((text, entry))
    return linked

def extract_speech(filename, filename_out, offset):
    """Copy one length-prefixed record from `filename` into `filename_out`.

    Seeks to the absolute byte `offset` in `filename` and reads two
    little-endian 32-bit header fields: the first is ignored (its meaning is
    unknown), the second is the payload length.  Up to that many bytes
    following the header are written to `filename_out`, which is created or
    truncated.
    """
    with open(filename, "rb") as src, open(filename_out, "wb") as dst:
        src.seek(offset, os.SEEK_SET)
        _unknown, payload_length = read(src, "<II")
        payload = src.read(payload_length)
        dst.write(payload)

# Script body: decode english.txt and english.idx (paths are relative to the
# current working directory), link the dialogue lines to their index entries
# and pretty-print the resulting list.  Note that this statement is at module
# level, so it runs on import as well as when the file is executed directly.
pprint.pp(link_speech(read_dialogue("english.txt"), read_speechindex("english.idx")))

# Example of pulling a single record out of english.smp at a given byte offset:
# extract_speech("english.smp", "rightyho.mp2", 115990421)