discworldnoiraudio/extract.py
2024-06-09 16:18:34 -04:00

96 lines
3.4 KiB
Python

# file format notes:
# https://web.archive.org/web/20130904192356/http://rewiki.regengedanken.de/wiki/.TXT_(Discworld)
# https://web.archive.org/web/20130903115000/http://rewiki.regengedanken.de/wiki/.SCN
# https://web.archive.org/web/20080331031010/http://rewiki.regengedanken.de/wiki/.IDX
# https://web.archive.org/web/20160319050238/http://rewiki.regengedanken.de/wiki/.SMP
import os
import pprint
from struct import unpack, calcsize
def read(f, format):
    """Unpack one `format`-shaped record from the current position of `f`.

    Requests ``calcsize(format)`` bytes from the file object `f` and returns
    the tuple produced by ``unpack``. If fewer bytes are available (for
    example at end of file), ``unpack`` raises ``struct.error``.
    """
    size = calcsize(format)
    return unpack(format, f.read(size))
def dialogue_chunk(f, size):
    """Decode one dialogue chunk from `f` into ``["dialogue", text, ...]``.

    The chunk body is read as 64 length-prefixed strings, starting at the
    current position of `f`.

    Args:
        f: binary file object positioned at the first length prefix.
        size: chunk size handed over by the chunk dispatcher. Unused here,
            because every entry carries its own length.

    Returns:
        A list whose first element is the tag ``"dialogue"``, followed by the
        64 decoded strings (empty strings included).

    Raises:
        ValueError: if a two-byte length prefix has any of its low four bits
            set. This used to be an ``assert``, which disappears under
            ``python -O`` and would let corrupt data be decoded silently.
        struct.error: if the file ends where a length byte is expected.
    """
    result = ["dialogue"]
    for _ in range(64):
        # looks like there is a variable-length encoding for dialogue length, not documented on the wiki.
        # if the high-bit is set, the length is encoded in two bytes.
        # most of the time, this ends up looking like `80 CF` - a one-byte value encoded in two bytes.
        # AFAICT there is only one line in english.txt that is longer than 255 bytes - it looks like `90 02`
        # and refers to a piece of text 0x102 bytes long.
        # Therefore, if the length byte is >= 0x80, we assume the bottom four bits are 0, the top bit is ignored,
        # and the 3 bits in between are used as the high portion of an 11-bit unsigned integer, with the lower 8 bits
        # following.
        (length,) = unpack("B", f.read(1))
        if length >= 0x80:
            if length & 0x0f:
                raise ValueError(
                    f"unsupported dialogue length prefix 0x{length:02x}: "
                    "low four bits are expected to be zero"
                )
            (low_byte,) = unpack("B", f.read(1))
            length = ((length & 0x7f) << 4) | low_byte
        entry = f.read(length)
        # the dialogue in english.txt appears to be in latin-1 (the accented 'e' in "Café Ankh" is encoded as 0xe9)
        result.append(entry.decode("latin-1"))
    return result
# Dispatch table: chunk type id -> decoder, called as decoder(f, chunksize).
# Chunk types without an entry are kept as raw "unknown" chunks by decode_chunks.
chunk_decoders = {
    0x0001: dialogue_chunk,
}
def decode_chunks(f):
    """Walk the chunk list stored in the binary file `f` and decode each chunk.

    Every chunk starts with an 8-byte little-endian header: a 16-bit chunk
    type, a 16-bit magic value (0x3334) and the 32-bit absolute file offset of
    the next chunk. A next-offset of 0 ends the walk.

    Args:
        f: a real file opened in binary mode (``f.fileno()`` is used to find
            the file size), positioned at the first chunk header.

    Returns:
        A list with one item per chunk: whatever the registered decoder in
        ``chunk_decoders`` returned, or ``["unknown", chunktype, raw_bytes]``
        for chunk types without a decoder.

    Raises:
        ValueError: if a header does not carry the 0x3334 magic. This used to
            be an ``assert``, which is stripped under ``python -O``.
        struct.error: if the file ends in the middle of a chunk header.
    """
    chunks = []
    filesize = os.fstat(f.fileno()).st_size
    while True:
        (chunktype, magic, next_offset) = read(f, "<HHI")
        if magic != 0x3334:
            raise ValueError(f"bad chunk magic 0x{magic:04x} (expected 0x3334)")
        if next_offset == 0:
            break
        # A corrupt header may point backwards. Clamp the size to 0, because a
        # negative size makes f.read() consume everything up to end of file.
        chunksize = max(next_offset - f.tell(), 0)
        decoder = chunk_decoders.get(chunktype)
        if decoder:
            chunks.append(decoder(f, chunksize))
        else:
            chunks.append(["unknown", chunktype, f.read(chunksize)])
        # Stop if the decoder ran past the announced next chunk (or the offset
        # points backwards), or if the next chunk would start at/after EOF.
        if next_offset < f.tell() or next_offset >= filesize:
            break
        f.seek(next_offset, os.SEEK_SET)
    return chunks
def read_dialogue(filename):
    """Open `filename` in binary mode and return its decoded chunk list."""
    with open(filename, "rb") as stream:
        decoded = decode_chunks(stream)
    return decoded
def read_speechindex(filename):
    """Read `filename` as a flat array of little-endian unsigned 32-bit ints.

    Args:
        filename: path of the index file.

    Returns:
        A list with one int per complete 4-byte group in the file, in file
        order. Up to three trailing bytes that do not form a complete value
        are ignored.

    Raises:
        OSError: if the file cannot be opened or read. Read errors used to be
            swallowed by a bare ``except`` and returned a truncated index.
    """
    with open(filename, "rb") as f:
        data = f.read()
    count = len(data) // 4
    return list(unpack(f"<{count}I", data[:count * 4]))
def link_speech(chunks, index):
    """Pair every dialogue line with the index entry at the same position.

    Lines are taken, in order, from all chunks tagged ``"dialogue"``; the
    n-th such line is matched with ``index[n]``. A pair is dropped only when
    the line is empty AND its index entry is 0.

    Returns:
        A list of ``(line, index_entry)`` tuples.

    Raises:
        IndexError: if `index` has fewer entries than there are dialogue lines.
    """
    dialogue_lines = (
        text
        for chunk in chunks
        if chunk[0] == "dialogue"
        for text in chunk[1:]
    )
    linked = []
    for position, text in enumerate(dialogue_lines):
        entry = index[position]
        if text == '' and entry == 0:
            # Empty line with no index entry: nothing to link.
            continue
        linked.append((text, entry))
    return linked
def extract_speech(filename, filename_out, offset):
    """Copy one length-prefixed record from `filename` into `filename_out`.

    At byte `offset` of the input, two little-endian unsigned 32-bit values
    are read: the first is ignored (its meaning is unknown), the second is
    the number of payload bytes that follow. Those payload bytes are written
    to `filename_out`, which is created or overwritten.
    """
    with open(filename, "rb") as source, open(filename_out, "wb") as target:
        source.seek(offset, os.SEEK_SET)
        header = read(source, "<II")
        payload_length = header[1]
        payload = source.read(payload_length)
        target.write(payload)
# Script body: decode the dialogue text, pair each line with its speech index
# entry and pretty-print the resulting list.
pprint.pp(
    link_speech(
        chunks=read_dialogue("english.txt"),
        index=read_speechindex("english.idx"),
    )
)
# Example of pulling a single sample out of the speech archive:
# extract_speech("english.smp", "rightyho.mp2", 115990421)