commit 90ec2d25b529a20808ebfbb228e366b60615d163 Author: Jeremy Penner Date: Sun Jun 9 16:18:34 2024 -0400 Initial commit diff --git a/README.md b/README.md new file mode 100644 index 0000000..8a4478d --- /dev/null +++ b/README.md @@ -0,0 +1,50 @@ +# Discworld Noir audio extraction script + +This is a quick hack I threw together to extract the audio for the "Righty ho" easter egg from Discworld Noir. +You could extend it to extract other audio, if you like. It has no interface; if you want the script to do +something different from what it currently does, scroll to the bottom of `extract.py` and change it to call different functions. + +This code should work for other language files as well, and probably will work for the earlier Discworld games, +but I haven't tested any of that. But if you've always wanted a robot that constantly says "That doesn't work" +in Eric Idle's voice, you could probably tweak this script to achieve that dream. + +There are three important files that it needs: + +* `english.txt` - contains all the text in the game's script (including notes for translators and voice actors!) +* `english.smp` - contains all of the voice lines and sound effects in compressed .mp2 format +* `english.idx` - maps from lines of text in the `.txt` file to offsets in the `.smp` file + +`english.txt` and `english.idx` are included in this repo because it's kind of fun to poke around in them, +and they're small. The script in its current form parses all of the text and prints out a list of lines, along +with the offsets into `english.smp` that you'd need to extract the sound for those lines. You'll have to +find your own copy of `english.smp` and modify the script if you want to try to actually rip the audio. 
+ +# Fun voice acting directions hidden in `english.txt` + +* in an over-the-top nautical fashion +* Columbo-style acting dumb +* lapsing into rather bad gangster-speak +* confusion melting into insane realisation +* with non-sexual innuendo +* Hard boiled monologue - after being turned into a toad +* Important: to speak Gable's part you must keep your mouth rigidly open - possibly by putting a closed fist in your mouth. Nothing you will say will be intelligible: don't worry about it. That's just the way gargoyles speak. The subtitles will convey meaning. +* dogmatically, if you'll excuse the pun +* last bit as a Shakespearean Bugs Bunny + +# License + +``` + DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE + Version 2, December 2004 + + Copyright (C) 2004 Sam Hocevar + + Everyone is permitted to copy and distribute verbatim or modified + copies of this license document, and changing it is allowed as long + as the name is changed. + + DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. You just DO WHAT THE FUCK YOU WANT TO. 
+``` \ No newline at end of file diff --git a/english.idx b/english.idx new file mode 100644 index 0000000..f21fc49 Binary files /dev/null and b/english.idx differ diff --git a/english.txt b/english.txt new file mode 100644 index 0000000..0e5b4ae Binary files /dev/null and b/english.txt differ diff --git a/extract.py b/extract.py new file mode 100644 index 0000000..403e7d1 --- /dev/null +++ b/extract.py @@ -0,0 +1,95 @@ +# file format notes: +# https://web.archive.org/web/20130904192356/http://rewiki.regengedanken.de/wiki/.TXT_(Discworld) +# https://web.archive.org/web/20130903115000/http://rewiki.regengedanken.de/wiki/.SCN +# https://web.archive.org/web/20080331031010/http://rewiki.regengedanken.de/wiki/.IDX +# https://web.archive.org/web/20160319050238/http://rewiki.regengedanken.de/wiki/.SMP + +import os +import pprint +from struct import unpack, calcsize + +def read(f, format): + "Read some bytes from the file `f` and unpack them according to `format`." + buf = f.read(calcsize(format)) + return unpack(format, buf) + +def dialogue_chunk(f, size): + result = ["dialogue"] + for i in range(64): + # looks like there is a variable-length encoding for dialogue length, not documented on the wiki. + # if the high-bit is set, the length is encoded in two bytes. + # most of the time, this ends up looking like `80 CF` - a one-byte value encoded in two bytes. + # AFAICT there is only one line in english.txt that is longer than 255 bytes - it looks like `90 02` + # and refers to a piece of text 0x102 bytes long. + # Therefore, if the length byte is >= 0x80, we assume the bottom four bits are 0, the top bit is ignored, + # and the 3 bits in between are used as the high portion of an 11-bit unsigned integer, with the lower 8 bits + # following. 
+ length = read(f, "B")[0] + if length >= 0x80: + assert((length & 0x0f) == 0) + length = ((length & 0x7f) << 4) | read(f, "B")[0] + entry = f.read(length) + + # the dialogue in english.txt appears to be in latin-1 (the accented 'e' in "Café Ankh" is encoded as 0xe9) + result.append(entry.decode("latin-1")) + + return result + +chunk_decoders = { + 0x0001: dialogue_chunk +} + +def decode_chunks(f): + chunks = [] + filesize = os.fstat(f.fileno()).st_size + while True: + (chunktype, magic, next_offset) = read(f, "= filesize: + break + f.seek(next_offset, os.SEEK_SET) + return chunks + +def read_dialogue(filename): + with open(filename, "rb") as f: + return decode_chunks(f) + +def read_speechindex(filename): + index = [] + with open(filename, "rb") as f: + while True: + try: + index.append(read(f, "