commit 90ec2d25b529a20808ebfbb228e366b60615d163
Author: Jeremy Penner <jeremy@sporktania.com>
Date:   Sun Jun 9 16:18:34 2024 -0400

    Initial commit

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..8a4478d
--- /dev/null
+++ b/README.md
@@ -0,0 +1,50 @@
+# Discworld Noir audio extraction script
+
+This is a quick hack I threw together to extract the audio for the "Righty ho" easter egg from Discworld Noir.
+You could extend it to extract other audio, if you like. It has no interface; if you want the script to do 
+something different than it currently does, scroll to the bottom of the file and change it to call different functions.
+
+This code should work for other language files as well, and will probably work for the earlier Discworld games,
+though I haven't tested any of that. Still, if you've always wanted a robot that constantly says "That doesn't work"
+in Eric Idle's voice, you could probably tweak this script to achieve that dream.
+
+There are three important files that it needs:
+
+* `english.txt` - contains all the text in the script (including notes for translators and voice actors!)
+* `english.smp` - contains all of the voice lines and sound effects in compressed .mp2 format
+* `english.idx` - maps from lines of text in the `.txt` file to offsets in the `.smp` file
+
+`english.txt` and `english.idx` are included in this repo because it's kind of fun to poke around in them, 
+and they're small. The script in its current form parses all of the text and prints out a list of lines, along
+with the offsets into `english.smp` that you'd need to extract the sound for those lines. You'll have to 
+find your own copy of `english.smp` and modify the script if you want to try to actually rip the audio.
+
+# Fun voice acting directions hidden in `english.txt`
+
+* in an over-the-top nautical fashion
+* Columbo-style acting dumb
+* lapsing into rather bad gangster-speak
+* confusion melting into insane realisation
+* with non-sexual innuendo
+* Hard boiled monologue - after being turned into a toad
+* Important: to speak Gable's part you must keep your mouth rigidly open - possibly by putting a closed fist in your mouth. Nothing you will say will be intelligible: don't worry about it. That's just the way gargoyles speak. The subtitles will convey meaning.
+* dogmatically, if you'll excuse the pun
+* last bit as a Shakespearean Bugs Bunny
+
+# License
+
+```
+            DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
+                    Version 2, December 2004
+
+ Copyright (C) 2004 Sam Hocevar <sam@hocevar.net>
+
+ Everyone is permitted to copy and distribute verbatim or modified
+ copies of this license document, and changing it is allowed as long
+ as the name is changed.
+
+            DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. You just DO WHAT THE FUCK YOU WANT TO.
+```
\ No newline at end of file
diff --git a/english.idx b/english.idx
new file mode 100644
index 0000000..f21fc49
Binary files /dev/null and b/english.idx differ
diff --git a/english.txt b/english.txt
new file mode 100644
index 0000000..0e5b4ae
Binary files /dev/null and b/english.txt differ
diff --git a/extract.py b/extract.py
new file mode 100644
index 0000000..403e7d1
--- /dev/null
+++ b/extract.py
@@ -0,0 +1,95 @@
+# file format notes:
+# https://web.archive.org/web/20130904192356/http://rewiki.regengedanken.de/wiki/.TXT_(Discworld)
+# https://web.archive.org/web/20130903115000/http://rewiki.regengedanken.de/wiki/.SCN
+# https://web.archive.org/web/20080331031010/http://rewiki.regengedanken.de/wiki/.IDX
+# https://web.archive.org/web/20160319050238/http://rewiki.regengedanken.de/wiki/.SMP
+
+import os
+import pprint
+from struct import unpack, calcsize
+
+def read(f, format):
+    "Unpack the next `calcsize(format)` bytes of file `f` according to `format` and return the resulting tuple."
+    # A short read (e.g. at EOF) is not padded: unpack raises struct.error instead.
+    return unpack(format, f.read(calcsize(format)))
+
+def dialogue_chunk(f, size):
+    """Decode a dialogue chunk: 64 length-prefixed latin-1 strings read from `f` (`size` is unused).
+
+    The length prefix is a variable-length encoding that is not documented on the wiki. A first
+    byte below 0x80 is the length itself. Otherwise the length takes two bytes: the top bit is
+    ignored, the bottom four bits are assumed to be 0, and the 3 bits in between are the high
+    portion of an 11-bit unsigned integer whose lower 8 bits follow. Most of the time this looks
+    like `80 CF`; AFAICT only one line in english.txt exceeds 255 bytes (`90 02`, 0x102 bytes).
+    The text appears to be latin-1 (the accented 'e' in "Café Ankh" is encoded as 0xe9).
+    """
+    entries = ["dialogue"]
+    for _ in range(64):
+        first = read(f, "B")[0]
+        if first < 0x80:
+            nbytes = first
+        else:
+            # the low nibble is assumed to be clear; fail loudly if that assumption ever breaks
+            assert (first & 0x0f) == 0
+            nbytes = ((first & 0x7f) << 4) | read(f, "B")[0]
+        entries.append(f.read(nbytes).decode("latin-1"))
+    return entries
+
+chunk_decoders = {  # chunk type -> decoder, called by decode_chunks as decoder(file, payload size)
+    0x0001: dialogue_chunk
+}
+
+def decode_chunks(f):
+    "Walk the chunk list of the open binary file `f` and return the decoded chunks in file order."
+    decoded = []
+    end_of_file = os.fstat(f.fileno()).st_size
+    while True:
+        # chunk header: uint16 type, uint16 magic (must be 0x3334), uint32 absolute offset of the next chunk
+        kind, magic, next_offset = read(f, "<HHI")
+        assert magic == 0x3334
+        if next_offset == 0:  # a zero link ends the list; that chunk's payload is not decoded
+            break
+        payload_size = next_offset - f.tell()
+        handler = chunk_decoders.get(kind)
+        decoded.append(handler(f, payload_size) if handler else ["unknown", kind, f.read(payload_size)])
+        # stop rather than follow a link that points backwards or at/past the end of the file
+        if not (f.tell() <= next_offset < end_of_file):
+            break
+        f.seek(next_offset, os.SEEK_SET)
+    return decoded
+
+def read_dialogue(filename):
+    with open(filename, "rb") as f:  # binary mode: decode_chunks unpacks raw chunk headers
+        return decode_chunks(f)  # list of decoded chunks, in file order
+
+def read_speechindex(filename):
+    "Return the little-endian uint32 values stored back to back in the index file `filename`, as a list."
+    index = []
+    with open(filename, "rb") as f:
+        # Stop at EOF explicitly instead of with a bare `except:`, which also hid real errors (and Ctrl-C).
+        # A trailing partial word (fewer than 4 bytes) is still dropped.
+        while len(word := f.read(4)) == 4:
+            index.append(unpack("<I", word)[0])
+    return index
+
+def link_speech(chunks, index):
+    "Pair every dialogue line, in order, with its entry in `index`; empty lines whose entry is 0 are dropped."
+    lines = [text for chunk in chunks if chunk[0] == "dialogue" for text in chunk[1:]]
+    paired = []
+    for slot, text in enumerate(lines):
+        offset = index[slot]
+        # keep the entry when there is text, or when a non-zero offset is attached to it
+        if text != '' or offset != 0:
+            paired.append((text, offset))
+    return paired
+
+def extract_speech(filename, filename_out, offset):
+    "Copy the sample at byte `offset` of `filename` to `filename_out` (header: unknown uint32, then uint32 byte length)."
+    with open(filename, "rb") as src, open(filename_out, "wb") as dst:
+        src.seek(offset, os.SEEK_SET)
+        _, nbytes = read(src, "<II")
+        dst.write(src.read(nbytes))
+
+pprint.pp(link_speech(read_dialogue("english.txt"), read_speechindex("english.idx")))
+# ^ prints (text, offset) pairs; pass one of those offsets to extract_speech to copy that sample out, e.g.:
+# extract_speech("english.smp", "rightyho.mp2", 115990421)
diff --git a/rightyho.mp2 b/rightyho.mp2
new file mode 100644
index 0000000..4ef2c9c
Binary files /dev/null and b/rightyho.mp2 differ