m3te/tools/measure_pitch.py

#!/usr/bin/env python3
"""Estimate the dominant/fundamental frequency of a mono 16-bit PCM WAV.

Build-time verification tool only (the app never runs it). Pure stdlib: a
radix-2 Cooley-Tukey FFT over the loudest windowed segment, then a parabolic
peak interpolation. Prints "<file>  fundamental=<Hz>  peak_bin=<Hz>  dur=<s>".

Usage: python3 tools/measure_pitch.py <file.wav> [<file.wav> ...]
"""
import cmath
import math
import struct
import sys


def read_mono16(path):
    # Manual RIFF walk: afconvert emits WAVE_FORMAT_EXTENSIBLE (tag 0xFFFE) and
    # a padded data offset, which Python's `wave` module rejects. We only need
    # the fmt geometry and the raw 16-bit samples, so parse the chunks directly.
    with open(path, "rb") as f:
        data = f.read()
    assert data[0:4] == b"RIFF" and data[8:12] == b"WAVE", "not a RIFF/WAVE file"
    ch = rate = bits = None
    pcm = None
    pos = 12
    while pos + 8 <= len(data):
        cid = data[pos:pos + 4]
        size = struct.unpack("<I", data[pos + 4:pos + 8])[0]
        body = data[pos + 8:pos + 8 + size]
        if cid == b"fmt ":
            ch, rate = struct.unpack("<HI", body[2:8])
            bits = struct.unpack("<H", body[14:16])[0]
        elif cid == b"data":
            pcm = body
        pos += 8 + size + (size & 1)  # chunks are word-aligned
    assert bits == 16, "expected 16-bit PCM, got %r" % bits
    samples = struct.unpack("<%dh" % (len(pcm) // 2), pcm)
    if ch > 1:  # average down to mono
        samples = [sum(samples[i:i + ch]) / ch for i in range(0, len(samples), ch)]
    return list(samples), rate


def fft(a):
    n = len(a)
    if n == 1:
        return a
    even = fft(a[0::2])
    odd = fft(a[1::2])
    out = [0] * n
    for k in range(n // 2):
        t = cmath.exp(-2j * math.pi * k / n) * odd[k]
        out[k] = even[k] + t
        out[k + n // 2] = even[k] - t
    return out


def dominant_freq(samples, rate, fft_size=16384):
    # Pick the loudest fft_size-long window (the tonal body, not silence).
    if len(samples) < fft_size:
        seg = list(samples) + [0.0] * (fft_size - len(samples))
    else:
        step = fft_size // 2
        best_e, best_i = -1.0, 0
        for i in range(0, len(samples) - fft_size + 1, step):
            e = sum(s * s for s in samples[i:i + fft_size])
            if e > best_e:
                best_e, best_i = e, i
        seg = list(samples[best_i:best_i + fft_size])
    # Hann window to tame leakage, then FFT.
    win = [seg[i] * (0.5 - 0.5 * math.cos(2 * math.pi * i / (fft_size - 1)))
           for i in range(fft_size)]
    spec = fft(win)
    half = fft_size // 2
    mag = [abs(spec[k]) for k in range(half)]
    # Ignore DC / sub-audible bins below ~50 Hz.
    lo = max(1, int(50 * fft_size / rate))
    peak = max(range(lo, half), key=lambda k: mag[k])
    # Parabolic interpolation around the peak for sub-bin accuracy.
    if 0 < peak < half - 1:
        a, b, c = mag[peak - 1], mag[peak], mag[peak + 1]
        denom = (a - 2 * b + c)
        delta = 0.5 * (a - c) / denom if denom != 0 else 0.0
    else:
        delta = 0.0
    return (peak + delta) * rate / fft_size, peak * rate / fft_size


def main():
    for path in sys.argv[1:]:
        samples, rate = read_mono16(path)
        f, peak_bin = dominant_freq(samples, rate)
        dur = len(samples) / rate
        print("%-28s fundamental=%7.1f Hz  peak_bin=%7.1f Hz  dur=%.3f s" %
              (path, f, peak_bin, dur))


if __name__ == "__main__":
    main()