# Source code for pytheory.audio

"""Audio import — turn a recording back into a Score.

Hum a melody into your phone, whistle a hook, play a bass line —
then load the WAV and get notes you can edit, harmonize, export to
MIDI, or print as sheet music::

    from pytheory import Score

    score = Score.from_wav("hum.wav")
    print(score.parts["melody"].notes)
    score.save_midi("hum.mid")

Transcription is **monophonic**: one note at a time (voice, whistle,
a single instrument line). Chords and polyphonic recordings are a
much harder problem — run melody and bass as separate takes.

The pitch tracker is the YIN algorithm (de Cheveigné & Kawahara,
2002) — the classic autocorrelation-with-a-twist method that powers
most monophonic tuners — implemented in pure numpy.
"""

import numpy

SAMPLE_RATE = 44_100

_NOTE_NAMES = ['C', 'C#', 'D', 'D#', 'E', 'F',
               'F#', 'G', 'G#', 'A', 'A#', 'B']


def load_wav(path):
    """Read an audio file into mono float64 samples scaled to [-1, 1].

    Paths ending in ``.wav`` (case-insensitive) are parsed directly —
    8/16/32-bit PCM and float data are handled, and multi-channel audio
    is averaged down to one channel. Any other extension (.m4a voice
    memos, .mp3, .aiff, ...) is first converted to WAV through
    ``afconvert`` (built into macOS) or ``ffmpeg``, whichever is on
    your PATH.

    Args:
        path: Audio file location. Only its string form is inspected
            to pick the loader.

    Returns:
        (samples, sample_rate) tuple.
    """
    is_wav = str(path).lower().endswith(".wav")
    reader = _read_wav if is_wav else _load_via_converter
    return reader(path)
def _read_wav(path):
    """Parse a WAV file into (mono float64 samples, sample_rate)."""
    import warnings
    from scipy.io import wavfile

    with warnings.catch_warnings():
        # Unknown metadata chunks only produce warnings; skipping them
        # is harmless.
        warnings.simplefilter("ignore")
        rate, pcm = wavfile.read(path)

    pcm = numpy.asarray(pcm)
    kind = pcm.dtype

    # Scale by the *source* dtype before mixing channels: the mean over
    # channels yields float64, after which the original integer width
    # could no longer be recovered.
    if kind == numpy.uint8:
        # 8-bit WAV is unsigned, centred on 128
        scaled = (pcm.astype(numpy.float64) - 128.0) / 128.0
    elif kind == numpy.int16:
        scaled = pcm / 32768.0
    elif kind == numpy.int32:
        scaled = pcm / 2147483648.0
    else:
        scaled = pcm.astype(numpy.float64)

    mono = scaled.mean(axis=1) if scaled.ndim > 1 else scaled
    return mono.astype(numpy.float64), rate


def _load_via_converter(path):
    """Transcode a non-WAV file to a temporary WAV, then read that.

    Tries ``afconvert`` first and ``ffmpeg`` second. The temporary file
    is removed whether or not the conversion succeeds.

    Raises:
        RuntimeError: Neither converter is available on PATH.
        subprocess.CalledProcessError: The converter exited non-zero.
    """
    import os
    import shutil
    import subprocess
    import tempfile

    handle, scratch = tempfile.mkstemp(suffix=".wav")
    os.close(handle)  # the converter writes by name, not by descriptor
    try:
        if shutil.which("afconvert"):
            command = ["afconvert", "-f", "WAVE", "-d", "LEI16@44100",
                       str(path), scratch]
        elif shutil.which("ffmpeg"):
            command = ["ffmpeg", "-y", "-i", str(path),
                       "-ar", "44100", scratch]
        else:
            raise RuntimeError(
                f"Can't read {path!r}: converting non-WAV audio needs "
                f"afconvert (built into macOS) or ffmpeg on your PATH.")
        subprocess.run(command, check=True, capture_output=True)
        return _read_wav(scratch)
    finally:
        if os.path.exists(scratch):
            os.unlink(scratch)
def hpss(samples, sample_rate=SAMPLE_RATE, *, kernel=31):
    """Split audio into harmonic and percussive components.

    On a spectrogram a sustained note is a horizontal ridge (constant
    frequency through time) while a drum hit is a vertical one (every
    frequency at a single instant). A median filter run along the time
    axis preserves the horizontals; run along the frequency axis it
    preserves the verticals. The two filtered spectrograms become soft
    masks that are applied to the original STFT and inverted.

    Args:
        samples: Mono sample array.
        sample_rate: Sample rate in Hz.
        kernel: Median filter length, in STFT frames (time filter) or
            frequency bins (frequency filter).

    Returns:
        (harmonic, percussive) sample arrays, same length as input.
    """
    from scipy.signal import stft, istft
    from scipy.ndimage import median_filter

    window = 2048
    stft_args = dict(fs=sample_rate, nperseg=window,
                     noverlap=window * 3 // 4)
    _, _, spectrum = stft(samples, **stft_args)
    magnitude = numpy.abs(spectrum)

    # Rows are frequency bins and columns are frames, so a (1, kernel)
    # footprint smooths along time and (kernel, 1) along frequency.
    harmonic_power = median_filter(magnitude, size=(1, kernel)) ** 2
    percussive_power = median_filter(magnitude, size=(kernel, 1)) ** 2
    # Wiener-style soft masks; the epsilon keeps silent bins finite
    combined = harmonic_power + percussive_power + 1e-12

    length = len(samples)

    def resynthesize(power):
        _, audio = istft(spectrum * (power / combined), **stft_args)
        return audio[:length]

    return resynthesize(harmonic_power), resynthesize(percussive_power)
def estimate_tempo(samples, sample_rate=SAMPLE_RATE, *, bpm_min=60,
                   bpm_max=200):
    """Estimate the tempo of a recording from its onsets.

    An onset-strength envelope (spectral flux: the positive change in
    magnitude from one frame to the next) is autocorrelated, and the
    lag that best explains the envelope — weighted by a mild preference
    for tempos near 120 BPM — is converted to beats per minute.

    Args:
        samples: Mono sample array.
        sample_rate: Sample rate in Hz.
        bpm_min/bpm_max: Tempo range to search.

    Returns:
        Estimated BPM as an int, or None when the recording is shorter
        than two seconds or shows no confident pulse (e.g. rubato
        humming).
    """
    from scipy.signal import stft

    hop = 512
    window = 2048
    if len(samples) < sample_rate * 2:
        return None

    _, _, spectrum = stft(samples, fs=sample_rate, nperseg=window,
                          noverlap=window - hop)
    rises = numpy.maximum(numpy.diff(numpy.abs(spectrum), axis=1), 0)
    onset = rises.sum(axis=0)
    spread = onset.std()
    if spread < 1e-9:
        return None  # flat envelope: nothing to correlate
    onset = (onset - onset.mean()) / spread

    frames_per_sec = sample_rate / hop
    shortest = int(frames_per_sec * 60.0 / bpm_max)
    longest = min(int(frames_per_sec * 60.0 / bpm_min), len(onset) - 1)
    if longest <= shortest:
        return None

    # Keep the non-negative lags only, normalized so lag 0 equals 1
    autocorr = numpy.correlate(onset, onset, mode='full')[len(onset) - 1:]
    autocorr = autocorr / (autocorr[0] or 1.0)

    candidates = numpy.arange(shortest, longest + 1)
    tempos = 60.0 * frames_per_sec / candidates
    # Log-gaussian weighting centred on 120 BPM settles the half-time /
    # double-time ambiguity the way a listener tapping along would.
    preference = numpy.exp(-0.5 * (numpy.log2(tempos / 120.0) / 0.9) ** 2)
    strength = autocorr[candidates]
    pick = int(numpy.argmax(strength * preference))
    if strength[pick] < 0.1:
        return None  # no confident pulse
    return int(round(tempos[pick]))
def detect_pitch(samples, sample_rate=SAMPLE_RATE, *, frame_size=2048,
                 hop=512, fmin=50.0, fmax=1500.0, threshold=0.12):
    """Track pitch over time with the YIN algorithm.

    Args:
        samples: Mono samples — any array-like; converted to float64.
        sample_rate: Sample rate in Hz.
        frame_size: Analysis window (default 2048 ≈ 46ms).
        hop: Samples between frames (default 512 ≈ 12ms).
        fmin/fmax: Pitch search range in Hz.
        threshold: YIN aperiodicity threshold — lower is stricter
            about what counts as a pitched sound.

    Returns:
        (times, freqs, voiced) arrays — one entry per frame.
        ``freqs`` is 0 where ``voiced`` is False. All three are empty
        when the input is shorter than one frame.
    """
    # Coerce once up front, as identify_chord/detect_chords do. Without
    # this a plain list fails on ``frame ** 2`` and integer PCM (e.g.
    # int16 straight from a WAV reader) silently overflows when squared.
    samples = numpy.asarray(samples, dtype=numpy.float64)

    n = len(samples)
    # Lag search range in samples; tau_max leaves room for the tau + 1
    # neighbour used by the parabolic interpolation below.
    tau_max = min(int(sample_rate / fmin), frame_size - 2)
    tau_min = max(2, int(sample_rate / fmax))

    n_frames = max(0, (n - frame_size) // hop + 1)
    times = numpy.arange(n_frames) * hop / sample_rate
    freqs = numpy.zeros(n_frames)
    voiced = numpy.zeros(n_frames, dtype=bool)
    if n_frames == 0:
        return times, freqs, voiced

    peak = numpy.abs(samples).max() or 1.0

    # Zero-pad to at least twice the frame so the FFT autocorrelation
    # is linear rather than circular.
    fft_size = 1
    while fft_size < 2 * frame_size:
        fft_size *= 2

    # Loop invariants
    w = frame_size
    taus = numpy.arange(tau_max + 1)

    for i in range(n_frames):
        frame = samples[i * hop:i * hop + frame_size]

        # Skip silence outright
        rms = numpy.sqrt((frame ** 2).mean())
        if rms < 0.015 * peak:
            continue

        # Difference function via FFT autocorrelation:
        #   d(τ) = e[0:W-τ] + e[τ:W] - 2·acf(τ)
        spectrum = numpy.fft.rfft(frame, fft_size)
        acf = numpy.fft.irfft(spectrum * numpy.conj(spectrum))[:tau_max + 1]
        energy = numpy.concatenate(([0.0], numpy.cumsum(frame ** 2)))
        e_head = energy[w - taus] - energy[0]   # Σ x[0:W-τ]²
        e_tail = energy[w] - energy[taus]       # Σ x[τ:W]²
        d = e_head + e_tail - 2 * acf

        # Cumulative-mean-normalized difference (the YIN twist —
        # stops the trivial d(0)=0 minimum from winning)
        cmndf = numpy.ones(tau_max + 1)
        cum = numpy.cumsum(d[1:])
        cmndf[1:] = d[1:] * taus[1:] / numpy.where(cum > 0, cum, 1e-12)

        # First dip below threshold wins (prefers the true period
        # over its subharmonics); fall back to the global minimum.
        search = cmndf[tau_min:tau_max + 1]
        below = numpy.flatnonzero(search < threshold)
        if len(below):
            tau = tau_min + below[0]
            # Walk to the local minimum of this dip
            while tau + 1 <= tau_max and cmndf[tau + 1] < cmndf[tau]:
                tau += 1
        else:
            tau = tau_min + int(numpy.argmin(search))
            if cmndf[tau] > 0.4:
                continue  # too aperiodic — unvoiced

        # Parabolic interpolation for sub-sample precision
        if 1 <= tau < tau_max:
            a, b, c = cmndf[tau - 1], cmndf[tau], cmndf[tau + 1]
            denom = a - 2 * b + c
            if abs(denom) > 1e-12:
                tau = tau + 0.5 * (a - c) / denom

        freqs[i] = sample_rate / tau
        voiced[i] = True

    return times, freqs, voiced
def _segment_notes(times, freqs, voiced, samples, sample_rate, hop,
                   min_note=0.06, gap_frames=3):
    """Group the frame-wise pitch track into discrete note events.

    A note is closed when the pitch moves more than 0.6 semitones from
    the running median of the current note, when the level jumps (a
    re-articulation of the same pitch), or after more than
    ``gap_frames`` consecutive unvoiced frames. Notes shorter than
    ``min_note`` seconds or outside MIDI 12-119 are discarded.

    Returns a list of (start_sec, dur_sec, midi_note, velocity); empty
    when the pitch track has no frames.
    """
    n_frames = len(times)
    if n_frames == 0:
        # A recording shorter than one analysis frame yields an empty
        # pitch track; the ``.max()`` below would raise on it.
        return []

    midi = numpy.zeros(n_frames)
    m = voiced & (freqs > 0)
    midi[m] = 69 + 12 * numpy.log2(freqs[m] / 440.0)

    # Median-smooth the pitch track to kill single-frame blips
    if n_frames >= 5:
        from scipy.signal import medfilt
        midi_s = medfilt(midi, 5)
        # Only voiced frames keep a pitch; where the filter was pulled
        # to 0 by unvoiced neighbours, fall back to the raw value.
        midi = numpy.where(m, numpy.where(midi_s > 0, midi_s, midi), 0)

    # Frame RMS for velocities and re-articulation onsets
    frame_rms = numpy.zeros(n_frames)
    for i in range(n_frames):
        seg = samples[i * hop:i * hop + hop]
        if len(seg):
            frame_rms[i] = numpy.sqrt((seg ** 2).mean())
    peak_rms = frame_rms.max() or 1.0

    events = []
    cur_frames = []
    cur_start = 0
    silent_run = 0

    def close():
        # Emit the note accumulated in cur_frames, if it qualifies.
        if not cur_frames:
            return
        note = int(numpy.round(numpy.median(midi[cur_frames])))
        start = times[cur_start]
        end = times[cur_frames[-1]] + hop / sample_rate
        dur = end - start
        if dur < min_note or not (12 <= note <= 119):
            return
        seg_rms = frame_rms[cur_frames].max()
        vel = int(numpy.clip(40 + 80 * seg_rms / peak_rms, 1, 127))
        events.append((start, dur, note, vel))

    for i in range(n_frames):
        if voiced[i] and midi[i] > 0:
            if not cur_frames:
                cur_frames = [i]
                cur_start = i
            else:
                ref = numpy.median(midi[cur_frames])
                # Same pitch struck again: level well above the
                # quietest of the previous three frames.
                re_attack = (
                    len(cur_frames) > 6
                    and frame_rms[i] > 2.5 * frame_rms[max(0, i - 3):i].min()
                    and frame_rms[i] > 0.15 * peak_rms)
                if abs(midi[i] - ref) > 0.6 or re_attack:
                    close()
                    cur_frames = [i]
                    cur_start = i
                else:
                    cur_frames.append(i)
            silent_run = 0
        else:
            if cur_frames:
                silent_run += 1
                if silent_run > gap_frames:
                    close()
                    cur_frames = []
                    silent_run = 0
    close()
    return events


def _chromagram(samples, sample_rate=SAMPLE_RATE, *, fmin=55.0,
                fmax=5000.0, nperseg=4096, normalized=True):
    """Fold a spectrogram into 12 pitch classes over time.

    Every FFT bin maps to the pitch class of its frequency; summing
    magnitudes per class gives a "chroma" vector per frame — a
    fingerprint of what harmony is sounding, regardless of octave.

    Args:
        samples: Mono sample array.
        sample_rate: Sample rate in Hz.
        fmin/fmax: Only bins inside this frequency range contribute.
        nperseg: STFT window length; the hop is a quarter of it.
        normalized: Scale every column to unit sum when True.

    Returns:
        (chroma, frame_times) — chroma is shape (12, n_frames), columns
        normalized to unit sum (or raw magnitudes with
        ``normalized=False``, so louder frames carry more weight when
        averaged).
    """
    from scipy.signal import stft
    hop = nperseg // 4
    f, t, Z = stft(samples, fs=sample_rate, nperseg=nperseg,
                   noverlap=nperseg - hop)
    mag = numpy.abs(Z)

    # Map bins to pitch classes (ignore rumble and air)
    usable = (f >= fmin) & (f <= fmax)
    midi = 69 + 12 * numpy.log2(f[usable] / 440.0)
    pcs = numpy.round(midi).astype(int) % 12

    chroma = numpy.zeros((12, mag.shape[1]))
    usable_mag = mag[usable]
    for pc in range(12):
        sel = pcs == pc
        if sel.any():
            chroma[pc] = usable_mag[sel].sum(axis=0)

    if not normalized:
        return chroma, t
    totals = chroma.sum(axis=0)
    totals[totals == 0] = 1.0  # silent frames stay all-zero, not NaN
    return chroma / totals, t


# Chord templates: which pitch classes (relative to the root) sound.
# The root gets extra weight — it's usually doubled and in the bass.
# Each entry is (intervals, prior): four-note chords get a slight
# handicap so a plain triad with a passing melody note doesn't get
# promoted to a 7th.
_CHORD_QUALITIES = {
    "": ((0, 4, 7), 1.0),           # major
    "m": ((0, 3, 7), 1.0),          # minor
    "7": ((0, 4, 7, 10), 0.96),     # dominant 7th
    "maj7": ((0, 4, 7, 11), 0.96),
    "m7": ((0, 3, 7, 10), 0.96),
    "sus2": ((0, 2, 7), 0.94),
    "sus4": ((0, 5, 7), 0.94),
}


def _grid_phase(samples, sample_rate, window_sec):
    """Find where the chord grid should start, in seconds.

    Chord changes land on beats, but the recording rarely starts on
    one. Fold onset energy (spectral flux) onto the window period — a
    circular histogram of "when in the window do onsets happen" — and
    start the grid at the phase where they concentrate.

    Returns 0.0 when the input is too short to analyse or its onset
    envelope is flat.
    """
    from scipy.signal import stft
    hop = 512
    nperseg = 2048
    if len(samples) < nperseg * 2:
        return 0.0
    f, t, Z = stft(samples, fs=sample_rate, nperseg=nperseg,
                   noverlap=nperseg - hop)
    mag = numpy.abs(Z)
    flux = numpy.maximum(numpy.diff(mag, axis=1), 0).sum(axis=0)
    ftimes = t[1:]  # diff drops the first frame
    if flux.std() < 1e-9:
        return 0.0
    flux = flux - flux.min()

    nbins = 64
    hist = numpy.zeros(nbins)
    idx = ((ftimes % window_sec) / window_sec * nbins).astype(int) % nbins
    # add.at accumulates correctly when several frames share a bin
    numpy.add.at(hist, idx, flux)

    # Circular smoothing so an onset straddling two bins still wins
    kernel = numpy.array([0.25, 0.5, 1.0, 0.5, 0.25])
    smooth = sum(k * numpy.roll(hist, s)
                 for k, s in zip(kernel, range(-2, 3)))
    return float(smooth.argmax()) / nbins * window_sec
def identify_chord(samples, sample_rate=SAMPLE_RATE, *, min_confidence=0.7):
    """Name the chord sounding in a short buffer of audio.

    The one-shot, "what am I strumming right now?" counterpart of
    :func:`detect_chords`: the buffer is folded into a chromagram and
    compared with the same major/minor/sus/7th templates on all twelve
    roots.

    Overtones are discounted before matching. The 3rd, 5th, and 7th
    partials of every chord tone fall a fifth, a major third, and a
    flat seventh above it in pitch-class space — matched raw, a bright
    C major would read as Cmaj7. A polyphony gate then rejects single
    notes, whose energy sits on too few pitch classes, so a melody note
    is not misread as a chord. Coefficients are calibrated against
    pytheory's own guitar/piano/rhodes renders: 93% on an 81-case
    battery.

    Args:
        samples: Mono float array, ideally ~0.5–1.5 s of audio.
        sample_rate: Sample rate in Hz.
        min_confidence: Template match score (0–1) below which
            ``None`` is returned.

    Returns:
        Dict with ``symbol`` (e.g. ``"Am"``), ``confidence``, and
        ``notes`` (the chord tones, low to high from the root) — or
        ``None`` if no chord is confidently sounding.
    """
    audio = numpy.asarray(samples, dtype=numpy.float64)
    # Too short for one analysis window, or effectively silent
    if len(audio) < 8192 or numpy.sqrt((audio ** 2).mean()) < 1e-3:
        return None

    # An 8192-sample window (≈ 5.4 Hz bins) resolves low voicings; the
    # 100 Hz floor keeps fundamentals down to about G2. Chroma is left
    # unnormalized so loud frames dominate the average.
    chroma, _ = _chromagram(audio, sample_rate, fmin=100.0,
                            nperseg=8192, normalized=False)
    if chroma.shape[1] == 0:
        return None
    profile = chroma.mean(axis=1)

    # Overtone spill received by each pitch class: from the 3rd partial
    # of the class a fifth below it, the 5th partial of the class a
    # major third below, and the 7th partial of the class a whole tone
    # above (the note it is the flat 7th of).
    fifth_spill = 0.20 * numpy.roll(profile, 7)
    third_spill = 0.20 * numpy.roll(profile, 4)
    seventh_spill = 0.12 * numpy.roll(profile, 10)
    cleaned = numpy.maximum(
        0.0, profile - fifth_spill - third_spill - seventh_spill)

    magnitude = numpy.linalg.norm(cleaned)
    if magnitude < 1e-9:
        return None
    unit = cleaned / magnitude

    # Polyphony gate: after discounting, a chord still shows real
    # energy on at least three pitch classes; a lone note does not.
    strongest = numpy.sort(unit)[::-1]
    if strongest[2] < 0.35 * strongest[0]:
        return None

    winner = None
    for tonic in range(12):
        for suffix, (steps, weight) in _CHORD_QUALITIES.items():
            members = [(tonic + step) % 12 for step in steps]
            template = numpy.zeros(12)
            template[members] = 1.0
            template /= numpy.linalg.norm(template)
            match = float(unit @ template) * weight
            # Strict '>' keeps the earliest candidate on ties
            if winner is None or match > winner[0]:
                winner = (match, tonic, suffix, members)

    match, tonic, suffix, members = winner
    if match < min_confidence:
        return None
    return {
        "symbol": _NOTE_NAMES[tonic] + suffix,
        "confidence": round(match, 3),
        "notes": [_NOTE_NAMES[pc] for pc in members],
    }
def _bass_is_real(bass_sig, sample_rate, lo, hi, f_bass):
    """Check for actual spectral energy at a detected bass pitch.

    YIN reports the period of the *composite* waveform, so a chord
    without any bass note still produces its missing fundamental
    (Csus4 shows up as a phantom F2). A genuinely played bass note has
    energy at its own fundamental; a phantom one does not.

    Args:
        bass_sig: Low-passed signal to inspect.
        sample_rate: Sample rate in Hz.
        lo/hi: Window bounds in seconds.
        f_bass: Candidate bass frequency in Hz.

    Returns:
        True when the band within ±6% of ``f_bass`` holds more than
        20% of the power below 320 Hz. False for windows shorter than
        1024 samples or a non-positive ``f_bass``.
    """
    first, last = int(lo * sample_rate), int(hi * sample_rate)
    window = bass_sig[first:last]
    if f_bass <= 0 or len(window) < 1024:
        return False

    tapered = window * numpy.hanning(len(window))
    power = numpy.abs(numpy.fft.rfft(tapered)) ** 2
    bins = numpy.fft.rfftfreq(len(window), 1 / sample_rate)

    near_fundamental = (bins > f_bass * 0.94) & (bins < f_bass * 1.06)
    fundamental_power = power[near_fundamental].sum()
    low_band_power = power[bins < 320.0].sum() + 1e-12
    return fundamental_power > 0.2 * low_band_power
def detect_chords(samples, sample_rate=SAMPLE_RATE, *, bpm=120,
                  beats_per_chord=2.0):
    """Detect a chord progression from audio.

    The harmonic content is folded into pitch classes (a chromagram)
    and averaged over chord-sized windows laid on a beat grid that is
    aligned to the music's own onsets. Each window is matched against
    major/minor/sus triad and 7th-chord templates on all twelve roots.
    When the bass clearly sits on a chord tone other than the root,
    the chord is reported as a slash chord (``"C/E"``).

    Args:
        samples: Mono samples; converted to float64.
        sample_rate: Sample rate in Hz.
        bpm: Tempo used to turn seconds into beats.
        beats_per_chord: Length of one analysis window, in beats.

    Returns:
        List of (start_beat, duration_beats, symbol) tuples, with
        consecutive identical chords merged — e.g.
        ``[(0.0, 8.0, "Am"), (8.0, 4.0, "F")]``. Empty when the input
        is too short to analyse.
    """
    audio = numpy.asarray(samples, dtype=numpy.float64)
    if len(audio) < 8192:
        return []

    # Chroma starts at 130 Hz: lower down, FFT bins are a semitone wide
    # and a loud bass smears into the neighbouring pitch classes. The
    # bass still contributes through its harmonics.
    chroma, frame_times = _chromagram(audio, sample_rate, fmin=130.0)
    if chroma.shape[1] == 0:
        return []

    # Inversions need a separate bass track: the chromagram ignores
    # octave (and is coarse at low frequencies), so the low-passed
    # signal gets its own YIN pass.
    from scipy.signal import butter, filtfilt
    b_coef, a_coef = butter(4, 320, btype='low', fs=sample_rate)
    bass_sig = filtfilt(b_coef, a_coef, audio)
    bass_times, bass_freqs, bass_voiced = detect_pitch(
        bass_sig, sample_rate, fmin=40.0, fmax=300.0)

    # One unit-length template per (root, quality), scaled by its prior
    templates = []
    for tonic in range(12):
        for suffix, (steps, weight) in _CHORD_QUALITIES.items():
            members = frozenset((tonic + step) % 12 for step in steps)
            shape = numpy.zeros(12)
            shape[list(members)] = 1.0
            shape[tonic] = 1.5  # weight the root
            shape /= numpy.linalg.norm(shape)
            templates.append((tonic, suffix, members, shape * weight))

    window_sec = beats_per_chord * 60.0 / bpm
    end_sec = frame_times[-1]
    if end_sec < window_sec / 2:
        return []

    # Window edges follow the phase where onsets cluster rather than
    # marching blindly from t=0.
    phase = _grid_phase(audio, sample_rate, window_sec)
    edges = list(numpy.arange(phase, end_sec, window_sec))
    if not edges or edges[0] > 1e-6:
        # Leading partial window: usually a silent lead-in (dropped by
        # the energy gate) or the tail of the first chord (merged at
        # the end).
        edges.insert(0, 0.0)
    edges.append(max(end_sec, edges[-1] + 1e-6))

    def to_beats(sec):
        # Snap to sixteenths so the score grid stays tidy
        return round(sec * bpm / 60.0 * 4) / 4.0

    # Reference level for the silence gate: RMS of the whole signal
    overall_rms = numpy.sqrt((audio ** 2).mean()) or 1.0

    spans = []
    for lo, hi in zip(edges, edges[1:]):
        in_window = (frame_times >= lo) & (frame_times < hi)
        if not in_window.any():
            continue

        # Near-silent windows (lead-ins, gaps) hold only noise
        chunk = audio[int(lo * sample_rate):int(hi * sample_rate)]
        if len(chunk) and numpy.sqrt((chunk ** 2).mean()) < 0.05 * overall_rms:
            continue

        profile = chroma[:, in_window].mean(axis=1)
        length = numpy.linalg.norm(profile)
        if length < 1e-9:
            continue
        profile = profile / length

        _, tonic, suffix, members = max(
            (profile @ shape, root, name, pcs)
            for root, name, pcs, shape in templates)
        symbol = _NOTE_NAMES[tonic] + suffix

        # Inversion: a confident, steady bass note on a chord tone that
        # is not the root turns the chord into a slash chord.
        bass_here = (bass_times >= lo) & (bass_times < hi)
        voiced_here = bass_voiced & bass_here
        n_voiced = voiced_here.sum()
        if n_voiced >= 0.4 * max(1, int(bass_here.sum())):
            window_freqs = bass_freqs[voiced_here]
            bass_pcs = numpy.round(
                69 + 12 * numpy.log2(window_freqs / 440.0)).astype(int) % 12
            classes, tallies = numpy.unique(bass_pcs, return_counts=True)
            bass_pc = int(classes[tallies.argmax()])
            steady = tallies.max() >= 0.6 * n_voiced
            if steady and bass_pc != tonic and bass_pc in members:
                bass_hz = float(numpy.median(
                    window_freqs[bass_pcs == bass_pc]))
                if _bass_is_real(bass_sig, sample_rate, lo, hi, bass_hz):
                    symbol += "/" + _NOTE_NAMES[bass_pc]

        start_beat, end_beat = to_beats(lo), to_beats(hi)
        if end_beat > start_beat:
            spans.append((start_beat, end_beat - start_beat, symbol))

    # Fuse back-to-back windows that carry the same chord
    progression = []
    for start, length, symbol in spans:
        if progression:
            prev_start, prev_len, prev_symbol = progression[-1]
            if (prev_symbol == symbol
                    and abs(prev_start + prev_len - start) < 1e-6):
                progression[-1] = (prev_start, prev_len + length, symbol)
                continue
        progression.append((start, length, symbol))
    return progression
def detect_drums(samples, sample_rate=SAMPLE_RATE, *, bpm=120,
                 quantize=0.25):
    """Detect drum hits from (ideally percussive) audio.

    Finds onsets in the energy envelope, then classifies each by where
    its energy lives: kicks are bottom-heavy, hats are all sizzle,
    snares are the broadband middle.

    Args:
        samples: Mono samples — any array-like; converted to float64.
        sample_rate: Sample rate in Hz.
        bpm: Tempo used to turn onset times into beat positions.
        quantize: Grid in beats that positions snap to; a falsy value
            leaves them unsnapped.

    Returns:
        List of (beat_position, sound_name, velocity) tuples, where
        sound_name is ``"kick"``, ``"snare"``, or ``"closed_hat"``.
        Empty when the input holds fewer than four envelope frames.
    """
    # Coerce once up front, as identify_chord/detect_chords do. Without
    # this a plain list has no ``.reshape`` and integer PCM overflows
    # when squared for the envelope.
    samples = numpy.asarray(samples, dtype=numpy.float64)

    hop = 256
    n_frames = (len(samples) - hop) // hop
    if n_frames < 4:
        return []
    frames = samples[:n_frames * hop].reshape(n_frames, hop)
    env = numpy.sqrt((frames ** 2).mean(axis=1))
    peak_env = env.max() or 1.0

    # Onsets: env rises sharply above its local past. Pad with
    # silence so a hit on the very first sample still registers.
    pad = 6
    env_p = numpy.concatenate([numpy.zeros(pad), env])
    min_spacing = int(0.09 * sample_rate / hop)  # ~90 ms refractory gap
    onsets = []
    last = -10_000
    for i in range(n_frames):
        recent = env_p[i:i + pad].min()  # quietest of the previous frames
        if (env[i] > 2.0 * recent + 0.02 * peak_env
                and env[i] > 0.08 * peak_env
                and i - last > min_spacing):
            onsets.append(i)
            last = i

    hits = []
    win = int(sample_rate * 0.05)  # classify on 50 ms after the onset
    for i in onsets:
        start = i * hop
        seg = samples[start:start + win]
        if len(seg) < 64:
            continue
        spec = numpy.abs(numpy.fft.rfft(seg * numpy.hanning(len(seg))))
        freqs = numpy.fft.rfftfreq(len(seg), 1 / sample_rate)

        # Per-band MEAN magnitude (sums would bias toward the high
        # band, which has ~100x more FFT bins than the low band)
        low = spec[freqs < 150].mean()
        mid = spec[(freqs >= 150) & (freqs < 2000)].mean()
        high = spec[freqs >= 5000].mean()
        power = spec ** 2
        cent = (power * freqs).sum() / (power.sum() + 1e-12)

        # Thresholds calibrated against pytheory's own kick/snare/hat
        # synths, alone and in mixtures
        sounds = []
        if low > 5 * (mid + 1e-12):
            sounds.append("kick")
            if high > 0.15 * mid:
                sounds.append("closed_hat")  # hat hiding under the kick
        elif cent > 8800:
            sounds.append("closed_hat")
        else:
            sounds.append("snare")

        beat = start / sample_rate * bpm / 60.0
        if quantize:
            beat = round(beat / quantize) * quantize
        vel = int(numpy.clip(50 + 70 * env[i] / peak_env, 1, 127))
        for sound in sounds:
            hits.append((beat, sound, vel))

    # Dedupe hits quantized onto the same grid slot with the same sound
    seen = set()
    out = []
    for beat, sound, vel in hits:
        if (beat, sound) not in seen:
            seen.add((beat, sound))
            out.append((beat, sound, vel))
    return out
def _events_to_part(part, events, bpm, quantize):
    """Append (start, dur, midi, vel) events to a Part as notes/rests.

    Times are converted from seconds to beats at ``bpm``. With a
    truthy ``quantize``, starts snap to that grid and durations snap
    to it with a minimum of one grid step. A rest is inserted whenever
    a note begins more than 1e-3 beats after the current position.
    """
    cursor = 0.0
    for onset_sec, length_sec, midi_note, velocity in events:
        onset = onset_sec * bpm / 60.0
        length = length_sec * bpm / 60.0
        if quantize:
            onset = round(onset / quantize) * quantize

        silence = onset - cursor
        if silence > 1e-3:
            part.rest(silence)
            cursor = onset

        if quantize:
            # Never let a note collapse to zero length on the grid
            length = max(quantize, round(length / quantize) * quantize)

        octave = midi_note // 12 - 1
        pitch = f"{_NOTE_NAMES[midi_note % 12]}{octave}"
        part.add(pitch, length, velocity=velocity)
        cursor += length


def _track_events(samples, sample_rate, fmin, fmax):
    """Run pitch tracking on a signal and segment it into note events."""
    hop = 512
    track = detect_pitch(samples, sample_rate, hop=hop, fmin=fmin, fmax=fmax)
    return _segment_notes(*track, samples, sample_rate, hop)
def transcribe(path, *, bpm=None, quantize=None, split=False,
               part_name="melody", synth="piano_synth",
               fmin=50.0, fmax=1500.0):
    """Turn an audio recording into a Score.

    Args:
        path: Audio file path — WAV is read directly, anything else
            (.m4a, .mp3) goes through afconvert/ffmpeg. A
            (samples, sample_rate) tuple is also accepted.
        bpm: Tempo the timing is interpreted against. The default
            ``None`` estimates it from the recording's onsets and
            falls back to 120 when no confident pulse is found
            (rubato humming). Pass a number to pin it.
        quantize: Optional grid in beats (e.g. ``0.25`` snaps note
            starts and lengths to sixteenths). Default: no snapping —
            the timing is kept as performed.
        split: If True, separate harmonic from percussive content
            first and transcribe **two** pitched parts from the
            harmonic signal — a ``"bass"`` part (40-200 Hz) and a
            ``"melody"`` part (200 Hz up, bass filtered out) — plus a
            chord part and a drum pattern when any are detected.
            Intended for full mixes; expect the bassline to come out
            well and the melody only as well as it dominates the mix.
        part_name: Name for the created part (non-split mode).
        synth: Synth for playback of the transcription.
        fmin/fmax: Pitch range to search, in Hz (non-split mode).
            Tighten these for better results (e.g.
            ``fmin=60, fmax=350`` for a bass).

    Returns:
        A :class:`~pytheory.rhythm.Score` holding the detected notes,
        rests, and velocities. In split mode its ``detected_key``
        attribute is also set (``None`` if nothing pitched was found).
    """
    from .rhythm import Score

    if isinstance(path, tuple):
        samples, sample_rate = path
    else:
        samples, sample_rate = load_wav(path)

    if bpm is None:
        bpm = estimate_tempo(samples, sample_rate) or 120
    score = Score("4/4", bpm=int(bpm))

    if not split:
        # Single monophonic line straight from the raw signal
        events = _track_events(samples, sample_rate, fmin, fmax)
        part = score.part(part_name, synth=synth)
        _events_to_part(part, events, bpm, quantize)
        return score

    from scipy.signal import butter, filtfilt
    harmonic, percussive = hpss(samples, sample_rate)

    # Bass pass: low-passed harmonic stem, bass-register search
    low_b, low_a = butter(4, 300, btype='low', fs=sample_rate)
    bass_events = _track_events(
        filtfilt(low_b, low_a, harmonic), sample_rate, 40.0, 200.0)

    # Melody pass: bass removed, mid/high-register search
    high_b, high_a = butter(4, 250, btype='high', fs=sample_rate)
    mel_events = _track_events(
        filtfilt(high_b, high_a, harmonic), sample_rate, 200.0, 1500.0)

    _events_to_part(score.part("melody", synth=synth),
                    mel_events, bpm, quantize)
    _events_to_part(score.part("bass", synth="bass_guitar"),
                    bass_events, bpm, quantize)

    # Chord pass: chromagram template matching on the harmonic stem
    chord_track = detect_chords(harmonic, sample_rate, bpm=bpm)
    if chord_track:
        from .chords import Chord
        chord_part = score.part("chords", synth="rhodes", volume=0.4)
        cursor = 0.0
        for start, length, symbol in chord_track:
            if start - cursor > 1e-6:
                chord_part.rest(start - cursor)
                cursor = start
            chord_part.add(Chord.from_symbol(symbol), length)
            cursor += length

    # Drum pass: onset classification on the percussive stem
    drum_hits = detect_drums(percussive, sample_rate, bpm=bpm,
                             quantize=quantize or 0.25)
    if drum_hits:
        from .rhythm import (Pattern, DrumSound, _Hit)
        sound_map = {"kick": DrumSound.KICK,
                     "snare": DrumSound.SNARE,
                     "closed_hat": DrumSound.CLOSED_HAT}
        hits = [_Hit(sound_map[name], beat, velocity)
                for beat, name, velocity in drum_hits]
        pattern_beats = max(hit.position for hit in hits) + 1.0
        score.add_pattern(
            Pattern("transcribed", hits, beats=pattern_beats), repeats=1)

    # Key detection uses every pitched thing heard, including full
    # chord tones rather than roots alone (the roots of Am-F-G are
    # ambiguous; their tones spell out C major / A minor exactly).
    from .chords import Chord
    from .scales import Key
    heard = []
    for line in (mel_events, bass_events):
        heard.extend(_NOTE_NAMES[note % 12] for _, _, note, _ in line)
    for _, _, symbol in chord_track:
        try:
            heard.extend(
                tone.name for tone in Chord.from_symbol(symbol).tones)
        except ValueError:
            # Symbol the chord parser rejects: leave it out of the vote
            pass

    if heard:
        # dict.fromkeys removes duplicates and keeps first-seen order
        score.detected_key = Key.detect(*dict.fromkeys(heard))
    else:
        score.detected_key = None
    return score