
Audio Signals API

SpectralSignal

Bases: BaseSignal

Detects AI-generated audio by analyzing spectral anomalies.

This detector focuses on artifacts left by neural vocoders (like HiFi-GAN, WaveGlow), which often exhibit:

- High-frequency phase discontinuities (above 8 kHz)
- Unnatural spectral envelope patterns
- Anomalous energy distribution across frequency bands

This is a lightweight, CPU-friendly detector suitable for real-time use.
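
A minimal usage sketch (assuming the class is importable from veridex.audio.spectral, as the source location below suggests, and that the optional audio dependencies are installed with pip install veridex[audio]; the file name is a placeholder):

from veridex.audio.spectral import SpectralSignal

detector = SpectralSignal()              # defaults: 16 kHz target rate, n_fft=2048, hop_length=512
result = detector.run("interview.wav")   # placeholder path to a local audio file

if result.error:
    print("Detection failed:", result.error)
else:
    # Higher score suggests AI-generated audio; confidence reflects how reliable the estimate is.
    print(f"score={result.score:.2f} confidence={result.confidence:.2f}")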

Attributes:

    name (str): 'spectral_audio_detector'
    dtype (str): 'audio'

Source code in veridex/audio/spectral.py
class SpectralSignal(BaseSignal):
    """
    Detects AI-generated audio by analyzing spectral anomalies.

    This detector focuses on artifacts left by neural vocoders (like HiFi-GAN, WaveGlow),
    which often exhibit:
    - High-frequency phase discontinuities (>8kHz)
    - Unnatural spectral envelope patterns
    - Anomalous energy distribution across frequency bands

    This is a lightweight, CPU-friendly detector suitable for real-time use.

    Attributes:
        name (str): 'spectral_audio_detector'
        dtype (str): 'audio'
    """

    def __init__(
        self,
        target_sr: int = 16000,
        n_fft: int = 2048,
        hop_length: int = 512,
    ):
        """
        Initialize the spectral detector.

        Args:
            target_sr: Target sample rate for audio resampling
            n_fft: FFT window size
            hop_length: Hop length for STFT
        """
        self.target_sr = target_sr
        self.n_fft = n_fft
        self.hop_length = hop_length

    @property
    def name(self) -> str:
        return "spectral_audio_detector"

    @property
    def dtype(self) -> str:
        return "audio"

    def check_dependencies(self) -> None:
        try:
            import librosa
            import soundfile
            import scipy
        except ImportError:
            raise ImportError(
                "Audio detection requires 'librosa', 'soundfile', and 'scipy'. "
                "Install with: pip install veridex[audio]"
            )

    def run(self, input_data: Any) -> DetectionResult:
        """
        Analyze audio for AI-generation artifacts.

        Args:
            input_data: Path to audio file (str or Path)

        Returns:
            DetectionResult with score, confidence, and spectral metadata
        """
        if not isinstance(input_data, (str, Path)):
            return DetectionResult(
                score=0.0,
                confidence=0.0,
                error="Input must be a file path (str or Path)"
            )

        try:
            self.check_dependencies()
        except ImportError as e:
            return DetectionResult(
                score=0.0,
                confidence=0.0,
                error=str(e)
            )

        try:
            from veridex.audio.utils import load_audio, validate_audio, compute_spectrogram
            import scipy.signal as signal
            import scipy.stats as stats

            # Load audio
            audio, sr = load_audio(input_data, target_sr=self.target_sr)

            # Validate audio
            is_valid, error_msg = validate_audio(audio, sr)
            if not is_valid:
                return DetectionResult(
                    score=0.0,
                    confidence=0.0,
                    error=error_msg
                )

            # Compute spectrogram
            spectrogram = compute_spectrogram(
                audio,
                sr=sr,
                n_fft=self.n_fft,
                hop_length=self.hop_length
            )

            # Analyze spectral features
            features = self._extract_spectral_features(spectrogram, sr)

            # Compute AI probability score
            score = self._compute_score(features)

            # Estimate confidence based on audio quality
            confidence = self._estimate_confidence(audio, features)

            return DetectionResult(
                score=score,
                confidence=confidence,
                metadata=features
            )

        except Exception as e:
            return DetectionResult(
                score=0.0,
                confidence=0.0,
                error=f"Spectral analysis failed: {e}"
            )

    def _extract_spectral_features(
        self,
        spectrogram: np.ndarray,
        sr: int
    ) -> dict:
        """Extract features for deepfake detection."""
        import scipy.stats as stats

        # Frequency bins
        freqs = np.fft.rfftfreq(self.n_fft, 1/sr)

        # Separate low/mid/high frequency regions
        low_freq_mask = freqs < 2000  # < 2kHz
        mid_freq_mask = (freqs >= 2000) & (freqs < 8000)  # 2-8 kHz
        high_freq_mask = freqs >= 8000  # > 8kHz (vocoder artifacts region)

        # Energy in each band (average over time)
        low_energy = np.mean(spectrogram[low_freq_mask, :])
        mid_energy = np.mean(spectrogram[mid_freq_mask, :])
        high_energy = np.mean(spectrogram[high_freq_mask, :])

        # Spectral roll-off (frequency below which 85% of energy concentrates)
        spectral_rolloff = self._compute_rolloff(spectrogram, freqs)

        # High-frequency entropy (natural audio has chaotic high-freq)
        high_freq_entropy = stats.entropy(
            spectrogram[high_freq_mask, :].flatten() + 1e-10
        )

        # Spectral flatness (measure of "noisiness")
        spectral_flatness = self._compute_flatness(spectrogram)

        # Temporal stability in high frequencies
        # AI vocoders often produce overly stable high-freq patterns
        high_freq_stability = np.std(
            np.mean(spectrogram[high_freq_mask, :], axis=0)
        )

        return {
            "low_freq_energy": float(low_energy),
            "mid_freq_energy": float(mid_energy),
            "high_freq_energy": float(high_energy),
            "spectral_rolloff": float(spectral_rolloff),
            "high_freq_entropy": float(high_freq_entropy),
            "spectral_flatness": float(spectral_flatness),
            "high_freq_stability": float(high_freq_stability),
        }

    def _compute_rolloff(self, spectrogram: np.ndarray, freqs: np.ndarray) -> float:
        """Compute spectral roll-off point."""
        # Average over time
        avg_spectrum = np.mean(spectrogram, axis=1)

        # Cumulative energy
        cumsum = np.cumsum(avg_spectrum)
        total = cumsum[-1]

        # Find frequency where 85% of energy is below
        threshold = 0.85 * total
        rolloff_idx = np.where(cumsum >= threshold)[0]

        if len(rolloff_idx) > 0:
            return freqs[rolloff_idx[0]]
        return freqs[-1]

    def _compute_flatness(self, spectrogram: np.ndarray) -> float:
        """Compute spectral flatness (geometric mean / arithmetic mean)."""
        # Average over time
        avg_spectrum = np.mean(spectrogram, axis=1) + 1e-10

        # Geometric mean
        geo_mean = np.exp(np.mean(np.log(avg_spectrum)))

        # Arithmetic mean
        arith_mean = np.mean(avg_spectrum)

        flatness = geo_mean / arith_mean
        return float(flatness)

    def _compute_score(self, features: dict) -> float:
        """
        Compute AI probability score from spectral features.

        Heuristic scoring based on typical vocoder artifacts:
        - AI audio tends to have lower high-frequency energy
        - Lower spectral roll-off (energy concentrated in lower frequencies)
        - Lower high-frequency entropy (more regular patterns)
        - Higher temporal stability in high frequencies
        """
        score = 0.0

        # 1. High-frequency energy (AI typically has less)
        # Natural speech: high_energy > 10, AI: < 5
        if features["high_freq_energy"] < 5.0:
            score += 0.3
        elif features["high_freq_energy"] < 10.0:
            score += 0.15

        # 2. Spectral rolloff (AI concentrates energy lower)
        # Natural: > 6000 Hz, AI: < 4000 Hz
        if features["spectral_rolloff"] < 4000:
            score += 0.25
        elif features["spectral_rolloff"] < 6000:
            score += 0.1

        # 3. High-frequency entropy (AI is more regular)
        # Natural: > 5.0, AI: < 3.0
        if features["high_freq_entropy"] < 3.0:
            score += 0.25
        elif features["high_freq_entropy"] < 5.0:
            score += 0.1

        # 4. High-frequency stability (AI is more stable)
        # Natural: > 2.0, AI: < 1.0
        if features["high_freq_stability"] < 1.0:
            score += 0.2
        elif features["high_freq_stability"] < 2.0:
            score += 0.1

        return min(score, 1.0)

    def _estimate_confidence(self, audio: np.ndarray, features: dict) -> float:
        """
        Estimate confidence based on audio quality and feature reliability.
        """
        confidence = 0.7  # Base confidence

        # Reduce confidence for very short audio
        duration = len(audio) / self.target_sr
        if duration < 2.0:
            confidence *= 0.6
        elif duration < 5.0:
            confidence *= 0.8

        # Reduce confidence if features are borderline
        if 4000 <= features["spectral_rolloff"] <= 6000:
            confidence *= 0.9

        return min(confidence, 1.0)

__init__(target_sr=16000, n_fft=2048, hop_length=512)

Initialize the spectral detector.

Parameters:

    target_sr (int, default 16000): Target sample rate for audio resampling
    n_fft (int, default 2048): FFT window size
    hop_length (int, default 512): Hop length for STFT
Source code in veridex/audio/spectral.py
def __init__(
    self,
    target_sr: int = 16000,
    n_fft: int = 2048,
    hop_length: int = 512,
):
    """
    Initialize the spectral detector.

    Args:
        target_sr: Target sample rate for audio resampling
        n_fft: FFT window size
        hop_length: Hop length for STFT
    """
    self.target_sr = target_sr
    self.n_fft = n_fft
    self.hop_length = hop_length

run(input_data)

Analyze audio for AI-generation artifacts.

Parameters:

    input_data (Any, required): Path to audio file (str or Path)

Returns:

    DetectionResult with score, confidence, and spectral metadata

Source code in veridex/audio/spectral.py
def run(self, input_data: Any) -> DetectionResult:
    """
    Analyze audio for AI-generation artifacts.

    Args:
        input_data: Path to audio file (str or Path)

    Returns:
        DetectionResult with score, confidence, and spectral metadata
    """
    if not isinstance(input_data, (str, Path)):
        return DetectionResult(
            score=0.0,
            confidence=0.0,
            error="Input must be a file path (str or Path)"
        )

    try:
        self.check_dependencies()
    except ImportError as e:
        return DetectionResult(
            score=0.0,
            confidence=0.0,
            error=str(e)
        )

    try:
        from veridex.audio.utils import load_audio, validate_audio, compute_spectrogram
        import scipy.signal as signal
        import scipy.stats as stats

        # Load audio
        audio, sr = load_audio(input_data, target_sr=self.target_sr)

        # Validate audio
        is_valid, error_msg = validate_audio(audio, sr)
        if not is_valid:
            return DetectionResult(
                score=0.0,
                confidence=0.0,
                error=error_msg
            )

        # Compute spectrogram
        spectrogram = compute_spectrogram(
            audio,
            sr=sr,
            n_fft=self.n_fft,
            hop_length=self.hop_length
        )

        # Analyze spectral features
        features = self._extract_spectral_features(spectrogram, sr)

        # Compute AI probability score
        score = self._compute_score(features)

        # Estimate confidence based on audio quality
        confidence = self._estimate_confidence(audio, features)

        return DetectionResult(
            score=score,
            confidence=confidence,
            metadata=features
        )

    except Exception as e:
        return DetectionResult(
            score=0.0,
            confidence=0.0,
            error=f"Spectral analysis failed: {e}"
        )
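
The metadata dictionary on a successful result exposes the extracted spectral features by name. A brief sketch of inspecting them (keys taken from _extract_spectral_features above; the file path is a placeholder):

from veridex.audio.spectral import SpectralSignal

result = SpectralSignal().run("clip.wav")   # placeholder path
if not result.error:
    features = result.metadata
    # Keys are defined in _extract_spectral_features(); values are floats.
    print("spectral roll-off (Hz):", features["spectral_rolloff"])
    print("high-frequency entropy:", features["high_freq_entropy"])
    print("high-frequency stability:", features["high_freq_stability"])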

Wav2VecSignal

Bases: BaseSignal

Audio deepfake detector using Wav2Vec 2.0 foundation models.

Leverages pre-trained models fine-tuned for anti-spoofing to detect synthetic speech. Models typically include:

- nii-yamagishilab/wav2vec-large-anti-deepfake
- facebook/wav2vec2-base-960h (for feature extraction)

These models provide state-of-the-art accuracy with strong generalization to unseen vocoders and TTS systems.

Note

Requires robust hardware (preferably GPU) for reasonable inference speeds.
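
A minimal sketch, assuming the class is importable from veridex.audio.wav2vec_signal, that the default HuggingFace model can be downloaded, and that pip install veridex[audio] has pulled in torch and transformers (the audio path is a placeholder):

from veridex.audio.wav2vec_signal import Wav2VecSignal

detector = Wav2VecSignal(use_gpu=True)      # runs on CPU if CUDA is unavailable
result = detector.run("suspect_voice.wav")  # placeholder path

if result.error:
    print("Wav2Vec detection failed:", result.error)
else:
    print("fake probability:", result.score)
    print("model:", result.metadata["model_id"], "on", result.metadata["device"])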

Attributes:

    name (str): 'wav2vec_audio_detector'
    dtype (str): 'audio'
    model_id (str): HuggingFace model identifier.

Source code in veridex/audio/wav2vec_signal.py
class Wav2VecSignal(BaseSignal):
    """
    Audio deepfake detector using Wav2Vec 2.0 foundation models.

    Leverages pre-trained models fine-tuned for anti-spoofing to detect synthetic speech.
    Models typically include:
    - nii-yamagishilab/wav2vec-large-anti-deepfake
    - facebook/wav2vec2-base-960h (for feature extraction)

    These models provide state-of-the-art accuracy with strong generalization to
    unseen vocoders and TTS systems.

    Note:
        Requires robust hardware (preferably GPU) for reasonable inference speeds.

    Attributes:
        name (str): 'wav2vec_audio_detector'
        dtype (str): 'audio'
        model_id (str): HuggingFace model identifier.
    """

    def __init__(
        self,
        model_id: str = "nii-yamagishilab/wav2vec-large-anti-deepfake",
        use_gpu: bool = True,
    ):
        """
        Initialize the Wav2Vec detector.

        Args:
            model_id: HuggingFace model identifier or local path
            use_gpu: Use GPU acceleration if available
        """
        self.model_id = model_id
        self.use_gpu = use_gpu
        self._model = None
        self._processor = None
        self._device = None

    @property
    def name(self) -> str:
        return "wav2vec_audio_detector"

    @property
    def dtype(self) -> str:
        return "audio"

    def check_dependencies(self) -> None:
        try:
            import torch
            import transformers
            import librosa
            import soundfile
        except ImportError:
            raise ImportError(
                "Wav2Vec detector requires 'torch', 'transformers', 'librosa', and 'soundfile'. "
                "Install with: pip install veridex[audio]"
            )

    def _load_model(self):
        """Lazy load the model."""
        if self._model is not None:
            return

        self.check_dependencies()

        import torch
        from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor

        # Determine device
        if self.use_gpu and torch.cuda.is_available():
            self._device = torch.device("cuda")
        else:
            self._device = torch.device("cpu")

        try:
            # Load processor and model
            self._processor = Wav2Vec2Processor.from_pretrained(self.model_id)
            self._model = Wav2Vec2ForSequenceClassification.from_pretrained(
                self.model_id
            ).to(self._device)
            self._model.eval()

        except Exception as e:
            # No automatic fallback is implemented here: surface a clear error
            # and suggest the lighter SpectralSignal detector instead.
            raise ImportError(
                f"Failed to load model '{self.model_id}': {e}\n"
                "The model may not be available. Consider using SpectralSignal instead."
            )

    def run(self, input_data: Any) -> DetectionResult:
        """
        Detect AI-generated audio using Wav2Vec 2.0.

        Args:
            input_data: Path to audio file (str or Path)

        Returns:
            DetectionResult with score, confidence, and model metadata
        """
        if not isinstance(input_data, (str, Path)):
            return DetectionResult(
                score=0.0,
                confidence=0.0,
                error="Input must be a file path (str or Path)"
            )

        try:
            self._load_model()
        except ImportError as e:
            return DetectionResult(
                score=0.0,
                confidence=0.0,
                error=str(e)
            )

        try:
            import torch
            from veridex.audio.utils import load_audio, validate_audio

            # Load audio at 16kHz (Wav2Vec standard)
            audio, sr = load_audio(input_data, target_sr=16000)

            # Validate
            is_valid, error_msg = validate_audio(audio, sr)
            if not is_valid:
                return DetectionResult(
                    score=0.0,
                    confidence=0.0,
                    error=error_msg
                )

            # Preprocess audio
            inputs = self._processor(
                audio,
                sampling_rate=sr,
                return_tensors="pt",
                padding=True
            )

            # Move to device
            inputs = {k: v.to(self._device) for k, v in inputs.items()}

            # Inference
            with torch.no_grad():
                outputs = self._model(**inputs)
                logits = outputs.logits

                # Get probabilities
                probs = torch.nn.functional.softmax(logits, dim=-1)

                # Assuming binary classification: [Real, Fake]
                # Model may use different label orders, check config
                fake_prob = probs[0, 1].item() if probs.shape[1] == 2 else probs[0, 0].item()

            # Compute confidence based on margin
            confidence = self._compute_confidence(probs[0].cpu().numpy())

            return DetectionResult(
                score=float(fake_prob),
                confidence=confidence,
                metadata={
                    "model_id": self.model_id,
                    "device": str(self._device),
                    "audio_duration": len(audio) / sr,
                    "logits": logits[0].cpu().tolist(),
                }
            )

        except Exception as e:
            return DetectionResult(
                score=0.0,
                confidence=0.0,
                error=f"Wav2Vec detection failed: {e}"
            )

    def _compute_confidence(self, probs: np.ndarray) -> float:
        """
        Compute confidence based on prediction margin.

        Higher margin = more confident prediction.
        """
        # Confidence based on max probability
        max_prob = np.max(probs)

        # Also consider the margin to second-best
        sorted_probs = np.sort(probs)[::-1]
        if len(sorted_probs) > 1:
            margin = sorted_probs[0] - sorted_probs[1]
            # Confidence increases with margin
            confidence = 0.5 + 0.5 * margin
        else:
            confidence = max_prob

        return float(confidence)
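
As noted in run(), the [Real, Fake] label order is an assumption; published checkpoints may order labels differently. One hedged way to check is the standard transformers id2label mapping (the model id below is the class default and must be downloadable):

from transformers import Wav2Vec2ForSequenceClassification

model = Wav2Vec2ForSequenceClassification.from_pretrained(
    "nii-yamagishilab/wav2vec-large-anti-deepfake"
)
# Maps each logit index to a label name; confirm which index means "fake"/"spoof".
print(model.config.id2label)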

__init__(model_id='nii-yamagishilab/wav2vec-large-anti-deepfake', use_gpu=True)

Initialize the Wav2Vec detector.

Parameters:

    model_id (str, default 'nii-yamagishilab/wav2vec-large-anti-deepfake'): HuggingFace model identifier or local path
    use_gpu (bool, default True): Use GPU acceleration if available
Source code in veridex/audio/wav2vec_signal.py
def __init__(
    self,
    model_id: str = "nii-yamagishilab/wav2vec-large-anti-deepfake",
    use_gpu: bool = True,
):
    """
    Initialize the Wav2Vec detector.

    Args:
        model_id: HuggingFace model identifier or local path
        use_gpu: Use GPU acceleration if available
    """
    self.model_id = model_id
    self.use_gpu = use_gpu
    self._model = None
    self._processor = None
    self._device = None

run(input_data)

Detect AI-generated audio using Wav2Vec 2.0.

Parameters:

    input_data (Any, required): Path to audio file (str or Path)

Returns:

    DetectionResult with score, confidence, and model metadata

Source code in veridex/audio/wav2vec_signal.py
def run(self, input_data: Any) -> DetectionResult:
    """
    Detect AI-generated audio using Wav2Vec 2.0.

    Args:
        input_data: Path to audio file (str or Path)

    Returns:
        DetectionResult with score, confidence, and model metadata
    """
    if not isinstance(input_data, (str, Path)):
        return DetectionResult(
            score=0.0,
            confidence=0.0,
            error="Input must be a file path (str or Path)"
        )

    try:
        self._load_model()
    except ImportError as e:
        return DetectionResult(
            score=0.0,
            confidence=0.0,
            error=str(e)
        )

    try:
        import torch
        from veridex.audio.utils import load_audio, validate_audio

        # Load audio at 16kHz (Wav2Vec standard)
        audio, sr = load_audio(input_data, target_sr=16000)

        # Validate
        is_valid, error_msg = validate_audio(audio, sr)
        if not is_valid:
            return DetectionResult(
                score=0.0,
                confidence=0.0,
                error=error_msg
            )

        # Preprocess audio
        inputs = self._processor(
            audio,
            sampling_rate=sr,
            return_tensors="pt",
            padding=True
        )

        # Move to device
        inputs = {k: v.to(self._device) for k, v in inputs.items()}

        # Inference
        with torch.no_grad():
            outputs = self._model(**inputs)
            logits = outputs.logits

            # Get probabilities
            probs = torch.nn.functional.softmax(logits, dim=-1)

            # Assuming binary classification: [Real, Fake]
            # Model may use different label orders, check config
            fake_prob = probs[0, 1].item() if probs.shape[1] == 2 else probs[0, 0].item()

        # Compute confidence based on margin
        confidence = self._compute_confidence(probs[0].cpu().numpy())

        return DetectionResult(
            score=float(fake_prob),
            confidence=confidence,
            metadata={
                "model_id": self.model_id,
                "device": str(self._device),
                "audio_duration": len(audio) / sr,
                "logits": logits[0].cpu().tolist(),
            }
        )

    except Exception as e:
        return DetectionResult(
            score=0.0,
            confidence=0.0,
            error=f"Wav2Vec detection failed: {e}"
        )
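
If the model cannot be loaded, run() returns a result whose error field suggests using SpectralSignal instead. A hedged sketch of that fallback pattern (classes and import paths are taken from this page; the file path is a placeholder):

from veridex.audio.wav2vec_signal import Wav2VecSignal
from veridex.audio.spectral import SpectralSignal

result = Wav2VecSignal().run("sample.wav")   # placeholder path
if result.error:
    # Model unavailable or inference failed: fall back to the lightweight spectral detector.
    result = SpectralSignal().run("sample.wav")
print(result.score, result.confidence)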

AASISTSignal

Bases: BaseSignal

AASIST-inspired audio deepfake detector.

Extracts spectro-temporal features that are effective for detecting vocoder artifacts:

- Temporal variation patterns in spectral components
- Non-local correlations between frequency and time
- Phase coherence across frequency bands

This implementation provides a feature-based approach that captures the key insights of the AASIST architecture without the full graph attention network complexity, balancing accuracy and computational efficiency.
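
A minimal usage sketch (import path assumed from the source location below; pip install veridex[audio] is assumed; the file name is a placeholder):

from veridex.audio.aasist_signal import AASISTSignal

detector = AASISTSignal(n_fft=512, hop_length=256)   # the class defaults, shown explicitly
result = detector.run("narration.wav")               # placeholder path
if not result.error:
    print("score:", result.score)
    print("phase coherence:", result.metadata["phase_coherence"])
    print("energy uniformity:", result.metadata["energy_uniformity"])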

Attributes:

    name (str): 'aasist_audio_detector'
    dtype (str): 'audio'

Source code in veridex/audio/aasist_signal.py
class AASISTSignal(BaseSignal):
    """
    AASIST-inspired audio deepfake detector.

    Extracts spectro-temporal features that are effective for detecting vocoder artifacts:
    - Temporal variation patterns in spectral components
    - Non-local correlations between frequency and time
    - Phase coherence across frequency bands

    This implementation provides a feature-based approach that captures the key insights
    of the AASIST architecture without the full graph attention network complexity,
    balancing accuracy and computational efficiency.

    Attributes:
        name (str): 'aasist_audio_detector'
        dtype (str): 'audio'
    """

    def __init__(
        self,
        target_sr: int = 16000,
        n_fft: int = 512,
        hop_length: int = 256,
    ):
        """
        Initialize the AASIST-inspired detector.

        Args:
            target_sr: Target sample rate
            n_fft: FFT window size
            hop_length: Hop length for STFT
        """
        self.target_sr = target_sr
        self.n_fft = n_fft
        self.hop_length = hop_length

    @property
    def name(self) -> str:
        return "aasist_audio_detector"

    @property
    def dtype(self) -> str:
        return "audio"

    def check_dependencies(self) -> None:
        try:
            import librosa
            import soundfile
            import scipy
        except ImportError:
            raise ImportError(
                "AASIST detector requires 'librosa', 'soundfile', and 'scipy'. "
                "Install with: pip install veridex[audio]"
            )

    def run(self, input_data: Any) -> DetectionResult:
        """
        Analyze audio using spectro-temporal features.

        Args:
            input_data: Path to audio file (str or Path)

        Returns:
            DetectionResult with score, confidence, and feature metadata
        """
        if not isinstance(input_data, (str, Path)):
            return DetectionResult(
                score=0.0,
                confidence=0.0,
                error="Input must be a file path (str or Path)"
            )

        try:
            self.check_dependencies()
        except ImportError as e:
            return DetectionResult(
                score=0.0,
                confidence=0.0,
                error=str(e)
            )

        try:
            from veridex.audio.utils import (
                load_audio,
                validate_audio,
                extract_mel_spectrogram,
            )
            import scipy.stats as stats

            # Load audio
            audio, sr = load_audio(input_data, target_sr=self.target_sr)

            # Validate
            is_valid, error_msg = validate_audio(audio, sr)
            if not is_valid:
                return DetectionResult(
                    score=0.0,
                    confidence=0.0,
                    error=error_msg
                )

            # Extract mel-spectrogram
            mel_spec = extract_mel_spectrogram(
                audio,
                sr=sr,
                n_mels=80,
                n_fft=self.n_fft,
                hop_length=self.hop_length
            )

            # Extract spectro-temporal features
            features = self._extract_spectro_temporal_features(mel_spec, audio, sr)

            # Compute AI probability score
            score = self._compute_score(features)

            # Estimate confidence
            confidence = self._estimate_confidence(audio, features, sr)

            return DetectionResult(
                score=score,
                confidence=confidence,
                metadata=features
            )

        except Exception as e:
            return DetectionResult(
                score=0.0,
                confidence=0.0,
                error=f"AASIST analysis failed: {e}"
            )

    def _extract_spectro_temporal_features(
        self,
        mel_spec: np.ndarray,
        audio: np.ndarray,
        sr: int
    ) -> dict:
        """
        Extract spectro-temporal features inspired by AASIST.
        """
        import scipy.stats as stats
        from scipy.signal import stft

        # 1. Temporal modulation features
        # Measure variation in each frequency band over time
        temporal_variation = np.std(mel_spec, axis=1)
        mean_temporal_variation = float(np.mean(temporal_variation))
        max_temporal_variation = float(np.max(temporal_variation))

        # 2. Spectral modulation features
        # Measure variation across frequency at each time
        spectral_variation = np.std(mel_spec, axis=0)
        mean_spectral_variation = float(np.mean(spectral_variation))

        # 3. Phase coherence
        # AI vocoders often have unnatural phase relationships
        f, t, Zxx = stft(audio, fs=sr, nperseg=self.n_fft, noverlap=self.n_fft - self.hop_length)
        phase = np.angle(Zxx)

        # Phase derivative (instantaneous frequency deviation)
        phase_diff = np.diff(phase, axis=1)
        phase_coherence = float(np.mean(np.abs(phase_diff)))
        phase_std = float(np.std(phase_diff))

        # 4. Energy distribution over time
        # AI often has more uniform energy distribution
        frame_energy = np.sum(mel_spec, axis=0)
        energy_entropy = float(stats.entropy(frame_energy + 1e-10))
        energy_uniformity = float(1.0 / (1.0 + np.std(frame_energy)))

        # 5. Cross-correlation between frequency bands
        # Natural speech has specific correlations, AI differs
        # Sample a few bands to reduce computation
        bands = [0, mel_spec.shape[0]//4, mel_spec.shape[0]//2, 3*mel_spec.shape[0]//4, -1]
        correlations = []
        for i in range(len(bands) - 1):
            band1 = mel_spec[bands[i], :]
            band2 = mel_spec[bands[i+1], :]
            corr = np.corrcoef(band1, band2)[0, 1]
            if not np.isnan(corr):
                correlations.append(abs(corr))

        mean_band_correlation = float(np.mean(correlations)) if correlations else 0.0

        # 6. Spectral flux (measure of spectral change)
        spectral_flux = np.sqrt(np.sum(np.diff(mel_spec, axis=1)**2, axis=0))
        mean_spectral_flux = float(np.mean(spectral_flux))

        return {
            "mean_temporal_variation": mean_temporal_variation,
            "max_temporal_variation": max_temporal_variation,
            "mean_spectral_variation": mean_spectral_variation,
            "phase_coherence": phase_coherence,
            "phase_std": phase_std,
            "energy_entropy": energy_entropy,
            "energy_uniformity": energy_uniformity,
            "mean_band_correlation": mean_band_correlation,
            "mean_spectral_flux": mean_spectral_flux,
        }

    def _compute_score(self, features: dict) -> float:
        """
        Compute AI probability from spectro-temporal features.

        AI-generated audio typically shows:
        - Lower temporal variation (smoother transitions)
        - Higher energy uniformity (more consistent amplitude)
        - Unusual phase relationships
        - Lower spectral flux (less dynamic spectral changes)
        """
        score = 0.0

        # 1. Temporal variation (AI is smoother)
        # Natural: > 15, AI: < 10
        if features["mean_temporal_variation"] < 8.0:
            score += 0.2
        elif features["mean_temporal_variation"] < 12.0:
            score += 0.1

        # 2. Energy uniformity (AI is more uniform)
        # Natural: < 0.3, AI: > 0.5
        if features["energy_uniformity"] > 0.6:
            score += 0.25
        elif features["energy_uniformity"] > 0.4:
            score += 0.15

        # 3. Phase coherence (AI has different patterns)
        # This is complex, use as moderate signal
        if features["phase_coherence"] < 1.0 or features["phase_coherence"] > 3.0:
            score += 0.15

        # 4. Spectral flux (AI has less dynamic changes)
        # Natural: > 20, AI: < 15
        if features["mean_spectral_flux"] < 12.0:
            score += 0.2
        elif features["mean_spectral_flux"] < 18.0:
            score += 0.1

        # 5. Band correlation (AI has unusual patterns)
        # Natural: 0.3-0.7, AI: < 0.3 or > 0.8
        if features["mean_band_correlation"] > 0.8 or features["mean_band_correlation"] < 0.25:
            score += 0.2

        return min(score, 1.0)

    def _estimate_confidence(
        self,
        audio: np.ndarray,
        features: dict,
        sr: int
    ) -> float:
        """Estimate confidence based on signal quality."""
        confidence = 0.65  # Base confidence

        # Longer audio = more confident
        duration = len(audio) / sr
        if duration > 5.0:
            confidence += 0.1
        elif duration < 1.0:
            confidence -= 0.2

        # Check if features are in decisive ranges
        if features["energy_uniformity"] > 0.6 or features["energy_uniformity"] < 0.2:
            confidence += 0.05

        return min(max(confidence, 0.0), 1.0)

__init__(target_sr=16000, n_fft=512, hop_length=256)

Initialize the AASIST-inspired detector.

Parameters:

    target_sr (int, default 16000): Target sample rate
    n_fft (int, default 512): FFT window size
    hop_length (int, default 256): Hop length for STFT
Source code in veridex/audio/aasist_signal.py
def __init__(
    self,
    target_sr: int = 16000,
    n_fft: int = 512,
    hop_length: int = 256,
):
    """
    Initialize the AASIST-inspired detector.

    Args:
        target_sr: Target sample rate
        n_fft: FFT window size
        hop_length: Hop length for STFT
    """
    self.target_sr = target_sr
    self.n_fft = n_fft
    self.hop_length = hop_length

run(input_data)

Analyze audio using spectro-temporal features.

Parameters:

    input_data (Any, required): Path to audio file (str or Path)

Returns:

    DetectionResult with score, confidence, and feature metadata

Source code in veridex/audio/aasist_signal.py
def run(self, input_data: Any) -> DetectionResult:
    """
    Analyze audio using spectro-temporal features.

    Args:
        input_data: Path to audio file (str or Path)

    Returns:
        DetectionResult with score, confidence, and feature metadata
    """
    if not isinstance(input_data, (str, Path)):
        return DetectionResult(
            score=0.0,
            confidence=0.0,
            error="Input must be a file path (str or Path)"
        )

    try:
        self.check_dependencies()
    except ImportError as e:
        return DetectionResult(
            score=0.0,
            confidence=0.0,
            error=str(e)
        )

    try:
        from veridex.audio.utils import (
            load_audio,
            validate_audio,
            extract_mel_spectrogram,
        )
        import scipy.stats as stats

        # Load audio
        audio, sr = load_audio(input_data, target_sr=self.target_sr)

        # Validate
        is_valid, error_msg = validate_audio(audio, sr)
        if not is_valid:
            return DetectionResult(
                score=0.0,
                confidence=0.0,
                error=error_msg
            )

        # Extract mel-spectrogram
        mel_spec = extract_mel_spectrogram(
            audio,
            sr=sr,
            n_mels=80,
            n_fft=self.n_fft,
            hop_length=self.hop_length
        )

        # Extract spectro-temporal features
        features = self._extract_spectro_temporal_features(mel_spec, audio, sr)

        # Compute AI probability score
        score = self._compute_score(features)

        # Estimate confidence
        confidence = self._estimate_confidence(audio, features, sr)

        return DetectionResult(
            score=score,
            confidence=confidence,
            metadata=features
        )

    except Exception as e:
        return DetectionResult(
            score=0.0,
            confidence=0.0,
            error=f"AASIST analysis failed: {e}"
        )
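
Because each signal returns a score and a confidence, results from several detectors can be combined. This is not a veridex API, only an illustrative sketch of a plain confidence-weighted average over whichever signals returned without an error (the file path is a placeholder):

from veridex.audio.spectral import SpectralSignal
from veridex.audio.aasist_signal import AASISTSignal

signals = [SpectralSignal(), AASISTSignal()]
results = [s.run("clip.wav") for s in signals]        # placeholder path
usable = [r for r in results if not r.error and r.confidence > 0]
if usable:
    combined = sum(r.score * r.confidence for r in usable) / sum(r.confidence for r in usable)
    print("combined score:", round(combined, 3))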

BreathingSignal

Bases: BaseSignal

Detects AI-generated audio by analyzing breathing patterns.

Natural speech has regular inhalations (breaths) that have distinct spectral and temporal characteristics (broadband noise, specific duration). AI models often suppress these or generate them poorly.
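
A brief usage sketch (import path assumed from the source location below; the file name is a placeholder; metadata keys come from _compute_breath_metrics):

from veridex.audio.breathing_signal import BreathingSignal

result = BreathingSignal().run("podcast_clip.wav")   # placeholder path
if not result.error:
    m = result.metadata
    print("breaths per minute:", m["breaths_per_minute"])
    print("breaths detected:", m["num_breaths"])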

Attributes:

    name (str): 'breathing_audio_detector'
    dtype (str): 'audio'

Source code in veridex/audio/breathing_signal.py
class BreathingSignal(BaseSignal):
    """
    Detects AI-generated audio by analyzing breathing patterns.

    Natural speech has regular inhalations (breaths) that have distinct
    spectral and temporal characteristics (broadband noise, specific duration).
    AI models often suppress these or generate them poorly.

    Attributes:
        name (str): 'breathing_audio_detector'
        dtype (str): 'audio'
    """

    def __init__(self, target_sr: int = 16000):
        self.target_sr = target_sr

    @property
    def name(self) -> str:
        return "breathing_audio_detector"

    @property
    def dtype(self) -> str:
        return "audio"

    def check_dependencies(self) -> None:
        try:
            import librosa
            import scipy
        except ImportError:
            raise ImportError(
                "Breathing detector requires 'librosa' and 'scipy'. "
                "Install with: pip install veridex[audio]"
            )

    def run(self, input_data: Any) -> DetectionResult:
        """
        Analyze audio for breathing patterns.

        Args:
            input_data: Path to audio file.

        Returns:
            DetectionResult with score (high score = AI), confidence, and metadata.
        """
        if not isinstance(input_data, (str, Path)):
            return DetectionResult(
                score=0.0,
                confidence=0.0,
                error="Input must be a file path"
            )

        try:
            self.check_dependencies()
        except ImportError as e:
            return DetectionResult(score=0.0, confidence=0.0, error=str(e))

        try:
            from veridex.audio.utils import load_audio, validate_audio
            import librosa

            # Load audio
            audio, sr = load_audio(input_data, target_sr=self.target_sr)

            # Validate
            is_valid, error_msg = validate_audio(audio, sr)
            if not is_valid:
                return DetectionResult(score=0.0, confidence=0.0, error=error_msg)

            # Detect breaths
            breaths = self._detect_breaths(audio, sr)

            # Compute features
            metrics = self._compute_breath_metrics(breaths, len(audio)/sr)

            # Compute score
            score = self._compute_score(metrics)

            # Compute confidence
            confidence = self._compute_confidence(metrics, len(audio)/sr)

            return DetectionResult(
                score=score,
                confidence=confidence,
                metadata=metrics
            )

        except Exception as e:
            return DetectionResult(score=0.0, confidence=0.0, error=f"Breathing detection failed: {e}")

    def _detect_breaths(self, audio: np.ndarray, sr: int) -> List[Tuple[float, float]]:
        """
        Detect potential breath segments.

        Breaths are characterized by:
        - Low to medium energy (silence < breath < speech)
        - High spectral centroid (high frequency noise)
        - Specific duration (typically 0.1s to 0.8s)
        """
        import librosa

        # 1. Feature Extraction
        hop_length = 512
        frame_length = 2048

        # RMS Energy
        rms = librosa.feature.rms(y=audio, frame_length=frame_length, hop_length=hop_length)[0]

        # Spectral Centroid
        centroid = librosa.feature.spectral_centroid(y=audio, sr=sr, n_fft=frame_length, hop_length=hop_length)[0]

        # Zero Crossing Rate
        zcr = librosa.feature.zero_crossing_rate(y=audio, frame_length=frame_length, hop_length=hop_length)[0]

        # 2. Thresholding
        # Heuristics for breath detection (simplified from literature)

        # Normalize features
        rms_norm = (rms - np.min(rms)) / (np.max(rms) - np.min(rms) + 1e-9)

        # Breaths usually have low energy but not absolute silence
        # Speech has high energy
        # Silence has near zero energy

        # Masks
        # 1. Energy window: Not too loud (speech), not too quiet (silence)
        # These thresholds are heuristic and might need tuning
        is_breath_energy = (rms_norm > 0.01) & (rms_norm < 0.2)

        # 2. High frequency content (breaths are hissy)
        # Centroid usually > 2000Hz-3000Hz for breaths
        is_breath_freq = (centroid > 2500)

        # 3. High Zero Crossing Rate
        is_breath_zcr = (zcr > 0.1)

        # Combine
        is_breath_frame = is_breath_energy & is_breath_freq & is_breath_zcr

        # 3. Group frames into segments
        breaths = []
        in_breath = False
        start_frame = 0
        min_breath_frames = int(0.15 * sr / hop_length) # Min 150ms
        max_breath_frames = int(1.2 * sr / hop_length)  # Max 1.2s

        for i, is_breath in enumerate(is_breath_frame):
            if is_breath and not in_breath:
                in_breath = True
                start_frame = i
            elif not is_breath and in_breath:
                in_breath = False
                duration_frames = i - start_frame

                if min_breath_frames <= duration_frames <= max_breath_frames:
                    # Convert to seconds
                    start_time = start_frame * hop_length / sr
                    end_time = i * hop_length / sr
                    breaths.append((start_time, end_time))

        return breaths

    def _compute_breath_metrics(self, breaths: List[Tuple[float, float]], duration: float) -> dict:
        """Compute statistics about detected breaths."""
        num_breaths = len(breaths)
        total_breath_duration = sum(end - start for start, end in breaths)

        bpm = (num_breaths / duration) * 60 if duration > 0 else 0
        breath_ratio = total_breath_duration / duration if duration > 0 else 0
        avg_breath_duration = total_breath_duration / num_breaths if num_breaths > 0 else 0

        # Calculate regularity (std dev of intervals)
        intervals = []
        for i in range(len(breaths) - 1):
            intervals.append(breaths[i+1][0] - breaths[i][1]) # End of one to start of next

        interval_std = float(np.std(intervals)) if len(intervals) > 1 else 0.0

        return {
            "num_breaths": num_breaths,
            "breaths_per_minute": bpm,
            "breath_ratio": breath_ratio,
            "avg_breath_duration": avg_breath_duration,
            "interval_std": interval_std,
            "duration": duration,
            "breaths": breaths # List of (start, end)
        }

    def _compute_score(self, metrics: dict) -> float:
        """
        Compute AI probability score.

        Hypothesis:
        - AI speech (especially older or standard TTS) often lacks breaths entirely -> High Score.
        - Or breaths are very regular/robotic (low interval std) -> Medium Score.
        - Human speech has natural, semi-regular breathing -> Low Score.
        """
        duration = metrics["duration"]
        bpm = metrics["breaths_per_minute"]

        # If very short audio, unreliable
        if duration < 3.0:
            return 0.5 # Neutral

        score = 0.0

        # 1. Lack of breaths (The strongest signal for many TTS)
        # Humans typically breathe 10-20 times per minute in conversation,
        # but in reading/acting it varies.
        # < 2 BPM is very suspicious for continuous speech > 10s
        if bpm < 1.0:
            score = 0.95
        elif bpm < 3.0:
            score = 0.8
        elif bpm < 5.0:
            score = 0.6
        else:
            # 2. Too many breaths (Hyper-breathing deepfakes exist but rare)
            if bpm > 40:
                score = 0.7
            else:
                score = 0.1 # Likely human

        return score

    def _compute_confidence(self, metrics: dict, duration: float) -> float:
        """Estimate confidence in the detection."""
        confidence = 0.7 # Base

        # Short audio is hard to judge for breathing patterns
        if duration < 5.0:
            confidence = 0.3
        elif duration < 10.0:
            confidence = 0.5
        elif duration > 20.0:
            confidence = 0.9

        return confidence

run(input_data)

Analyze audio for breathing patterns.

Parameters:

    input_data (Any, required): Path to audio file.

Returns:

    DetectionResult with score (high score = AI), confidence, and metadata.

Source code in veridex/audio/breathing_signal.py
def run(self, input_data: Any) -> DetectionResult:
    """
    Analyze audio for breathing patterns.

    Args:
        input_data: Path to audio file.

    Returns:
        DetectionResult with score (high score = AI), confidence, and metadata.
    """
    if not isinstance(input_data, (str, Path)):
        return DetectionResult(
            score=0.0,
            confidence=0.0,
            error="Input must be a file path"
        )

    try:
        self.check_dependencies()
    except ImportError as e:
        return DetectionResult(score=0.0, confidence=0.0, error=str(e))

    try:
        from veridex.audio.utils import load_audio, validate_audio
        import librosa

        # Load audio
        audio, sr = load_audio(input_data, target_sr=self.target_sr)

        # Validate
        is_valid, error_msg = validate_audio(audio, sr)
        if not is_valid:
            return DetectionResult(score=0.0, confidence=0.0, error=error_msg)

        # Detect breaths
        breaths = self._detect_breaths(audio, sr)

        # Compute features
        metrics = self._compute_breath_metrics(breaths, len(audio)/sr)

        # Compute score
        score = self._compute_score(metrics)

        # Compute confidence
        confidence = self._compute_confidence(metrics, len(audio)/sr)

        return DetectionResult(
            score=score,
            confidence=confidence,
            metadata=metrics
        )

    except Exception as e:
        return DetectionResult(score=0.0, confidence=0.0, error=f"Breathing detection failed: {e}")
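
The metadata also includes the detected breath spans themselves, as (start, end) times in seconds. A short sketch of listing them (the file path is a placeholder):

from veridex.audio.breathing_signal import BreathingSignal

result = BreathingSignal().run("long_take.wav")      # placeholder path
if not result.error:
    # metadata["breaths"] is a list of (start, end) times in seconds.
    for start, end in result.metadata["breaths"]:
        print(f"breath from {start:.2f}s to {end:.2f}s")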

SilenceSignal

Bases: BaseSignal

Analyzes silence intervals (pauses) in speech audio.

Synthetic speech (TTS) often has regular, unnatural, or non-existent pauses compared to natural speech. This signal calculates the ratio of silence to total audio duration and the variance of silence durations.
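
A brief sketch showing the two accepted input forms documented in run() below, a file path or an already-loaded (y, sr) tuple from librosa (import path assumed from the source location; the file name is a placeholder):

import librosa

from veridex.audio.silence import SilenceSignal

detector = SilenceSignal()

# Either a file path...
result = detector.run("speech.wav")        # placeholder path

# ...or an already-loaded (y, sr) tuple.
y, sr = librosa.load("speech.wav", sr=None)
result = detector.run((y, sr))

if not result.error:
    print("silence ratio:", result.metadata["silence_ratio"])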

Source code in veridex/audio/silence.py
class SilenceSignal(BaseSignal):
    """
    Analyzes silence intervals (pauses) in speech audio.

    Synthetic speech (TTS) often has regular, unnatural, or non-existent pauses compared to natural speech.
    This signal calculates the ratio of silence to total audio duration and the variance of silence durations.
    """

    @property
    def name(self) -> str:
        return "silence_analysis"

    @property
    def dtype(self) -> str:
        return "audio"

    def check_dependencies(self) -> None:
        try:
            import librosa
        except ImportError:
            raise ImportError("librosa is required for SilenceSignal. Install veridex[audio].")

    def run(self, input_data: Any) -> DetectionResult:
        """
        Runs silence analysis on an audio file path or a (y, sr) tuple.
        """
        try:
            self.check_dependencies()
            import librosa
            if isinstance(input_data, str):
                y, sr = librosa.load(input_data, sr=None)
            elif isinstance(input_data, tuple):
                # Assume (y, sr) tuple
                y, sr = input_data
            else:
                 return DetectionResult(
                    score=0.0,
                    confidence=0.0,
                    error="Input must be a file path or (y, sr) tuple."
                )

            # Detect non-silent intervals
            # top_db: The threshold (in decibels) below reference to consider as silence
            non_silent_intervals = librosa.effects.split(y, top_db=20)

            if len(non_silent_intervals) == 0:
                 return DetectionResult(
                    score=0.0, # Cannot determine
                    confidence=0.0,
                    explanation="Audio is completely silent."
                )

            total_duration = len(y) / sr

            # Calculate total non-silent duration
            non_silent_duration = sum([(end - start) for start, end in non_silent_intervals]) / sr
            silence_duration = total_duration - non_silent_duration
            silence_ratio = silence_duration / total_duration

            # Analyze pause lengths (gaps between intervals)
            pause_lengths = []
            for i in range(len(non_silent_intervals) - 1):
                # End of current minus start of next
                pause_samples = non_silent_intervals[i+1][0] - non_silent_intervals[i][1]
                pause_lengths.append(pause_samples / sr)

            if pause_lengths:
                mean_pause = np.mean(pause_lengths)
                std_pause = np.std(pause_lengths)
            else:
                mean_pause = 0.0
                std_pause = 0.0

            # Heuristic:
            # - Very low silence ratio -> typically synthetic (early TTS)
            # - Very low variance in pause lengths -> synthetic (robotic pacing)

            # Simple score: if silence ratio is very low (< 5%), likely AI.
            # Using a gaussian-like drop off? Let's keep it simple linear for now.

            is_suspiciously_continuous = 1.0 if silence_ratio < 0.05 else 0.0

            # If standard deviation of pauses is very low (e.g. < 0.05s), it's robotic
            is_robotic_pacing = 1.0 if (len(pause_lengths) > 2 and std_pause < 0.05) else 0.0

            score = max(is_suspiciously_continuous, is_robotic_pacing)

            # Confidence based on amount of evidence
            # More pauses analyzed = higher confidence
            if len(pause_lengths) > 10:
                base_confidence = 0.45
            elif len(pause_lengths) > 5:
                base_confidence = 0.4
            else:
                base_confidence = 0.35

            # Boost confidence if signal is strong (clear detection)
            if score > 0.5:
                confidence = min(base_confidence + 0.05, 0.5)
            else:
                confidence = base_confidence

            return DetectionResult(
                score=score,
                confidence=confidence,
                metadata={
                    "silence_ratio": float(silence_ratio),
                    "mean_pause_duration": float(mean_pause),
                    "pause_duration_std": float(std_pause),
                    "total_duration": float(total_duration),
                    "num_pauses": len(pause_lengths)
                }
            )

        except Exception as e:
            return DetectionResult(
                score=0.0,
                confidence=0.0,
                error=f"Silence analysis failed: {str(e)}"
            )

run(input_data)

Runs silence analysis on an audio file path or a (y, sr) tuple.

Source code in veridex/audio/silence.py
def run(self, input_data: Any) -> DetectionResult:
    """
    Runs silence analysis on an audio file path or a (y, sr) tuple.
    """
    try:
        self.check_dependencies()
        import librosa
        if isinstance(input_data, str):
            y, sr = librosa.load(input_data, sr=None)
        elif isinstance(input_data, tuple):
            # Assume (y, sr) tuple
            y, sr = input_data
        else:
             return DetectionResult(
                score=0.0,
                confidence=0.0,
                error="Input must be a file path or (y, sr) tuple."
            )

        # Detect non-silent intervals
        # top_db: The threshold (in decibels) below reference to consider as silence
        non_silent_intervals = librosa.effects.split(y, top_db=20)

        if len(non_silent_intervals) == 0:
             return DetectionResult(
                score=0.0, # Cannot determine
                confidence=0.0,
                explanation="Audio is completely silent."
            )

        total_duration = len(y) / sr

        # Calculate total non-silent duration
        non_silent_duration = sum([(end - start) for start, end in non_silent_intervals]) / sr
        silence_duration = total_duration - non_silent_duration
        silence_ratio = silence_duration / total_duration

        # Analyze pause lengths (gaps between intervals)
        pause_lengths = []
        for i in range(len(non_silent_intervals) - 1):
            # End of current minus start of next
            pause_samples = non_silent_intervals[i+1][0] - non_silent_intervals[i][1]
            pause_lengths.append(pause_samples / sr)

        if pause_lengths:
            mean_pause = np.mean(pause_lengths)
            std_pause = np.std(pause_lengths)
        else:
            mean_pause = 0.0
            std_pause = 0.0

        # Heuristic:
        # - Very low silence ratio -> typically synthetic (early TTS)
        # - Very low variance in pause lengths -> synthetic (robotic pacing)

        # Simple score: if silence ratio is very low (< 5%), likely AI.
        # Using a gaussian-like drop off? Let's keep it simple linear for now.

        is_suspiciously_continuous = 1.0 if silence_ratio < 0.05 else 0.0

        # If standard deviation of pauses is very low (e.g. < 0.05s), it's robotic
        is_robotic_pacing = 1.0 if (len(pause_lengths) > 2 and std_pause < 0.05) else 0.0

        score = max(is_suspiciously_continuous, is_robotic_pacing)

        # Confidence based on amount of evidence
        # More pauses analyzed = higher confidence
        if len(pause_lengths) > 10:
            base_confidence = 0.45
        elif len(pause_lengths) > 5:
            base_confidence = 0.4
        else:
            base_confidence = 0.35

        # Boost confidence if signal is strong (clear detection)
        if score > 0.5:
            confidence = min(base_confidence + 0.05, 0.5)
        else:
            confidence = base_confidence

        return DetectionResult(
            score=score,
            confidence=confidence,
            metadata={
                "silence_ratio": float(silence_ratio),
                "mean_pause_duration": float(mean_pause),
                "pause_duration_std": float(std_pause),
                "total_duration": float(total_duration),
                "num_pauses": len(pause_lengths)
            }
        )

    except Exception as e:
        return DetectionResult(
            score=0.0,
            confidence=0.0,
            error=f"Silence analysis failed: {str(e)}"
        )