Text Signals API

PerplexitySignal

Bases: BaseSignal

Analyzes text complexity using Perplexity metrics.

This signal assumes that AI-generated text tends to have lower perplexity (more predictable) compared to human-written text.

Mechanism:

1. Tokenize input text using a pretrained tokenizer (e.g., GPT-2).
2. Calculate perplexity using the corresponding language model.
3. Map perplexity to an AI probability score using a logistic function.
   - Low perplexity -> High AI Probability.
   - High perplexity -> Low AI Probability (Human).

Attributes:

- model_name (str): Name of the underlying model (default: "gpt2").
- device (Optional[str]): Device to run the model on ('cpu', 'cuda').
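A minimal usage sketch, assuming PerplexitySignal is imported from veridex/text/perplexity.py (shown below) and the text extras are installed (pip install veridex[text]):

from veridex.text.perplexity import PerplexitySignal

# The first call downloads GPT-2 and loads it onto the auto-detected device.
signal = PerplexitySignal(model_name="gpt2")
result = signal.run("The committee approved the proposal after a lengthy discussion.")

print(f"AI probability: {result.score:.2f}")
print(f"Mean perplexity: {result.metadata['mean_perplexity']:.1f}")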

Source code in veridex/text/perplexity.py
class PerplexitySignal(BaseSignal):
    """
    Analyzes text complexity using Perplexity metrics.

    This signal assumes that AI-generated text tends to have lower perplexity (more predictable)
    compared to human-written text.

    **Mechanism**:
    1.  Tokenize input text using a pretrained tokenizer (e.g., GPT-2).
    2.  Calculate perplexity using the corresponding language model.
    3.  Map perplexity to an AI probability score using a logistic function.
        - Low perplexity -> High AI Probability.
        - High perplexity -> Low AI Probability (Human).

    Attributes:
        model_name (str): Name of the underlying model (default: "gpt2").
        device (Optional[str]): Device to run model on ('cpu', 'cuda').
    """

    def __init__(self, model_name: str = "gpt2", device: Optional[str] = None):
        """
        Initialize the Perplexity signal.

        Args:
            model_name (str): Identifier for the model used to calculate perplexity.
            device (Optional[str]): Device to run model on.
        """
        self.model_name = model_name
        self._device = device
        self._model = None
        self._tokenizer = None

    @property
    def name(self) -> str:
        """Returns 'perplexity'."""
        return "perplexity"

    @property
    def dtype(self) -> str:
        """Returns 'text'."""
        return "text"

    def check_dependencies(self) -> None:
        try:
            import torch
            import transformers
        except ImportError:
            raise ImportError(
                "PerplexitySignal requires 'torch' and 'transformers'. "
                "Install with `pip install veridex[text]`"
            )

    def _load_model(self):
        if self._model is not None:
            return

        self.check_dependencies()
        import torch
        from transformers import AutoModelForCausalLM, AutoTokenizer

        if self._device is None:
            self._device = "cuda" if torch.cuda.is_available() else "cpu"

        self._tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self._model = AutoModelForCausalLM.from_pretrained(self.model_name).to(self._device)
        self._model.eval()

    def run(self, input_data: Any) -> DetectionResult:
        """
        Calculate perplexity and convert to an AI score.

        Args:
            input_data (str): Text to analyze.

        Returns:
            DetectionResult:
                - score: 0.0-1.0 AI probability.
                - metadata: {'mean_perplexity', 'model_id'}.
        """
        if not isinstance(input_data, str):
             return DetectionResult(score=0.0, confidence=0.0, error="Input must be a string")

        try:
            self._load_model()
            import torch

            # Tokenize with truncation to max length (usually 1024 for GPT-2)
            # This prevents crashes on long text.
            inputs = self._tokenizer(
                input_data,
                return_tensors="pt",
                truncation=True,
                max_length=1024
            )

            if inputs["input_ids"].shape[1] < 2:
                 # Too short for meaningful perplexity
                 return DetectionResult(score=0.5, confidence=0.0, metadata={"reason": "Text too short"})

            inputs = {k: v.to(self._device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = self._model(**inputs, labels=inputs["input_ids"])
                loss = outputs.loss
                perplexity = torch.exp(loss).item()

            # Heuristic mapping from Perplexity to AI Score
            # Threshold = 50. If PPL < 50, P(AI) > 0.5.
            threshold = 50.0
            scale = 10.0
            score = 1.0 / (1.0 + np.exp((perplexity - threshold) / scale))

            return DetectionResult(
                score=float(score),
                confidence=0.7,
                metadata={
                    "mean_perplexity": perplexity,
                    "model_id": self.model_name
                }
            )

        except ImportError as e:
            return DetectionResult(score=0.0, confidence=0.0, error=str(e))
        except Exception as e:
             return DetectionResult(score=0.0, confidence=0.0, error=str(e))

name property

Returns 'perplexity'.

dtype property

Returns 'text'.

__init__(model_name='gpt2', device=None)

Initialize the Perplexity signal.

Parameters:

- model_name (str): Identifier for the model used to calculate perplexity. Default: 'gpt2'.
- device (Optional[str]): Device to run the model on. Default: None.
Source code in veridex/text/perplexity.py
def __init__(self, model_name: str = "gpt2", device: Optional[str] = None):
    """
    Initialize the Perplexity signal.

    Args:
        model_name (str): Identifier for the model used to calculate perplexity.
        device (Optional[str]): Device to run model on.
    """
    self.model_name = model_name
    self._device = device
    self._model = None
    self._tokenizer = None

run(input_data)

Calculate perplexity and convert to an AI score.

Parameters:

- input_data (str): Text to analyze. Required.

Returns:

DetectionResult:
- score: 0.0-1.0 AI probability.
- metadata: {'mean_perplexity', 'model_id'}.
Source code in veridex/text/perplexity.py
def run(self, input_data: Any) -> DetectionResult:
    """
    Calculate perplexity and convert to an AI score.

    Args:
        input_data (str): Text to analyze.

    Returns:
        DetectionResult:
            - score: 0.0-1.0 AI probability.
            - metadata: {'mean_perplexity', 'model_id'}.
    """
    if not isinstance(input_data, str):
         return DetectionResult(score=0.0, confidence=0.0, error="Input must be a string")

    try:
        self._load_model()
        import torch

        # Tokenize with truncation to max length (usually 1024 for GPT-2)
        # This prevents crashes on long text.
        inputs = self._tokenizer(
            input_data,
            return_tensors="pt",
            truncation=True,
            max_length=1024
        )

        if inputs["input_ids"].shape[1] < 2:
             # Too short for meaningful perplexity
             return DetectionResult(score=0.5, confidence=0.0, metadata={"reason": "Text too short"})

        inputs = {k: v.to(self._device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = self._model(**inputs, labels=inputs["input_ids"])
            loss = outputs.loss
            perplexity = torch.exp(loss).item()

        # Heuristic mapping from Perplexity to AI Score
        # Threshold = 50. If PPL < 50, P(AI) > 0.5.
        threshold = 50.0
        scale = 10.0
        score = 1.0 / (1.0 + np.exp((perplexity - threshold) / scale))

        return DetectionResult(
            score=float(score),
            confidence=0.7,
            metadata={
                "mean_perplexity": perplexity,
                "model_id": self.model_name
            }
        )

    except ImportError as e:
        return DetectionResult(score=0.0, confidence=0.0, error=str(e))
    except Exception as e:
         return DetectionResult(score=0.0, confidence=0.0, error=str(e))
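For reference, the perplexity-to-score mapping in run() is a fixed logistic heuristic (threshold 50, scale 10). A standalone sketch of that mapping, with the helper name perplexity_to_ai_score introduced here only for illustration:

import numpy as np

def perplexity_to_ai_score(perplexity: float, threshold: float = 50.0, scale: float = 10.0) -> float:
    # Same logistic curve used inside PerplexitySignal.run().
    return 1.0 / (1.0 + np.exp((perplexity - threshold) / scale))

print(perplexity_to_ai_score(20.0))   # ~0.95 -> strongly AI-leaning
print(perplexity_to_ai_score(50.0))   # 0.50  -> neutral
print(perplexity_to_ai_score(100.0))  # ~0.01 -> strongly human-leaning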

ZlibEntropySignal

Bases: BaseSignal

Detects AI content using compression ratio (zlib entropy).

This method employs a compression-based approach under the hypothesis that AI-generated content is more predictable (lower entropy) and thus more compressible than human content.

Algorithm

ratio = len(zlib(text)) / len(text)

- Lower ratio (< 0.6) -> Highly compressible -> Likely AI.
- Higher ratio (> 0.8) -> Less compressible -> Likely Human.

Attributes:

- name (str): 'zlib_entropy'
- dtype (str): 'text'
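A minimal usage sketch, assuming ZlibEntropySignal is imported from veridex/text/entropy.py (shown below); this signal relies only on the standard-library zlib module, so no model download is needed:

from veridex.text.entropy import ZlibEntropySignal

signal = ZlibEntropySignal()
result = signal.run("Water boils at one hundred degrees Celsius at sea level.")

print(f"Score: {result.score:.2f} (confidence {result.confidence:.2f})")
print(f"Compression ratio: {result.metadata['compression_ratio']:.3f}")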

Source code in veridex/text/entropy.py
class ZlibEntropySignal(BaseSignal):
    """
    Detects AI content using compression ratio (zlib entropy).

    This method employs a compression-based approach under the hypothesis that AI-generated
    content is more predictable (lower entropy) and thus more compressible than human content.

    Algorithm:
        ratio = len(zlib(text)) / len(text)
        - Lower ratio (< 0.6) -> Highly compressible -> Likely AI.
        - Higher ratio (> 0.8) -> Less compressible -> Likely Human.

    Attributes:
        name (str): 'zlib_entropy'
        dtype (str): 'text'
    """

    @property
    def name(self) -> str:
        return "zlib_entropy"

    @property
    def dtype(self) -> str:
        return "text"

    def run(self, input_data: Any) -> DetectionResult:
        if not isinstance(input_data, str):
             return DetectionResult(
                score=0.0,
                confidence=0.0,
                metadata={},
                error="Input must be a string."
            )

        if not input_data:
             return DetectionResult(
                score=0.0,
                confidence=0.0,
                metadata={"zlib_ratio": 0.0},
                error="Input string is empty."
            )

        encoded = input_data.encode("utf-8")
        compressed = zlib.compress(encoded)
        ratio = len(compressed) / len(encoded)

        # Calculate confidence based on how extreme the ratio is
        # Very compressible (low ratio) or very incompressible (high ratio) = higher confidence
        # Middle values = lower confidence
        # Typical ranges: AI text ~0.55-0.70, Human text ~0.65-0.85
        if ratio < 0.6:
            # Very compressible (repetitive) - moderate confidence it's AI
            score = 0.6  # Slightly AI-leaning
        elif ratio > 0.8:
            # Not very compressible (diverse) - moderate confidence it's human
            score = 0.3  # Slightly human-leaning
        else:
            # Middle range - low confidence
            score = 0.5  # Neutral

        # Use distance from neutral point (0.5) as confidence indicator
        distance_from_neutral = abs(score - 0.5)

        # Map distance to confidence
        # Distance 0.5 (max) -> confidence ~0.45
        # Distance 0.0 (neutral) -> confidence ~0.25
        confidence = 0.25 + distance_from_neutral * 0.4  # Range: 0.25 to 0.45

        return DetectionResult(
            score=score,
            confidence=confidence,
            metadata={
                "original_len": len(encoded),
                "compressed_len": len(compressed),
                "compression_ratio": ratio
            }
        )

BinocularsSignal

Bases: BaseSignal

Implements the 'Binoculars' Zero-Shot Detection method.

This advanced detection strategy compares the perplexity of two models, an 'Observer' model and a 'Performer' model, and forms a ratio from the two.

Formula: Score = log(PPL_Observer) / log(PPL_Performer)

Interpretation: If the score is below a certain threshold (typically ~0.90), the text is considered AI-generated. This method is considered state-of-the-art for zero-shot detection.
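A worked illustration of the formula with made-up perplexities (not real measurements):

import numpy as np

ppl_observer, ppl_performer = 20.0, 25.0
binoculars_score = np.log(ppl_observer) / np.log(ppl_performer)   # ~0.93

# Above the ~0.9017 threshold used in run() below, so this example would be
# classified as human-leaning; a score below the threshold would lean AI.
print(binoculars_score, binoculars_score < 0.9017)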

Reference: "Spotting LLMs With Binoculars: Zero-Shot Detection of Machine-Generated Text" (arXiv:2401.12070)

Attributes:

- observer_id (str): HuggingFace ID for the observer model.
- performer_id (str): HuggingFace ID for the performer model.
- use_mock (bool): If True, returns dummy results without loading models (for testing).
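A minimal usage sketch, assuming BinocularsSignal is imported from veridex/text/binoculars.py (shown below). With use_mock=True no model weights are downloaded, which is handy for wiring up pipelines and tests; the real mode loads two Falcon-7B checkpoints:

from veridex.text.binoculars import BinocularsSignal

# Mock mode: returns a fixed dummy result without loading any models.
signal = BinocularsSignal(use_mock=True)
result = signal.run("Some text to analyze.")
print(result.score, result.metadata["mode"])   # 0.9 'mock'

# Real mode (heavy: downloads and runs two 7B-parameter models):
# signal = BinocularsSignal()
# result = signal.run(long_text)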

Source code in veridex/text/binoculars.py
class BinocularsSignal(BaseSignal):
    """
    Implements the 'Binoculars' Zero-Shot Detection method.

    This advanced detection strategy compares the perplexity of two models, an 'Observer' model
    and a 'Performer' model, and forms a ratio from the two.

    **Formula**:
    `Score = log(PPL_Observer) / log(PPL_Performer)`

    **Interpretation**:
    If the score is below a certain threshold (typically ~0.90), the text is considered
    AI-generated. This method is considered state-of-the-art for zero-shot detection.

    **Reference**:
    "Spotting LLMs With Binoculars: Zero-Shot Detection of Machine-Generated Text" (arXiv:2401.12070)

    Attributes:
        observer_id (str): HuggingFace ID for the observer model.
        performer_id (str): HuggingFace ID for the performer model.
        use_mock (bool): If True, returns dummy results without loading models (for testing).
    """

    def __init__(self, observer_id: str = "tiiuae/falcon-7b-instruct", performer_id: str = "tiiuae/falcon-7b", use_mock: bool = False):
        """
        Initialize the Binoculars signal.

        Args:
            observer_id (str): HuggingFace ID for the observer model.
            performer_id (str): HuggingFace ID for the performer model.
            use_mock (bool): If True, returns dummy results without loading models (for testing).
        """
        self.observer_id = observer_id
        self.performer_id = performer_id
        self.use_mock = use_mock
        self._observer_model = None
        self._performer_model = None
        self._tokenizer = None

    @property
    def name(self) -> str:
        """Returns 'binoculars'."""
        return "binoculars"

    @property
    def dtype(self) -> str:
        """Returns 'text'."""
        return "text"

    def check_dependencies(self) -> None:
        if self.use_mock:
            return

        try:
            import torch
            import transformers
        except ImportError:
            raise ImportError(
                "The 'text' extra dependencies (transformers, torch) are required for BinocularsSignal. "
                "Install them with `pip install veridex[text]`."
            )

    def _load_models(self):
        if self.use_mock:
            return

        if self._observer_model is not None and self._performer_model is not None:
            return

        self.check_dependencies()

        from transformers import AutoModelForCausalLM, AutoTokenizer
        import torch

        device = "cpu"
        if torch.cuda.is_available():
            device = "cuda"
        self._device = device

        # Shared tokenizer usually works if models are from same family
        self._tokenizer = AutoTokenizer.from_pretrained(self.observer_id)

        self._observer_model = AutoModelForCausalLM.from_pretrained(
            self.observer_id,
            trust_remote_code=True,
            torch_dtype=torch.float16 if device == "cuda" else torch.float32
        ).to(device)

        self._performer_model = AutoModelForCausalLM.from_pretrained(
            self.performer_id,
            trust_remote_code=True,
            torch_dtype=torch.float16 if device == "cuda" else torch.float32
        ).to(device)

    def _calculate_ppl(self, model, text):
        import torch
        inputs = self._tokenizer(text, return_tensors="pt")
        inputs = {k: v.to(self._device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs, labels=inputs["input_ids"])
            return torch.exp(outputs.loss).item()

    def run(self, input_data: Any) -> DetectionResult:
        """
        Execute Binoculars detection.

        Args:
            input_data (str): Text to analyze.

        Returns:
            DetectionResult:
                - score: Probability of AI generation.
                - metadata: 'binoculars_score', 'threshold', 'distance_from_threshold'.
        """
        if not isinstance(input_data, str):
            return DetectionResult(score=0.0, confidence=0.0, metadata={}, error="Input must be a string.")

        if self.use_mock:
            # Mock behavior for testing without heavy downloads
            # Return a dummy result
            return DetectionResult(
                score=0.9,
                confidence=1.0,
                metadata={
                    "binoculars_score": 0.85,
                    "threshold": 0.90,
                    "mode": "mock"
                }
            )

        try:
            self._load_models()
            import numpy as np

            ppl_observer = self._calculate_ppl(self._observer_model, input_data)
            ppl_performer = self._calculate_ppl(self._performer_model, input_data)

            # Binoculars Score = log(PPL_Observer) / log(PPL_Performer)
            # Avoid division by zero
            if ppl_performer <= 1.0:
                ppl_performer = 1.0001

            score_val = np.log(ppl_observer) / np.log(ppl_performer)

            # Thresholding (from paper, typically around 0.9017 for Falcon)
            # If score < threshold, it is AI.
            threshold = 0.9017

            # Convert to probability:
            # If score is much lower than threshold -> High AI prob.
            # If score is higher than threshold -> Low AI prob.

            is_ai = score_val < threshold
            ai_prob = 0.9 if is_ai else 0.1

            # Calculate confidence from distance to threshold
            # Scores far from threshold indicate high confidence
            dist_from_threshold = abs(score_val - threshold)
            # Typical binoculars scores range from ~0.7 to ~1.1
            # Distance > 0.1 from threshold is very confident
            if dist_from_threshold > 0.15:
                confidence = 0.95
            elif dist_from_threshold > 0.1:
                confidence = 0.88
            elif dist_from_threshold > 0.05:
                confidence = 0.78
            elif dist_from_threshold > 0.02:
                confidence = 0.65
            else:
                confidence = 0.50  # Very close to threshold, uncertain

            return DetectionResult(
                score=ai_prob,
                confidence=confidence,
                metadata={
                    "binoculars_score": score_val,
                    "ppl_observer": ppl_observer,
                    "ppl_performer": ppl_performer,
                    "threshold": threshold,
                    "distance_from_threshold": dist_from_threshold
                }
            )

        except Exception as e:
            return DetectionResult(
                score=0.0,
                confidence=0.0,
                metadata={},
                error=f"Binoculars failed: {str(e)}"
            )

name property

Returns 'binoculars'.

dtype property

Returns 'text'.

__init__(observer_id='tiiuae/falcon-7b-instruct', performer_id='tiiuae/falcon-7b', use_mock=False)

Initialize the Binoculars signal.

Parameters:

- observer_id (str): HuggingFace ID for the observer model. Default: 'tiiuae/falcon-7b-instruct'.
- performer_id (str): HuggingFace ID for the performer model. Default: 'tiiuae/falcon-7b'.
- use_mock (bool): If True, returns dummy results without loading models (for testing). Default: False.
Source code in veridex/text/binoculars.py
def __init__(self, observer_id: str = "tiiuae/falcon-7b-instruct", performer_id: str = "tiiuae/falcon-7b", use_mock: bool = False):
    """
    Initialize the Binoculars signal.

    Args:
        observer_id (str): HuggingFace ID for the observer model.
        performer_id (str): HuggingFace ID for the performer model.
        use_mock (bool): If True, returns dummy results without loading models (for testing).
    """
    self.observer_id = observer_id
    self.performer_id = performer_id
    self.use_mock = use_mock
    self._observer_model = None
    self._performer_model = None
    self._tokenizer = None

run(input_data)

Execute Binoculars detection.

Parameters:

- input_data (str): Text to analyze. Required.

Returns:

DetectionResult:
- score: Probability of AI generation.
- metadata: 'binoculars_score', 'threshold', 'distance_from_threshold'.
Source code in veridex/text/binoculars.py
def run(self, input_data: Any) -> DetectionResult:
    """
    Execute Binoculars detection.

    Args:
        input_data (str): Text to analyze.

    Returns:
        DetectionResult:
            - score: Probability of AI generation.
            - metadata: 'binoculars_score', 'threshold', 'distance_from_threshold'.
    """
    if not isinstance(input_data, str):
        return DetectionResult(score=0.0, confidence=0.0, metadata={}, error="Input must be a string.")

    if self.use_mock:
        # Mock behavior for testing without heavy downloads
        # Return a dummy result
        return DetectionResult(
            score=0.9,
            confidence=1.0,
            metadata={
                "binoculars_score": 0.85,
                "threshold": 0.90,
                "mode": "mock"
            }
        )

    try:
        self._load_models()
        import numpy as np

        ppl_observer = self._calculate_ppl(self._observer_model, input_data)
        ppl_performer = self._calculate_ppl(self._performer_model, input_data)

        # Binoculars Score = log(PPL_Observer) / log(PPL_Performer)
        # Avoid division by zero
        if ppl_performer <= 1.0:
            ppl_performer = 1.0001

        score_val = np.log(ppl_observer) / np.log(ppl_performer)

        # Thresholding (from paper, typically around 0.9017 for Falcon)
        # If score < threshold, it is AI.
        threshold = 0.9017

        # Convert to probability:
        # If score is much lower than threshold -> High AI prob.
        # If score is higher than threshold -> Low AI prob.

        is_ai = score_val < threshold
        ai_prob = 0.9 if is_ai else 0.1

        # Calculate confidence from distance to threshold
        # Scores far from threshold indicate high confidence
        dist_from_threshold = abs(score_val - threshold)
        # Typical binoculars scores range from ~0.7 to ~1.1
        # Distance > 0.1 from threshold is very confident
        if dist_from_threshold > 0.15:
            confidence = 0.95
        elif dist_from_threshold > 0.1:
            confidence = 0.88
        elif dist_from_threshold > 0.05:
            confidence = 0.78
        elif dist_from_threshold > 0.02:
            confidence = 0.65
        else:
            confidence = 0.50  # Very close to threshold, uncertain

        return DetectionResult(
            score=ai_prob,
            confidence=confidence,
            metadata={
                "binoculars_score": score_val,
                "ppl_observer": ppl_observer,
                "ppl_performer": ppl_performer,
                "threshold": threshold,
                "distance_from_threshold": dist_from_threshold
            }
        )

    except Exception as e:
        return DetectionResult(
            score=0.0,
            confidence=0.0,
            metadata={},
            error=f"Binoculars failed: {str(e)}"
        )

DetectGPTSignal

Bases: BaseSignal

Implements the DetectGPT zero-shot detection method.

DetectGPT uses the curvature of the model's log-probability function to distinguish human-written text from AI-generated text. The core idea is that AI-generated text occupies regions of negative log-curvature.

Reference: "DetectGPT: Zero-Shot Machine-Generated Text Detection using Probability Curvature" (Mitchell et al., 2023).

Algorithm:

1. Compute log-probability of original text log p(x).
2. Generate k perturbed versions x_tilde using a mask-filling model (e.g., T5).
3. Compute log-probability of perturbations log p(x_tilde).
4. Calculate curvature score: log p(x) - mean(log p(x_tilde)).
5. Normalize score to [0, 1] range.

Note: This signal is computationally expensive as it requires loading two LLMs (Base and Perturbation) and running multiple forward passes.

Attributes:

- base_model_name (str): HuggingFace model ID for probability computation.
- perturbation_model_name (str): HuggingFace model ID for perturbation (T5).
- n_perturbations (int): Number of perturbed samples to generate.
- device (str): Computation device ('cpu', 'cuda').
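A minimal usage sketch, assuming DetectGPTSignal is imported from veridex/text/detectgpt.py (shown below). Every perturbation costs an extra forward pass, so a small n_perturbations keeps the run quick:

from veridex.text.detectgpt import DetectGPTSignal

signal = DetectGPTSignal(base_model_name="gpt2-medium", n_perturbations=5)
result = signal.run(
    "Renewable energy adoption has accelerated over the past decade, driven by "
    "falling costs and supportive policy."
)

print(f"AI probability: {result.score:.2f}")
print(f"Curvature: {result.metadata['curvature']:.3f}")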

Source code in veridex/text/detectgpt.py
class DetectGPTSignal(BaseSignal):
    """
    Implements the DetectGPT zero-shot detection method.

    DetectGPT uses the curvature of the model's log-probability function to distinguish
    human-written text from AI-generated text. The core idea is that AI-generated text occupies
    regions of negative log-curvature.

    **Reference**:
    "DetectGPT: Zero-Shot Machine-Generated Text Detection using Probability Curvature"
    (Mitchell et al., 2023).

    **Algorithm**:
    1. Compute log-probability of original text `log p(x)`.
    2. Generate `k` perturbed versions `x_tilde` using a mask-filling model (e.g., T5).
    3. Compute log-probability of perturbations `log p(x_tilde)`.
    4. Calculate curvature score: `log p(x) - mean(log p(x_tilde))`.
    5. Normalize score to [0, 1] range.

    **Note**:
    This signal is computationally expensive as it requires loading two LLMs (Base and Perturbation)
    and running multiple forward passes.

    Attributes:
        base_model_name (str): HuggingFace model ID for probability computation.
        perturbation_model_name (str): HuggingFace model ID for perturbation (T5).
        n_perturbations (int): Number of perturbed samples to generate.
        device (str): Computation device ('cpu', 'cuda').
    """

    def __init__(self,
                 base_model_name: str = "gpt2-medium",
                 perturbation_model_name: str = "t5-base",
                 n_perturbations: int = 10,
                 device: Optional[str] = None):
        """
        Initialize the DetectGPT signal.

        Args:
            base_model_name (str): Name of the model to use for computing log-probabilities.
                Defaults to "gpt2-medium".
            perturbation_model_name (str): Name of the model to use for generating perturbations.
                Defaults to "t5-base".
            n_perturbations (int): Number of perturbed samples to generate. Defaults to 10.
            device (Optional[str]): Device to run models on ('cpu' or 'cuda'). If None,
                auto-detects CUDA.
        """
        self.base_model_name = base_model_name
        self.perturbation_model_name = perturbation_model_name
        self.n_perturbations = n_perturbations
        self.device = device or ("cuda" if torch and torch.cuda.is_available() else "cpu")

        self._base_model = None
        self._base_tokenizer = None
        self._perturb_model = None
        self._perturb_tokenizer = None

    @property
    def name(self) -> str:
        """Returns 'detect_gpt'."""
        return "detect_gpt"

    @property
    def dtype(self) -> str:
        """Returns 'text'."""
        return "text"

    def _load_base_model(self):
        if self._base_model is None:
            self.check_dependencies()
            self._base_tokenizer = AutoTokenizer.from_pretrained(self.base_model_name)
            self._base_model = AutoModelForCausalLM.from_pretrained(self.base_model_name).to(self.device)
            self._base_model.eval()

    def _load_perturb_model(self):
        # Simplification: For this implementation, we might simulate perturbations or use a simpler
        # heuristic if T5 is too heavy, but let's stick to the interface.
        # Ideally, we would load T5ForConditionalGeneration here.
        pass

    def check_dependencies(self):
        if torch is None or AutoModelForCausalLM is None:
            raise ImportError(
                "DetectGPTSignal requires 'torch' and 'transformers'. "
                "Install with `pip install veridex[text]`"
            )

    def _get_log_prob(self, text: str) -> float:
        """Computes the log probability of a text under the base model."""
        self._load_base_model()
        inputs = self._base_tokenizer(text, return_tensors="pt").to(self.device)
        with torch.no_grad():
            outputs = self._base_model(**inputs, labels=inputs["input_ids"])
            # loss is the negative log likelihood
            log_likelihood = -outputs.loss.item()
        return log_likelihood

    def _perturb_text(self, text: str) -> List[str]:
        """
        Generates perturbed versions of the text.
        For a true DetectGPT, we would use T5 mask filling.

        This implementation currently uses a random swap heuristic for demonstration/speed
        unless the full T5 stack is implemented.

        Args:
            text (str): Input text.

        Returns:
            List[str]: List of perturbed text strings.
        """
        perturbations = []
        words = text.split()
        if len(words) < 5:
            return [text] * self.n_perturbations

        import random
        for _ in range(self.n_perturbations):
            # Simple swap of two random words
            new_words = words[:]
            idx1, idx2 = random.sample(range(len(words)), 2)
            new_words[idx1], new_words[idx2] = new_words[idx2], new_words[idx1]
            perturbations.append(" ".join(new_words))

        return perturbations

    def run(self, input_data: Any) -> DetectionResult:
        """
        Analyzes the text using DetectGPT logic.

        Args:
            input_data (str): The text to analyze.

        Returns:
            DetectionResult: Result object containing the curvature-based score.
                - score: 0.0 (Human) to 1.0 (AI).
                - metadata: Contains 'curvature', 'original_log_prob'.
        """
        if not isinstance(input_data, str):
             return DetectionResult(score=0.0, confidence=0.0, error="Input must be a string", metadata={})

        try:
            original_log_prob = self._get_log_prob(input_data)
            perturbed_texts = self._perturb_text(input_data)

            perturbed_log_probs = []
            for p_text in perturbed_texts:
                perturbed_log_probs.append(self._get_log_prob(p_text))

            mean_p_log_prob = np.mean(perturbed_log_probs)
            std_p_log_prob = np.std(perturbed_log_probs) + 1e-8

            # DetectGPT score: higher means more likely generated by the model (or similar models)
            curvature = original_log_prob - mean_p_log_prob

            # Sigmoid-like scaling
            score = 1 / (1 + np.exp(-curvature))

            # Confidence based on variance of perturbations?
            confidence = 0.5 # Placeholder

            return DetectionResult(
                score=float(score),
                confidence=confidence,
                metadata={
                    "curvature": curvature,
                    "original_log_prob": original_log_prob,
                    "mean_perturbed_log_prob": mean_p_log_prob
                }
            )

        except Exception as e:
            return DetectionResult(score=0.0, confidence=0.0, error=str(e), metadata={})

name property

Returns 'detect_gpt'.

dtype property

Returns 'text'.

__init__(base_model_name='gpt2-medium', perturbation_model_name='t5-base', n_perturbations=10, device=None)

Initialize the DetectGPT signal.

Parameters:

- base_model_name (str): Name of the model to use for computing log-probabilities. Default: 'gpt2-medium'.
- perturbation_model_name (str): Name of the model to use for generating perturbations. Default: 't5-base'.
- n_perturbations (int): Number of perturbed samples to generate. Default: 10.
- device (Optional[str]): Device to run models on ('cpu' or 'cuda'). If None, auto-detects CUDA. Default: None.
Source code in veridex/text/detectgpt.py
def __init__(self,
             base_model_name: str = "gpt2-medium",
             perturbation_model_name: str = "t5-base",
             n_perturbations: int = 10,
             device: Optional[str] = None):
    """
    Initialize the DetectGPT signal.

    Args:
        base_model_name (str): Name of the model to use for computing log-probabilities.
            Defaults to "gpt2-medium".
        perturbation_model_name (str): Name of the model to use for generating perturbations.
            Defaults to "t5-base".
        n_perturbations (int): Number of perturbed samples to generate. Defaults to 10.
        device (Optional[str]): Device to run models on ('cpu' or 'cuda'). If None,
            auto-detects CUDA.
    """
    self.base_model_name = base_model_name
    self.perturbation_model_name = perturbation_model_name
    self.n_perturbations = n_perturbations
    self.device = device or ("cuda" if torch and torch.cuda.is_available() else "cpu")

    self._base_model = None
    self._base_tokenizer = None
    self._perturb_model = None
    self._perturb_tokenizer = None

run(input_data)

Analyzes the text using DetectGPT logic.

Parameters:

- input_data (str): The text to analyze. Required.

Returns:

DetectionResult: Result object containing the curvature-based score.
- score: 0.0 (Human) to 1.0 (AI).
- metadata: Contains 'curvature', 'original_log_prob'.

Source code in veridex/text/detectgpt.py
def run(self, input_data: Any) -> DetectionResult:
    """
    Analyzes the text using DetectGPT logic.

    Args:
        input_data (str): The text to analyze.

    Returns:
        DetectionResult: Result object containing the curvature-based score.
            - score: 0.0 (Human) to 1.0 (AI).
            - metadata: Contains 'curvature', 'original_log_prob'.
    """
    if not isinstance(input_data, str):
         return DetectionResult(score=0.0, confidence=0.0, error="Input must be a string", metadata={})

    try:
        original_log_prob = self._get_log_prob(input_data)
        perturbed_texts = self._perturb_text(input_data)

        perturbed_log_probs = []
        for p_text in perturbed_texts:
            perturbed_log_probs.append(self._get_log_prob(p_text))

        mean_p_log_prob = np.mean(perturbed_log_probs)
        std_p_log_prob = np.std(perturbed_log_probs) + 1e-8

        # DetectGPT score: higher means more likely generated by the model (or similar models)
        curvature = original_log_prob - mean_p_log_prob

        # Sigmoid-like scaling
        score = 1 / (1 + np.exp(-curvature))

        # Confidence based on variance of perturbations?
        confidence = 0.5 # Placeholder

        return DetectionResult(
            score=float(score),
            confidence=confidence,
            metadata={
                "curvature": curvature,
                "original_log_prob": original_log_prob,
                "mean_perturbed_log_prob": mean_p_log_prob
            }
        )

    except Exception as e:
        return DetectionResult(score=0.0, confidence=0.0, error=str(e), metadata={})

TDetectSignal

Bases: DetectGPTSignal

Implements T-Detect (West et al., 2025), a robust variant of DetectGPT.

Instead of assuming a Gaussian distribution for perturbations (Z-score), T-Detect uses a Student's t-distribution to better model the heavy-tailed nature of adversarial or non-native text perturbations.
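A small sketch of the scoring step used below: the curvature is converted into a t-score and then into a probability via the Student's t CDF (the log-likelihood values here are made up for illustration):

import numpy as np
from scipy import stats

original_ll = -3.2                                   # log-likelihood of the input text
perturbed_lls = [-3.9, -4.1, -3.8, -4.0, -3.7]       # log-likelihoods of perturbed copies

mu_p, std_p = np.mean(perturbed_lls), np.std(perturbed_lls)
df = max(1, len(perturbed_lls) - 1)                  # degrees of freedom

t_score = (original_ll - mu_p) / (std_p + 1e-8)
prob_ai = stats.t.cdf(t_score, df)                   # heavy-tailed alternative to a Gaussian CDF
print(f"t={t_score:.2f}, df={df}, P(AI)={prob_ai:.2f}")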

Source code in veridex/text/tdetect.py
class TDetectSignal(DetectGPTSignal):
    """
    Implements T-Detect (West et al., 2025), a robust variant of DetectGPT.

    Instead of assuming a Gaussian distribution for perturbations (Z-score),
    T-Detect uses a Student's t-distribution to better model the heavy-tailed
    nature of adversarial or non-native text perturbations.
    """

    @property
    def name(self) -> str:
        return "t_detect"

    def check_dependencies(self) -> None:
        super().check_dependencies()
        try:
            import scipy
        except ImportError:
            raise ImportError(
                "T-Detect signal requires 'scipy'. Please install it via `pip install scipy`."
            )

    def run(self, input_data: str) -> DetectionResult:
        # Reuse the logic to get perturbations and LLs
        if not input_data or not isinstance(input_data, str):
            return DetectionResult(score=0.0, confidence=0.0, error="Invalid input")

        self._load_base_model()

        original_ll = self._get_log_prob(input_data)
        perturbations = self._perturb_text(input_data)

        perturbed_lls = []
        for p_text in perturbations:
            if not p_text.strip():
                continue
            ll = self._get_log_prob(p_text)
            perturbed_lls.append(ll)

        if not perturbed_lls:
            return DetectionResult(score=0.0, confidence=0.0, error="Failed to generate valid perturbations")

        # T-Detect Logic
        mu_p = np.mean(perturbed_lls)
        std_p = np.std(perturbed_lls) if len(perturbed_lls) > 1 else 1.0

        # Degrees of freedom = n - 1
        df = max(1, len(perturbed_lls) - 1)

        curvature = original_ll - mu_p

        # t-score calculation
        t_score = curvature / (std_p + 1e-8)

        # Convert t_score to scalar float safely
        try:
            if isinstance(t_score, (np.ndarray, np.generic)):
                t_score = float(t_score.item()) if isinstance(t_score, np.ndarray) and t_score.size == 1 else float(t_score)
        except Exception:
            # Fallback for unexpected numpy shapes, though unlikely
            t_score = float(np.mean(t_score))

        # Calculate probability using T-CDF
        # If it's AI, curvature is positive (original >> perturbed).
        # We assume 'prob' is the probability that the text is AI.
        # High t_score -> High probability.
        prob = scipy.stats.t.cdf(t_score, df)

        # Ensure prob is scalar float safely
        try:
            if isinstance(prob, (np.ndarray, np.generic)):
                prob = float(prob.item()) if isinstance(prob, np.ndarray) and prob.size == 1 else float(prob)
        except Exception:
             prob = float(np.mean(prob))

        if math.isnan(prob):
             prob = 0.5

        # Calculate confidence from measurement uncertainty (similar to DetectGPT)
        # T-Detect is slightly more robust, so base confidence is a bit higher
        if std_p < 0.2:
            confidence = 0.92
        elif std_p < 0.5:
            confidence = 0.85
        elif std_p < 1.0:
            confidence = 0.75
        elif std_p < 2.0:
            confidence = 0.55
        else:
            confidence = 0.35

        if len(perturbed_lls) >= 15:
            confidence = min(confidence + 0.05, 0.95)

        return DetectionResult(
            score=float(prob),
            confidence=confidence,
            metadata={
                "original_ll": float(original_ll),
                "perturbed_mean_ll": float(mu_p),
                "perturbed_std_ll": float(std_p),
                "curvature": float(curvature),
                "t_score": float(t_score),
                "df": int(df),
                "n_perturbations": len(perturbed_lls)
            }
        )

HumanOODSignal

Bases: BaseSignal

Implements a zero-shot "Human Texts Are Outliers" (HumanOOD) detection signal.

This signal treats the LLM's own generations as the "In-Distribution" (ID) class and human texts as outliers (OOD).

It generates N samples from the model to form an ID cluster, then computes the Mahalanobis or Euclidean distance of the input text's embedding from this cluster.

Higher distance = More likely to be Human (Outlier). Lower distance = More likely to be Machine (ID).

Result score is 1.0 - (normalized_distance), so that AI (Low distance) -> 1.0.
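A minimal usage sketch, assuming HumanOODSignal is imported from veridex/text/human_ood.py (shown below). Each call generates n_samples completions from the base model, so this signal is noticeably slower than the others on CPU:

from veridex.text.human_ood import HumanOODSignal

signal = HumanOODSignal(model_name="gpt2-medium", n_samples=10)
result = signal.run("A paragraph of text whose origin you want to check.")

print(f"AI probability: {result.score:.2f}")
print(f"Distance to model cluster: {result.metadata['distance']:.2f} "
      f"(average cluster radius {result.metadata['avg_radius']:.2f})")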

Source code in veridex/text/human_ood.py
class HumanOODSignal(BaseSignal):
    """
    Implements a zero-shot "Human Texts Are Outliers" (HumanOOD) detection signal.

    This signal treats the LLM's own generations as the "In-Distribution" (ID) class
    and human texts as outliers (OOD).

    It generates N samples from the model to form an ID cluster, then computes the
    Mahalanobis or Euclidean distance of the input text's embedding from this cluster.

    Higher distance = More likely to be Human (Outlier).
    Lower distance = More likely to be Machine (ID).

    Result score is 1.0 - (normalized_distance), so that AI (Low distance) -> 1.0.
    """

    def __init__(
        self,
        model_name: str = "gpt2-medium",
        n_samples: int = 20,
        max_length: int = 128,
        distance_metric: str = "euclidean",
        device: Optional[str] = None
    ):
        self.model_name = model_name
        self.n_samples = n_samples
        self.max_length = max_length
        self.distance_metric = distance_metric
        self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')

        self.model = None
        self.tokenizer = None

    @property
    def name(self) -> str:
        return "human_ood"

    @property
    def dtype(self) -> str:
        return "text"

    def check_dependencies(self) -> None:
        try:
            import transformers
            import torch
            import scipy
        except ImportError:
            raise ImportError(
                "HumanOODSignal requires 'transformers', 'torch', and 'scipy'."
            )

    def _load_models(self):
        if self.model is not None:
            return
        self.check_dependencies()
        from transformers import AutoModelForCausalLM, AutoTokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        self.model = AutoModelForCausalLM.from_pretrained(self.model_name).to(self.device)
        self.model.eval()

    def _get_embedding(self, text: str) -> np.ndarray:
        """Computes the mean hidden state embedding for the text."""
        inputs = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=self.max_length
        ).to(self.device)

        with torch.no_grad():
            outputs = self.model(**inputs, output_hidden_states=True)
            # Use the last hidden state
            hidden_states = outputs.hidden_states[-1] # (batch, seq, dim)
            # Mean pooling over sequence
            # Mask out padding tokens if any (not critical for single sample gen but good practice)
            mask = inputs.attention_mask.unsqueeze(-1) # (batch, seq, 1)
            sum_embeddings = torch.sum(hidden_states * mask, dim=1)
            sum_mask = torch.clamp(mask.sum(dim=1), min=1e-9)
            mean_embedding = sum_embeddings / sum_mask

        return mean_embedding.cpu().numpy()[0]

    def run(self, input_data: str) -> DetectionResult:
        if not input_data or not isinstance(input_data, str):
            return DetectionResult(score=0.0, confidence=0.0, error="Invalid input")

        self._load_models()

        # 1. Get embedding of input text
        input_emb = self._get_embedding(input_data)

        # 2. Generate N samples from the model (ID distribution)
        # We prompt with the first few tokens of input to condition the style,
        # or use unconditional generation?
        # If we use unconditional, the cluster is "generic English".
        # If we use conditional, we check if the input *continuation* matches the model's *continuation*.
        # The paper suggests "machine-generated texts are in-distribution".
        # If we just generate random text, the distribution is huge.
        # Let's generate completions based on the prefix of the input (first 5 tokens).

        tokens = self.tokenizer.encode(input_data)
        prefix_len = min(5, len(tokens))
        prefix_ids = torch.tensor([tokens[:prefix_len]]).to(self.device)

        generated_embs = []

        for _ in range(self.n_samples):
            with torch.no_grad():
                output_ids = self.model.generate(
                    prefix_ids,
                    do_sample=True,
                    max_length=min(len(tokens) + 20, self.max_length), # Generate similar length
                    top_p=0.9,
                    pad_token_id=self.tokenizer.eos_token_id
                )
                gen_text = self.tokenizer.decode(output_ids[0], skip_special_tokens=True)
                generated_embs.append(self._get_embedding(gen_text))

        generated_embs = np.array(generated_embs)

        # 3. Compute Distance
        # We calculate the distance of input_emb to the distribution of generated_embs.
        # Ideally Mahalanobis, but requires n_samples > dim.
        # GPT2 dim is 768 or 1024. n_samples 20 is too small.
        # Fallback: Euclidean distance to centroid.

        centroid = np.mean(generated_embs, axis=0)

        if self.distance_metric == "euclidean":
            # Distance from input to centroid
            dist = np.linalg.norm(input_emb - centroid)

            # We also need to know the typical spread (radius) of the cluster to normalize.
            # Average distance of samples to centroid.
            radii = np.linalg.norm(generated_embs - centroid, axis=1)
            avg_radius = np.mean(radii)
            std_radius = np.std(radii)

            # Z-score of the distance?
            # if dist >> avg_radius, it is outlier (Human).
            z_dist = (dist - avg_radius) / (std_radius + 1e-8)

            # If z_dist is high (positive), it's far -> Human.
            # If z_dist is low (near 0 or negative), it's close -> Machine.

            # Map z-score to probability of being AI.
            # AI = Low distance.
            # P(AI) = 1 - P(Human)
            # P(Human) is related to CDF(z_dist).

            # Using sigmoid (-z_dist) so that high z (Human) gives low score.
            score = expit(-z_dist) # Numerically stable version of 1 / (1 + exp(z_dist))

        else:
            return DetectionResult(score=0.0, confidence=0.0, error="Unsupported metric")

        return DetectionResult(
            score=float(score),
            confidence=0.7, # Lower confidence as this is a zero-shot approx
            metadata={
                "distance": float(dist),
                "avg_radius": float(avg_radius),
                "z_dist": float(z_dist),
                "n_samples": self.n_samples
            }
        )