#!/usr/bin/env python3
"""
Tokenization bias test — Anthropic Claude API.
Measures EN vs ES token overhead using usage.input_tokens from Claude completions.

Usage: ANTHROPIC_API_KEY=your-key python3 test_claude_tokenizer.py

Note: Claude has a large system prompt baseline (~4,116 tokens for Sonnet 4).
All results are baseline-corrected: the input_tokens reported for the one-word
message "test" is subtracted from every raw count.

Part of: "Your AI charges you up to 67% more for not speaking English"
https://theprivatestack.com/research/tokenization-bias
"""

import json
import os
import urllib.request


# Read once at import time; an unset variable yields "" so main() can detect
# the missing key and print a hint instead of sending an unauthenticated call.
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "")
# Default model identifier for every request made by this script.
MODEL = "claude-sonnet-4-20250514"


def count_tokens_claude(text: str, api_key: str, model: str = MODEL, timeout: float = 60.0) -> int:
    """Return the input-token count Claude reports for *text*.

    Sends one request to the Messages endpoint with *text* as the only
    ``user`` message and ``max_tokens`` set to 1, then reads
    ``usage.input_tokens`` from the JSON response.

    Args:
        text: Message content to measure.
        api_key: Value sent in the ``x-api-key`` header.
        model: Model identifier placed in the request body.
        timeout: Seconds passed to ``urlopen`` as its ``timeout``. Without
            one, a stalled connection would block the script indefinitely.

    Returns:
        The ``usage.input_tokens`` value from the response.

    Raises:
        urllib.error.URLError: The request failed (``HTTPError`` for
            non-success HTTP statuses).
        OSError: The connection timed out or another socket error occurred.
        json.JSONDecodeError: The response body was not valid JSON.
        KeyError: The response had no ``usage.input_tokens`` field.
    """
    url = "https://api.anthropic.com/v1/messages"
    payload = json.dumps({
        "model": model,
        # One output token is the minimum; only the input-side usage is read.
        "max_tokens": 1,
        "messages": [{"role": "user", "content": text}],
    }).encode()
    headers = {
        "Content-Type": "application/json",
        "x-api-key": api_key,
        "anthropic-version": "2023-06-01",
        "User-Agent": "tokenizer-bias-test/1.0",
    }
    # Supplying a body makes urllib issue a POST.
    req = urllib.request.Request(url, data=payload, headers=headers)
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        result = json.loads(resp.read())
    return result["usage"]["input_tokens"]


# English sample clauses, keyed by a short label. main() pairs each entry with
# its counterpart in CLAUSES_ES. Adjacent string literals are concatenated by
# the parser, so every value is still a single unbroken string.
CLAUSES_EN = {
    "NDA": (
        "The Receiving Party agrees to hold in strict confidence all "
        "Confidential Information disclosed by the Disclosing Party and shall "
        "not disclose such information to any third party without prior "
        "written consent. This obligation shall survive the termination of "
        "this agreement for a period of five years."
    ),
    "Incorporation": (
        "This Corporation is organized for the purpose of engaging in any "
        "lawful act or activity for which corporations may be organized under "
        "the General Corporation Law of the State of Delaware. The total "
        "number of shares of stock which the corporation shall have authority "
        "to issue is ten million shares of common stock, each having a par "
        "value of one cent per share."
    ),
    "Tax": (
        "The taxpayer shall file annual returns with the Internal Revenue "
        "Service for each taxable year. All digital tax receipts must comply "
        "with current regulations and include the employer identification "
        "number assigned by the federal tax authority."
    ),
}

# Spanish sample clauses, keyed by a short Spanish label. main() pairs each
# entry with its counterpart in CLAUSES_EN. Adjacent string literals are
# concatenated by the parser, so every value is still a single unbroken string.
CLAUSES_ES = {
    "NDA": (
        "La Parte Receptora se obliga a mantener en estricta confidencialidad "
        "toda la Información Confidencial revelada por la Parte Divulgante y "
        "no divulgará dicha información a terceros sin el consentimiento "
        "previo por escrito. Esta obligación sobrevivirá la terminación del "
        "presente contrato por un período de cinco años."
    ),
    "Constitutiva": (
        "La Sociedad tiene por objeto la realización de cualquier acto o "
        "actividad lícita para la cual las sociedades pueden organizarse "
        "conforme a la Ley General de Sociedades Mercantiles del Estado "
        "Mexicano. El capital social autorizado será de diez millones de "
        "acciones ordinarias, cada una con valor nominal de un centavo por "
        "acción."
    ),
    "Fiscal": (
        "El contribuyente deberá presentar declaraciones anuales ante el "
        "Servicio de Administración Tributaria por cada ejercicio fiscal. "
        "Todos los comprobantes fiscales digitales por Internet deberán "
        "cumplir con la normatividad vigente e incluir el Registro Federal de "
        "Contribuyentes asignado por la autoridad fiscal federal."
    ),
}

# Stand-alone Spanish legal terms that main() measures one at a time.
# Order is preserved because it determines the order of the printed report.
MX_TERMS = [
    # Short terms
    "fideicomiso", "escritura pública", "acta constitutiva", "protocolo notarial",
    # Longer multi-word terms
    "Sociedad Anónima Promotora de Inversión de Capital Variable",
    "Registro Federal de Contribuyentes",
    "comprobantes fiscales digitales por Internet",
    "asamblea general ordinaria de accionistas",
]


def _overhead_label(t_en, t_es):
    """Return the ES-over-EN overhead as a signed percentage string.

    Falls back to "n/a" when the corrected English count is zero or negative,
    where the ratio would divide by zero or carry no meaning.
    """
    if t_en <= 0:
        return "n/a"
    pct = ((t_es / t_en) - 1) * 100
    return f"{pct:>+.1f}%"


def _print_clause_comparison(api_key, baseline):
    """Measure each EN/ES clause pair and print corrected counts and overhead."""
    print("=" * 90)
    print(f"  CLAUDE ({MODEL}) — CLAUSE COMPARISON (baseline-corrected)")
    print("=" * 90)

    # The two dicts use different labels per language, so pair them explicitly.
    clause_pairs = [("NDA", "NDA"), ("Incorporation", "Constitutiva"), ("Tax", "Fiscal")]
    for en_key, es_key in clause_pairs:
        raw_en = count_tokens_claude(CLAUSES_EN[en_key], api_key)
        raw_es = count_tokens_claude(CLAUSES_ES[es_key], api_key)
        t_en = raw_en - baseline
        t_es = raw_es - baseline
        overhead = _overhead_label(t_en, t_es)
        print(f"  {en_key:<20} EN: {t_en:>4} tokens  ES: {t_es:>4} tokens  Overhead: {overhead}")
        print(f"  {'':20} (raw EN: {raw_en}, raw ES: {raw_es}, baseline: {baseline})")


def _print_term_counts(api_key, baseline):
    """Measure every entry of MX_TERMS and print its corrected token count."""
    print(f"\n{'=' * 90}")
    print("  MEXICAN LEGAL TERMS (baseline-corrected)")
    print("=" * 90)
    for term in MX_TERMS:
        raw = count_tokens_claude(term, api_key)
        corrected = raw - baseline
        print(f'  "{term}" -> {corrected} tokens (raw: {raw})')


def main():
    """Run the EN vs ES token comparison and print a report to stdout.

    Steps: check that an API key is configured, measure a baseline by sending
    the one-word message "test", then print baseline-corrected counts for the
    clause pairs and for the individual terms.

    Any failure while talking to the API is reported as a one-line
    "Claude API error" message and ends the run, both for the baseline call
    and for the later measurement calls.

    NOTE(review): the baseline request itself carries the word "test", so
    whatever that word costs is subtracted as well; corrected counts are
    presumably low by that small constant. Confirm before quoting absolute
    numbers.
    """
    api_key = ANTHROPIC_API_KEY
    if not api_key:
        print("Set ANTHROPIC_API_KEY env var")
        return

    try:
        baseline = count_tokens_claude("test", api_key)
    except Exception as e:  # script boundary: report and stop
        print(f"Claude API error: {e}")
        return
    print(f"Connected to Claude ({MODEL})")
    print(f"System prompt baseline: {baseline} tokens\n")

    try:
        _print_clause_comparison(api_key, baseline)
        _print_term_counts(api_key, baseline)
    except Exception as e:  # same handling as the baseline call above
        print(f"Claude API error: {e}")
        return

    print(f"\n  Baseline: {baseline} tokens (system prompt + chat template)")
    print("  All corrected counts = raw - baseline\n")


# Run the report only when executed as a script, not when imported.
if __name__ == "__main__":
    # main() returns None, so this exits with status 0 just like a plain call.
    raise SystemExit(main())
