← back to invincible-jha__aumai-linguaforge

Function bodies 11 total

detect function · python · L37-L45 (9 LOC)

src/aumai_linguaforge/cli.py

def detect(input_file: Path, top_k: int) -> None:
    """Detect the language(s) of a text file."""
    # Read the whole file as UTF-8 and hand it to the module-level detector.
    content = input_file.read_text(encoding="utf-8")
    # One output line per candidate, in the order the detector returns them.
    for result in _detector.detect_multiple(content, top_k=top_k):
        line = (
            f"{result.language.code}  {result.language.name:<20}  "
            f"confidence={result.confidence:.2%}  script={result.language.script}"
        )
        click.echo(line)

tokenize function · python · L57-L63 (7 LOC)

src/aumai_linguaforge/cli.py

def tokenize(input_file: Path, language: str | None) -> None:
    """Tokenize a text file."""
    # Read the file as UTF-8 and tokenize it with the module-level tokenizer;
    # ``language`` is forwarded unchanged (it may be None).
    result = _tokenizer.tokenize(
        input_file.read_text(encoding="utf-8"),
        language=language,
    )
    # Header lines first: the language used, then the token count.
    click.echo(f"Language: {result.language.name} ({result.language.code})")
    click.echo(f"Tokens ({len(result.tokens)}):")
    # Finally the tokens themselves on a single line.
    joined_tokens = " | ".join(result.tokens)
    click.echo(joined_tokens)

LanguageDetector.detect method · python · L162-L175 (14 LOC)

src/aumai_linguaforge/core.py

    def detect(self, text: str) -> DetectionResult:
        """Detect the primary language of the text.

        Uses script-based detection as primary signal, with simple
        n-gram heuristics for Latin-script disambiguation.

        Args:
            text: Input text to analyse.

        Returns:
            DetectionResult with the most likely language and confidence.
        """
        results = self.detect_multiple(text, top_k=1)
        return results[0]

LanguageDetector.detect_multiple method · python · L177-L204 (28 LOC)

src/aumai_linguaforge/core.py

    def detect_multiple(self, text: str, top_k: int = 3) -> list[DetectionResult]:
        """Detect top-k candidate languages for the text.

        Args:
            text: Input text to analyse.
            top_k: Number of candidate languages to return.

        Returns:
            List of DetectionResult objects sorted by descending confidence.
        """
        script = detect_script(text)
        candidates: list[tuple[str, float]] = []

        if script in _SCRIPT_TO_LANG:
            primary_code = _SCRIPT_TO_LANG[script]
            candidates.append((primary_code, 0.90))
        elif script == "Latin":
            candidates = self._latin_heuristics(text)
        else:
            candidates = [("en", 0.30)]

        candidates.sort(key=lambda item: item[1], reverse=True)
        results: list[DetectionResult] = []
        for code, confidence in candidates[:top_k]:
            lang = SUPPORTED_LANGUAGES.get(code, SUPPORTED_LANGUAGES["en"])
            results.append(Dete

LanguageDetector._latin_heuristics method · python · L206-L231 (26 LOC)

src/aumai_linguaforge/core.py

    def _latin_heuristics(self, text: str) -> list[tuple[str, float]]:
        """Use common word patterns to distinguish Latin-script languages."""
        text_lower = text.lower()
        scores: dict[str, float] = {
            "en": 0.40,
            "es": 0.05,
            "fr": 0.05,
            "de": 0.05,
            "pt": 0.05,
        }
        # Simple marker words for common languages
        markers: dict[str, list[str]] = {
            "en": ["the", "and", "is", "are", "was", "of", "in", "to"],
            "es": ["el", "la", "los", "las", "de", "en", "que", "es"],
            "fr": ["le", "la", "les", "de", "du", "et", "est", "une"],
            "de": ["der", "die", "das", "und", "ist", "ich", "ein", "nicht"],
            "pt": ["o", "a", "os", "as", "de", "e", "do", "da"],
        }
        words = re.findall(r"\b\w+\b", text_lower)
        word_set = set(words)
        for lang_code, word_list in markers.items():
            hits = sum(1 for w in word_list if w in word

Tokenizer.tokenize method · python · L237-L261 (25 LOC)

src/aumai_linguaforge/core.py

    def tokenize(self, text: str, language: str | None = None) -> TokenizationResult:
        """Tokenize text into tokens.

        Text whose dominant script is CJK, Hiragana or Katakana is split
        into one token per non-whitespace character. All other text goes
        through ``_unicode_tokenize``.

        Args:
            text: The text to tokenize.
            language: Optional BCP-47 language code. When ``None`` the
                language is auto-detected from ``text``. A code that is not
                in ``SUPPORTED_LANGUAGES`` falls back to the ``"en"`` entry.

        Returns:
            TokenizationResult with the original text, the token list and
            the language that was supplied or detected.
        """
        if language is None:
            # No hint from the caller: let the detector decide.
            lang = LanguageDetector().detect(text).language
        else:
            lang = SUPPORTED_LANGUAGES.get(language, SUPPORTED_LANGUAGES["en"])

        if detect_script(text) in {"CJK", "Hiragana", "Katakana"}:
            # Drop every kind of whitespace, not just U+0020: newlines, tabs
            # and the ideographic space (U+3000) must not become tokens.
            tokens = [char for char in text if not char.isspace()]
        else:
            tokens = self._unicode_tokenize(text)

        return TokenizationResult(text=text, tokens=tokens, language=lang)

Tokenizer._unicode_tokenize method · python · L263-L274 (12 LOC)

src/aumai_linguaforge/core.py

    def _unicode_tokenize(self, text: str) -> list[str]:
        """Split on whitespace and punctuation boundaries."""
        # Split on any sequence of whitespace or punctuation characters
        raw_tokens = re.split(r"[\s\u200b\u200c\u200d\u2060\ufeff]+", text.strip())
        result: list[str] = []
        for token in raw_tokens:
            if not token:
                continue
            # Further split on ASCII punctuation boundaries
            sub = re.split(r"(?<=[^\W\d_])(?=[\W\d_])|(?<=[\W\d_])(?=[^\W\d_])", token)
            result.extend(t for t in sub if t and not re.fullmatch(r"\s+", t))
        return result

Repobility · MCP-ready · https://repobility.com

Transliterator.transliterate method · python · L305-L342 (38 LOC)

src/aumai_linguaforge/core.py

    def transliterate(
        self,
        text: str,
        source_script: str,
        target_script: str,
    ) -> TransliterationResult:
        """Transliterate text from one script to another.

        Args:
            text: Source text.
            source_script: Name of source script (e.g. 'Devanagari').
            target_script: Name of target script (e.g. 'Latin').

        Returns:
            TransliterationResult containing the converted text.

        Raises:
            ValueError: If the script pair is not supported.
        """
        source_norm = source_script.lower()
        target_norm = target_script.lower()

        if source_norm == "devanagari" and target_norm == "latin":
            result = self._devanagari_to_latin(text)
        elif source_norm == "latin" and target_norm == "devanagari":
            result = self._latin_to_devanagari(text)
        else:
            raise ValueError(
                f"Transliteration from '{source_script}' to '{target_

Transliterator._latin_to_devanagari method · python · L350-L356 (7 LOC)

src/aumai_linguaforge/core.py

    def _latin_to_devanagari(self, text: str) -> str:
        """Replace Latin sequences in ``text`` using ``_LATIN_TO_DEVANAGARI``.

        Every key of the mapping is substituted by its value, one key at a
        time, each substitution operating on the output of the previous one.

        Args:
            text: The text to convert.

        Returns:
            The text after all substitutions have been applied.
        """
        # Longest keys go first so a multi-character sequence is replaced
        # before any shorter key that is a substring of it.
        ordered_pairs = sorted(
            _LATIN_TO_DEVANAGARI.items(),
            key=lambda pair: len(pair[0]),
            reverse=True,
        )
        converted = text
        for latin_seq, replacement in ordered_pairs:
            converted = converted.replace(latin_seq, replacement)
        return converted

TextNormalizer.normalize method · python · L362-L398 (37 LOC)

src/aumai_linguaforge/core.py

    def normalize(self, text: str, language: str) -> str:
        """Normalize text for the given language.

        Performs:
        - Unicode NFC normalization
        - Whitespace normalization (collapse runs, strip edges)
        - Script-specific zero-width character cleanup
        - Devanagari: normalize chandrabindu variants

        Args:
            text: Input text to normalize.
            language: BCP-47 language code.

        Returns:
            Normalized string.
        """
        # Unicode NFC normalization
        normalized = unicodedata.normalize("NFC", text)

        # Collapse whitespace
        normalized = re.sub(r"[ \t]+", " ", normalized)
        normalized = re.sub(r"\n{3,}", "\n\n", normalized)
        normalized = normalized.strip()

        # Remove zero-width characters that have no linguistic meaning.
        # U+200C (ZWNJ) and U+200D (ZWJ) are intentionally preserved because
        # they are semantically significant in Brahmic/Indic scripts (e.g

detect_script function · python · L43-L67 (25 LOC)

src/aumai_linguaforge/scripts.py

def detect_script(text: str) -> str:
    """Return the name of the script that most characters of ``text`` use.

    Each character's code point is looked up in ``SCRIPT_RANGES``; the first
    range containing it determines the character's script. Characters that
    fall in no range are ignored.

    Args:
        text: The input text to analyse.

    Returns:
        The script name with the highest character count, or ``'Unknown'``
        when no character matched any range. On a tie, the script that was
        encountered first in the text wins.
    """
    tally: dict[str, int] = {}

    for code_point in map(ord, text):
        # First matching range wins, mirroring a linear scan with early exit.
        matched = next(
            (name for low, high, name in SCRIPT_RANGES if low <= code_point <= high),
            None,
        )
        if matched is not None:
            tally[matched] = tally.get(matched, 0) + 1

    if tally:
        return max(tally, key=tally.__getitem__)
    return "Unknown"