Function bodies 11 total
detect function · python · L37-L45 (9 LOC)src/aumai_linguaforge/cli.py
def detect(input_file: Path, top_k: int) -> None:
    """Detect the language(s) of a text file."""
    # NOTE(review): the docstring is kept verbatim. If this function is
    # registered as a click command (its decorators are not visible here),
    # the docstring doubles as the user-facing --help text.
    #
    # input_file is read as UTF-8; top_k is forwarded unchanged to the
    # module-level detector as the number of candidates to report.
    content = input_file.read_text(encoding="utf-8")
    for candidate in _detector.detect_multiple(content, top_k=top_k):
        # One line per candidate: code, name left-aligned in 20 columns,
        # confidence as a percentage with two decimals, then the script.
        line = (
            f"{candidate.language.code} {candidate.language.name:<20} "
            f"confidence={candidate.confidence:.2%} script={candidate.language.script}"
        )
        click.echo(line)
# [listing] tokenize function · python · L57-L63 (7 LOC) src/aumai_linguaforge/cli.py
def tokenize(input_file: Path, language: str | None) -> None:
    """Tokenize a text file."""
    # NOTE(review): the docstring is kept verbatim. If this function is
    # registered as a click command (its decorators are not visible here),
    # the docstring doubles as the user-facing --help text.
    #
    # The file is read as UTF-8 and handed to the module-level tokenizer;
    # `language` is forwarded as-is (None is passed through unchanged).
    content = input_file.read_text(encoding="utf-8")
    outcome = _tokenizer.tokenize(content, language=language)

    # Report: the language attached to the result, the token count, and
    # finally the tokens themselves joined with " | ".
    click.echo(f"Language: {outcome.language.name} ({outcome.language.code})")
    token_count = len(outcome.tokens)
    click.echo(f"Tokens ({token_count}):")
    joined = " | ".join(outcome.tokens)
    click.echo(joined)
# [listing] LanguageDetector.detect method · python · L162-L175 (14 LOC) src/aumai_linguaforge/core.py
def detect(self, text: str) -> DetectionResult:
    """Return the single best language guess for the text.

    This is a convenience wrapper: it asks ``detect_multiple`` for exactly
    one candidate and hands back that candidate.

    Args:
        text: Input text to analyse.

    Returns:
        The first DetectionResult produced by
        ``detect_multiple(text, top_k=1)``.

    Raises:
        IndexError: If ``detect_multiple`` returns an empty list.
    """
    return self.detect_multiple(text, top_k=1)[0]
# [listing] LanguageDetector.detect_multiple method · python · L177-L204 (28 LOC) src/aumai_linguaforge/core.py
def detect_multiple(self, text: str, top_k: int = 3) -> list[DetectionResult]:
"""Detect top-k candidate languages for the text.
Args:
text: Input text to analyse.
top_k: Number of candidate languages to return.
Returns:
List of DetectionResult objects sorted by descending confidence.
"""
script = detect_script(text)
candidates: list[tuple[str, float]] = []
if script in _SCRIPT_TO_LANG:
primary_code = _SCRIPT_TO_LANG[script]
candidates.append((primary_code, 0.90))
elif script == "Latin":
candidates = self._latin_heuristics(text)
else:
candidates = [("en", 0.30)]
candidates.sort(key=lambda item: item[1], reverse=True)
results: list[DetectionResult] = []
for code, confidence in candidates[:top_k]:
lang = SUPPORTED_LANGUAGES.get(code, SUPPORTED_LANGUAGES["en"])
results.append(DeteLanguageDetector._latin_heuristics method · python · L206-L231 (26 LOC)src/aumai_linguaforge/core.py
def _latin_heuristics(self, text: str) -> list[tuple[str, float]]:
"""Use common word patterns to distinguish Latin-script languages."""
text_lower = text.lower()
scores: dict[str, float] = {
"en": 0.40,
"es": 0.05,
"fr": 0.05,
"de": 0.05,
"pt": 0.05,
}
# Simple marker words for common languages
markers: dict[str, list[str]] = {
"en": ["the", "and", "is", "are", "was", "of", "in", "to"],
"es": ["el", "la", "los", "las", "de", "en", "que", "es"],
"fr": ["le", "la", "les", "de", "du", "et", "est", "une"],
"de": ["der", "die", "das", "und", "ist", "ich", "ein", "nicht"],
"pt": ["o", "a", "os", "as", "de", "e", "do", "da"],
}
words = re.findall(r"\b\w+\b", text_lower)
word_set = set(words)
for lang_code, word_list in markers.items():
hits = sum(1 for w in word_list if w in wordTokenizer.tokenize method · python · L237-L261 (25 LOC)src/aumai_linguaforge/core.py
def tokenize(self, text: str, language: str | None = None) -> TokenizationResult:
    """Tokenize text into tokens.

    The dominant script of the text (as reported by ``detect_script``)
    selects the strategy: for 'CJK', 'Hiragana' and 'Katakana' every
    remaining character becomes its own token; everything else goes
    through ``_unicode_tokenize``.

    Args:
        text: The text to tokenize.
        language: Optional BCP-47 language code. When None, the language is
            detected from the text. A code that is not a key of
            ``SUPPORTED_LANGUAGES`` falls back to the 'en' entry.

    Returns:
        TokenizationResult carrying the original text, the token list and
        the language that was supplied or detected.
    """
    if language is None:
        lang = LanguageDetector().detect(text).language
    else:
        lang = SUPPORTED_LANGUAGES.get(language, SUPPORTED_LANGUAGES["en"])

    if detect_script(text) in {"CJK", "Hiragana", "Katakana"}:
        # Drop every separator before splitting into characters. Removing
        # only U+0020 let newlines, tabs, ideographic spaces (U+3000) and a
        # leading BOM leak through as tokens. The character class is the
        # same one _unicode_tokenize splits on, so both branches agree on
        # what counts as a separator.
        tokens = list(re.sub(r"[\s\u200b\u200c\u200d\u2060\ufeff]+", "", text))
    else:
        tokens = self._unicode_tokenize(text)
    return TokenizationResult(text=text, tokens=tokens, language=lang)
# [listing] Tokenizer._unicode_tokenize method · python · L263-L274 (12 LOC) src/aumai_linguaforge/core.py
def _unicode_tokenize(self, text: str) -> list[str]:
"""Split on whitespace and punctuation boundaries."""
# Split on any sequence of whitespace or punctuation characters
raw_tokens = re.split(r"[\s\u200b\u200c\u200d\u2060\ufeff]+", text.strip())
result: list[str] = []
for token in raw_tokens:
if not token:
continue
# Further split on ASCII punctuation boundaries
sub = re.split(r"(?<=[^\W\d_])(?=[\W\d_])|(?<=[\W\d_])(?=[^\W\d_])", token)
result.extend(t for t in sub if t and not re.fullmatch(r"\s+", t))
return resultRepobility · MCP-ready · https://repobility.com
Transliterator.transliterate method · python · L305-L342 (38 LOC)src/aumai_linguaforge/core.py
def transliterate(
self,
text: str,
source_script: str,
target_script: str,
) -> TransliterationResult:
"""Transliterate text from one script to another.
Args:
text: Source text.
source_script: Name of source script (e.g. 'Devanagari').
target_script: Name of target script (e.g. 'Latin').
Returns:
TransliterationResult containing the converted text.
Raises:
ValueError: If the script pair is not supported.
"""
source_norm = source_script.lower()
target_norm = target_script.lower()
if source_norm == "devanagari" and target_norm == "latin":
result = self._devanagari_to_latin(text)
elif source_norm == "latin" and target_norm == "devanagari":
result = self._latin_to_devanagari(text)
else:
raise ValueError(
f"Transliteration from '{source_script}' to '{target_Transliterator._latin_to_devanagari method · python · L350-L356 (7 LOC)src/aumai_linguaforge/core.py
def _latin_to_devanagari(self, text: str) -> str:
    """Replace Latin sequences in ``text`` using ``_LATIN_TO_DEVANAGARI``.

    Every key of the mapping is substituted across the whole string, one
    key at a time, longest keys first, so a multi-character sequence is
    consumed before any shorter key that it contains.

    Args:
        text: The text to convert.

    Returns:
        The text after all substitutions have been applied.
    """
    converted = text
    # Longest-first ordering; keys of equal length keep the mapping's order.
    for latin in sorted(_LATIN_TO_DEVANAGARI, key=len, reverse=True):
        converted = converted.replace(latin, _LATIN_TO_DEVANAGARI[latin])
    return converted
# [listing] TextNormalizer.normalize method · python · L362-L398 (37 LOC) src/aumai_linguaforge/core.py
def normalize(self, text: str, language: str) -> str:
"""Normalize text for the given language.
Performs:
- Unicode NFC normalization
- Whitespace normalization (collapse runs, strip edges)
- Script-specific zero-width character cleanup
- Devanagari: normalize chandrabindu variants
Args:
text: Input text to normalize.
language: BCP-47 language code.
Returns:
Normalized string.
"""
# Unicode NFC normalization
normalized = unicodedata.normalize("NFC", text)
# Collapse whitespace
normalized = re.sub(r"[ \t]+", " ", normalized)
normalized = re.sub(r"\n{3,}", "\n\n", normalized)
normalized = normalized.strip()
# Remove zero-width characters that have no linguistic meaning.
# U+200C (ZWNJ) and U+200D (ZWJ) are intentionally preserved because
# they are semantically significant in Brahmic/Indic scripts (e.gdetect_script function · python · L43-L67 (25 LOC)src/aumai_linguaforge/scripts.py
def detect_script(text: str) -> str:
    """Return the name of the script that most characters of the text use.

    Each character's code point is looked up in ``SCRIPT_RANGES``; the
    first range that contains it decides the character's script.
    Characters that fall in no range are ignored. When several scripts
    share the highest count, the one seen first in the text wins.

    Args:
        text: The input text to analyse.

    Returns:
        The dominant script name (e.g. 'Devanagari', 'Latin', 'CJK'), or
        'Unknown' when no character belongs to any known range.
    """
    tally: dict[str, int] = {}
    for char in text:
        code_point = ord(char)
        # First matching range wins; None means "not in any known script".
        matched = next(
            (name for low, high, name in SCRIPT_RANGES if low <= code_point <= high),
            None,
        )
        if matched is not None:
            tally[matched] = tally.get(matched, 0) + 1
    return max(tally, key=tally.get) if tally else "Unknown"