← back to domovinatv__fetch.domovina.tv

Function bodies 195 total

All specs Real LLM only Function bodies
format_srt_time function · python · L47-L59 (13 LOC)
colab_canary/transcribe_canary.py
def format_srt_time(seconds: float) -> str:
    """Convert a duration in seconds to an SRT timestamp (``HH:MM:SS,mmm``).

    Negative inputs are clamped to zero. The millisecond field is the
    microsecond component of the ``timedelta`` truncated to whole
    milliseconds.
    """
    clamped = max(0.0, seconds)
    span = datetime.timedelta(seconds=clamped)

    # Peel seconds, then minutes, off the whole-second total.
    minutes_total, secs = divmod(int(span.total_seconds()), 60)
    hours, minutes = divmod(minutes_total, 60)
    millis = span.microseconds // 1000

    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
generate_srt_content function · python · L62-L73 (12 LOC)
colab_canary/transcribe_canary.py
def generate_srt_content(segment_timestamps: list) -> str:
    """Build SRT-formatted text from a list of segment timestamps.

    Each entry is read through its ``'start'``, ``'end'`` and ``'segment'``
    keys. Cues are numbered from 1 and each cue is followed by an empty line.
    """
    pieces = []
    for number, entry in enumerate(segment_timestamps, start=1):
        begin = format_srt_time(entry['start'])
        finish = format_srt_time(entry['end'])
        pieces.extend((
            str(number),
            f"{begin} --> {finish}",
            entry['segment'],
            "",
        ))
    return "\n".join(pieces)
sec_to_hms function · python · L76-L79 (4 LOC)
colab_canary/transcribe_canary.py
def sec_to_hms(seconds: float) -> str:
    """Convert seconds to the ``timedelta`` text form used in the CSV.

    The value is rounded to whole seconds first. The result is
    ``str(timedelta)``, i.e. ``H:MM:SS`` with an unpadded hour field, and a
    ``"N day(s), "`` prefix once the value reaches 24 hours.
    """
    whole_seconds = round(seconds)
    span = datetime.timedelta(seconds=whole_seconds)
    return str(span)
format_duration function · python · L82-L87 (6 LOC)
colab_canary/transcribe_canary.py
def format_duration(seconds: float) -> str:
    """Format a duration in seconds as a readable ``"<h>h <m>m <s>s"`` string.

    Each field is truncated to an integer; the hour field is not capped.
    """
    hours = int(seconds // 3600)
    minutes = int(seconds % 3600 // 60)
    secs = int(seconds % 60)
    return f"{hours}h {minutes}m {secs}s"
parse_args function · python · L92-L138 (47 LOC)
colab_canary/transcribe_canary.py
def parse_args():
    parser = argparse.ArgumentParser(
        description="🐤 NVIDIA Canary 1B v2 — Transkripcija na Colab/Kaggle GPU",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Primjeri:
  # Google Colab (nakon mount Google Drive)
  !python transcribe_canary.py --input-dir /content/drive/MyDrive/wav_files

  # Kaggle
  !python transcribe_canary.py --input-dir /kaggle/input/my-dataset --output-dir /kaggle/working

  # Lokalno
  python transcribe_canary.py --input-dir ./wav_files --dry-run
"""
    )

    parser.add_argument(
        "--input-dir", required=True,
        help="Direktorij s WAV datotekama za transkripciju"
    )
    parser.add_argument(
        "--output-dir", default=None,
        help="Direktorij za output (default: isti kao input-dir)"
    )
    parser.add_argument(
        "--source-lang", default="hr",
        help="Izvorni jezik — ISO kod (default: hr za Hrvatski)"
    )
    parser.add_argument(
        "--target-lang", defaul
find_wav_files function · python · L141-L147 (7 LOC)
colab_canary/transcribe_canary.py
def find_wav_files(input_dir: str) -> list:
    """Recursively collect the WAV files below ``input_dir``.

    Args:
        input_dir: Root directory to search; subdirectories are included.

    Returns:
        A sorted list of path strings. Only regular files are returned, the
        ``.wav`` extension is matched case-insensitively, and names starting
        with ``._`` (macOS resource-fork companions) are ignored.
    """
    wav_files = []
    for candidate in Path(input_dir).rglob("*"):
        name = candidate.name
        if name.startswith("._"):
            continue
        # Match the extension case-insensitively so "clip.WAV" is not missed.
        if not name.lower().endswith(".wav"):
            continue
        # A directory that happens to be named "something.wav" is not audio.
        if not candidate.is_file():
            continue
        wav_files.append(str(candidate))
    return sorted(wav_files)
has_canary_transcript function · python · L150-L156 (7 LOC)
colab_canary/transcribe_canary.py
def has_canary_transcript(wav_file: str, output_dir: str) -> bool:
    """Report whether a Canary SRT transcript already exists for ``wav_file``.

    The SRT is looked up next to the WAV file (same directory, WAV file name
    plus ``CANARY_SRT_SUFFIX``). ``output_dir`` is accepted but not used.
    """
    folder, file_name = os.path.split(wav_file)
    candidate = os.path.join(folder, file_name + CANARY_SRT_SUFFIX)
    return os.path.exists(candidate)
Repobility · severity-and-effort ranking · https://repobility.com
install_dependencies function · python · L159-L167 (9 LOC)
colab_canary/transcribe_canary.py
def install_dependencies():
    """Make sure the NeMo ASR toolkit is importable, installing it if needed.

    When the import fails, ``nemo_toolkit[asr]`` is installed with pip run
    through the current interpreter (``sys.executable -m pip``), so the
    package lands in the environment this script is running in rather than
    whichever ``pip`` is first on ``PATH``. The final message reflects pip's
    exit status; a failed install is reported but not raised.
    """
    import subprocess
    import sys

    try:
        import nemo.collections.asr  # noqa: F401
    except ImportError:
        pass
    else:
        print("   ✅ NeMo toolkit je već instaliran")
        return

    print("   📦 Instaliram NeMo toolkit...")
    # Argument list (no shell) avoids quoting issues around "[asr]".
    outcome = subprocess.run(
        [sys.executable, "-m", "pip", "install", "-U", "nemo_toolkit[asr]"],
        check=False,
    )
    if outcome.returncode == 0:
        print("   ✅ NeMo instaliran")
    else:
        print(f"   ❌ Instalacija NeMo toolkita nije uspjela (pip exit code {outcome.returncode})")
load_model function · python · L170-L192 (23 LOC)
colab_canary/transcribe_canary.py
def load_model():
    """Load the ``nvidia/canary-1b-v2`` ASR model and put it in eval mode.

    On a CUDA device that reports BF16 support the model is converted to
    ``torch.bfloat16``. A warning is printed when no GPU is available.

    Returns:
        Tuple ``(model, device)`` where ``device`` is ``"cuda"`` or ``"cpu"``.
    """
    import torch
    from nemo.collections.asr.models import ASRModel

    gpu_available = torch.cuda.is_available()
    device = "cuda" if gpu_available else "cpu"
    print(f"   🖥️  Uređaj: {device.upper()}")

    if not gpu_available:
        print("   ⚠️  UPOZORENJE: GPU nije dostupan! Transkripcija će biti JAKO spora.")
        print("      💡 Na Colab/Kaggle: Runtime → Change runtime type → T4 GPU")

    print("   📥 Učitavam nvidia/canary-1b-v2 model (ovo traje ~1-2min prvi put)...")
    model = ASRModel.from_pretrained(model_name="nvidia/canary-1b-v2")
    model.eval()

    # Only query BF16 support when running on CUDA.
    if gpu_available and torch.cuda.is_bf16_supported():
        model = model.to(torch.bfloat16)
        print("   ⚡ BF16 optimizacija aktivna")
    print("   ✅ Model učitan")

    return model, device
transcribe_single_file function · python · L195-L272 (78 LOC)
colab_canary/transcribe_canary.py
def transcribe_single_file(model, wav_file: str, output_dir: str,
                           source_lang: str, target_lang: str) -> dict:
    """
    Transkribira jednu WAV datoteku i sprema SRT + CSV.
    NIKADA ne prepisuje postojeće datoteke.
    """
    import torch

    wav_dir = os.path.dirname(wav_file)
    basename = os.path.basename(wav_file)
    srt_output = os.path.join(wav_dir, basename + CANARY_SRT_SUFFIX)
    csv_output = os.path.join(wav_dir, basename + CANARY_CSV_SUFFIX)

    # Sigurnosna provjera
    if os.path.exists(srt_output):
        return {"status": "skipped", "reason": "canary SRT already exists"}

    file_size_mb = os.path.getsize(wav_file) / (1024 * 1024)
    print(f"      ⏳ Transkribriram ({file_size_mb:.1f} MB)...")

    start_time = time.time()

    try:
        # Pokreni transkripciju s timestampovima (inference_mode smanjuje overhead)
        with torch.inference_mode():
            output = model.transcribe(
                [wav_file],
                
main function · python · L275-L388 (114 LOC)
colab_canary/transcribe_canary.py
def main():
    args = parse_args()

    input_dir = args.input_dir
    output_dir = args.output_dir or input_dir
    source_lang = args.source_lang
    target_lang = args.target_lang

    print("╔══════════════════════════════════════════════════╗")
    print("║   🐤 CANARY 1B v2 — DIRECT GPU TRANSKRIPCIJA    ║")
    print("║   Google Colab / Kaggle                         ║")
    print("╚══════════════════════════════════════════════════╝")
    print(f"   📂 Input:  {input_dir}")
    print(f"   💾 Output: {output_dir}")
    print(f"   🗣️  Izvorni jezik: {source_lang}")
    print(f"   💬 Ciljni jezik: {target_lang}")
    if args.dry_run:
        print("   ⚠️  DRY RUN — samo prikaz, bez transkripcije")
    print("")

    # Provjeri direktorije
    if not os.path.isdir(input_dir):
        print(f"❌ Input direktorij ne postoji: {input_dir}")
        sys.exit(1)

    os.makedirs(output_dir, exist_ok=True)

    # Pronađi WAV datoteke
    if args.file:
        if not os.path.isfile(args.file):
parse_srt function · python · L44-L73 (30 LOC)
colab_diarize/diarize_canary.py
def parse_srt(srt_path):
    """Parse an SRT file into a list of segment dicts.

    Each dict has ``"index"`` (int), ``"start"`` and ``"end"`` (seconds, via
    ``timestamp_to_seconds``) and ``"text"`` (stripped cue text). The file is
    read as UTF-8.
    """
    with open(srt_path, "r", encoding="utf-8") as handle:
        raw = handle.read()

    # index line, timing line, then text up to a blank line or the next index.
    cue_re = re.compile(
        r"(\d+)\s*\n"
        r"(\d{2}:\d{2}:\d{2}[,.]\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2}[,.]\d{3})\s*\n"
        r"((?:(?!\n\n|\n\d+\s*\n).)*)",
        re.DOTALL
    )

    return [
        {
            "index": int(number),
            "start": timestamp_to_seconds(begin.replace(",", ".")),
            "end": timestamp_to_seconds(finish.replace(",", ".")),
            "text": body.strip(),
        }
        for number, begin, finish, body in cue_re.findall(raw)
    ]
timestamp_to_seconds function · python · L76-L84 (9 LOC)
colab_diarize/diarize_canary.py
def timestamp_to_seconds(ts):
    """Convert an ``HH:MM:SS[.fff]`` timestamp to seconds.

    Args:
        ts: Timestamp string. The fractional part is optional, may be
            separated by ``.`` or ``,`` and may have any number of digits.

    Returns:
        The timestamp as a float number of seconds.

    Raises:
        ValueError: If a field is not a valid integer.
        IndexError: If fewer than three ``:``-separated fields are present.
    """
    parts = ts.split(":")
    hours = int(parts[0])
    minutes = int(parts[1])

    # Accept the SRT comma as well as a dot before the fraction.
    sec_fields = parts[2].replace(",", ".").split(".")
    whole = int(sec_fields[0])
    digits = sec_fields[1].strip() if len(sec_fields) > 1 else ""

    # Scale by the digit count so "1.5" means 1.5 s, not 1 s + 5 ms.
    fraction = int(digits) / 10 ** len(digits) if digits else 0.0
    return hours * 3600 + minutes * 60 + whole + fraction
seconds_to_srt_timestamp function · python · L87-L96 (10 LOC)
colab_diarize/diarize_canary.py
def seconds_to_srt_timestamp(sec):
    """Convert seconds to an SRT timestamp (``HH:MM:SS,mmm``).

    Args:
        sec: Time in seconds; negative values are clamped to zero.

    Returns:
        The timestamp string, rounded to the nearest millisecond.
    """
    sec = max(0.0, sec)
    # Round once to whole milliseconds and derive every field from that
    # integer, so float error cannot drop a millisecond (1.001 -> ",000") or
    # make the seconds and millisecond fields disagree.
    total_ms = int(round(sec * 1000))
    total_seconds, millis = divmod(total_ms, 1000)
    minutes_total, secs = divmod(total_seconds, 60)
    hours, minutes = divmod(minutes_total, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
format_duration function · python · L99-L104 (6 LOC)
colab_diarize/diarize_canary.py
def format_duration(seconds):
    """Format a duration in seconds as a readable ``"<h>h <m>m <s>s"`` string.

    Each field is truncated to an integer; the hour field is not capped.
    """
    fields = (seconds // 3600, (seconds % 3600) // 60, seconds % 60)
    hours, minutes, secs = (int(field) for field in fields)
    return f"{hours}h {minutes}m {secs}s"
Repobility (the analyzer behind this table) · https://repobility.com
install_dependencies function · python · L109-L117 (9 LOC)
colab_diarize/diarize_canary.py
def install_dependencies():
    """Make sure ``pyannote.audio`` is importable, installing it if needed.

    When the import fails, the package is installed with pip run through the
    current interpreter (``sys.executable -m pip``), so it lands in the
    environment this script is running in rather than whichever ``pip`` is
    first on ``PATH``. The final message reflects pip's exit status; a failed
    install is reported but not raised.
    """
    import subprocess
    import sys

    try:
        import pyannote.audio  # noqa: F401
    except ImportError:
        pass
    else:
        print("   pyannote.audio je već instaliran")
        return

    print("   Instaliram pyannote.audio...")
    outcome = subprocess.run(
        [sys.executable, "-m", "pip", "install", "pyannote.audio"],
        check=False,
    )
    if outcome.returncode == 0:
        print("   pyannote.audio instaliran")
    else:
        print(f"   Instalacija pyannote.audio nije uspjela (pip exit code {outcome.returncode})")
get_hf_token function · python · L120-L151 (32 LOC)
colab_diarize/diarize_canary.py
def get_hf_token(args_token):
    """Dohvaća HuggingFace token iz argumenata, Colab secrets, ili env varijable."""
    # 1. CLI argument
    if args_token:
        return args_token

    # 2. Colab secrets (userdata)
    try:
        from google.colab import userdata
        token = userdata.get("HF_TOKEN")
        if token:
            print("   HF token učitan iz Colab Secrets")
            return token
    except (ImportError, Exception):
        pass

    # 3. Environment varijabla
    token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN")
    if token:
        print("   HF token učitan iz environment varijable")
        return token

    print("   HuggingFace token nije pronađen!")
    print("   Opcije:")
    print("     1. Colab Secrets: dodaj HF_TOKEN u Secrets (lijevi panel)")
    print("     2. CLI: --hf-token TVOJ_TOKEN")
    print("     3. Environment: export HF_TOKEN=TVOJ_TOKEN")
    print("")
    print("   Token je potreban za pyannote/speaker-diarizatio
load_diarization_pipeline function · python · L154-L181 (28 LOC)
colab_diarize/diarize_canary.py
def load_diarization_pipeline(hf_token):
    """Učitava pyannote community-1 diarization pipeline na GPU (~9.5 GB VRAM)."""
    import torch
    from pyannote.audio import Pipeline

    if torch.cuda.is_available():
        device = "cuda"
        vram_gb = torch.cuda.get_device_properties(0).total_memory / (1024**3)
        gpu_name = torch.cuda.get_device_name(0)
        print(f"   Uređaj: CUDA — {gpu_name} ({vram_gb:.1f} GB VRAM)")
    elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
        device = "mps"
        print(f"   Uređaj: MPS (Apple Silicon)")
    else:
        device = "cpu"
        print("   Uređaj: CPU")
        print("   UPOZORENJE: GPU nije dostupan! Diarizacija će biti spora.")
        print("   Na Colabu: Runtime > Change runtime type > T4 GPU")

    print("   Učitavam pyannote/speaker-diarization-community-1 model...")
    pipeline = Pipeline.from_pretrained(
        "pyannote/speaker-diarization-community-1",
        token=hf_token
    )
run_diarization function · python · L184-L236 (53 LOC)
colab_diarize/diarize_canary.py
def run_diarization(pipeline, wav_file, min_speakers=None, max_speakers=None):
    """Pokreće pyannote community-1 diarizaciju na jednom WAV fajlu.

    Koristi exclusive_speaker_diarization mode koji daje točno jednog govornika
    u svakom trenutku (bez overlapa), idealno za alignment s SRT titlovima.
    """
    import torch
    import soundfile as sf

    # Učitaj audio
    data, sample_rate = sf.read(wav_file)
    waveform = torch.from_numpy(data).float().unsqueeze(0)

    # Parametri
    diarize_params = {}
    if min_speakers is not None:
        diarize_params["min_speakers"] = min_speakers
    if max_speakers is not None:
        diarize_params["max_speakers"] = max_speakers

    # Pokreni diarizaciju
    audio_input = {"waveform": waveform, "sample_rate": sample_rate}
    result = pipeline(audio_input, **diarize_params)

    # Oslobodi waveform iz memorije (100+ MB za velike fajlove)
    del waveform, data, audio_input

    # community-1 (pyannote 4.x) vraća DiarizeOutput dat
assign_speakers function · python · L239-L256 (18 LOC)
colab_diarize/diarize_canary.py
def assign_speakers(srt_segments, speaker_segments):
    """Label every SRT segment with the speaker that overlaps it the most.

    Each segment dict gets a ``"speaker"`` key set in place. Segments with no
    positive overlap are labelled ``"UNKNOWN"``; on a tie the earlier entry
    of ``speaker_segments`` wins. The same ``srt_segments`` list is returned.
    """
    for cue in srt_segments:
        winner, winning_span = "UNKNOWN", 0.0

        for turn in speaker_segments:
            window_start = max(cue["start"], turn["start"])
            window_end = min(cue["end"], turn["end"])
            shared = window_end - window_start
            # A negative value means the two intervals do not intersect.
            if shared > winning_span:
                winner, winning_span = turn["speaker"], shared

        cue["speaker"] = winner

    return srt_segments
write_diarized_srt function · python · L259-L270 (12 LOC)
colab_diarize/diarize_canary.py
def write_diarized_srt(segments, output_path):
    """Write ``segments`` to ``output_path`` as an SRT with speaker labels.

    Cues are renumbered from 1. Each text line is prefixed with
    ``[<speaker>]``, using ``"UNKNOWN"`` when a segment has no ``"speaker"``
    key. The file is written as UTF-8.
    """
    with open(output_path, "w", encoding="utf-8") as out:
        for number, cue in enumerate(segments, start=1):
            begin = seconds_to_srt_timestamp(cue["start"])
            finish = seconds_to_srt_timestamp(cue["end"])
            label = cue.get("speaker", "UNKNOWN")

            out.write(f"{number}\n")
            out.write(f"{begin} --> {finish}\n")
            # Text line plus the blank separator line.
            out.write(f"[{label}] {cue['text']}\n\n")
_worker_init function · python · L283-L328 (46 LOC)
colab_diarize/diarize_canary.py
def _worker_init(hf_token, min_speakers, max_speakers, threads_per_worker=2,
                 rclone_dest=None, drive_mount=None, input_dir=None):
    """Inicijalizacija worker procesa — svaki učitava vlastiti pyannote pipeline.

    Na CPU-only stroju, ograničava PyTorch/MKL/OMP threadove po workeru
    da spriječi oversubscription (npr. 40 workera × 80 threadova = 3200 threadova na 80 CPU).
    """
    global _worker_pipeline, _worker_min_speakers, _worker_max_speakers

    # Suppress torchcodec/pyannote warnings u worker procesima
    import warnings
    warnings.filterwarnings("ignore", message="torchcodec is not installed")
    warnings.filterwarnings("ignore", message="std\\(\\): degrees of freedom")

    # VAŽNO: env vars moraju biti postavljene PRIJE import torch,
    # jer PyTorch/OMP/MKL čitaju ih pri inicijalizaciji
    os.environ["OMP_NUM_THREADS"] = str(threads_per_worker)
    os.environ["MKL_NUM_THREADS"] = str(threads_per_worker)
    os.environ["OPENBLAS_NUM_THREADS"] = 
_worker_diarize function · python · L331-L428 (98 LOC)
colab_diarize/diarize_canary.py
def _worker_diarize(wav_file):
    """Worker funkcija: diarizira jedan fajl. Vraća (wav_file, result)."""
    global _worker_pipeline, _worker_min_speakers, _worker_max_speakers
    global _worker_rclone_dest, _worker_drive_mount, _worker_input_dir
    import threading

    wav_dir = os.path.dirname(wav_file)
    basename = os.path.basename(wav_file)
    srt_input = os.path.join(wav_dir, basename + CANARY_SRT_SUFFIX)
    diarized_output = os.path.join(wav_dir, basename + DIARIZED_SRT_SUFFIX)
    pid = os.getpid()

    if os.path.exists(diarized_output):
        return wav_file, {"status": "skipped", "reason": "already exists"}
    if not os.path.exists(srt_input):
        return wav_file, {"status": "skipped", "reason": "no .canary.srt"}

    # Distributed lock provjera u workeru
    use_lock = bool(_worker_rclone_dest or _worker_drive_mount)
    if use_lock:
        lock_status = None
        if _worker_rclone_dest:
            lock_status = _lock_exists_remote(wav_file, _worker_input
Methodology: Repobility · https://repobility.com/research/state-of-ai-code-2026/
_get_hostname function · python · L437-L440 (4 LOC)
colab_diarize/diarize_canary.py
def _get_hostname():
    """Return this machine's hostname, used to identify who holds a lock."""
    from socket import gethostname
    return gethostname()
_remote_path_for_wav function · python · L443-L446 (4 LOC)
colab_diarize/diarize_canary.py
def _remote_path_for_wav(wav_file, input_dir, rclone_dest):
    """Build the rclone remote path for ``wav_file``.

    The result is ``rclone_dest`` joined by ``/`` with the path of
    ``wav_file`` relative to ``input_dir``.
    """
    relative = os.path.relpath(wav_file, input_dir)
    return "{}/{}".format(rclone_dest, relative)
_lock_exists_remote function · python · L449-L490 (42 LOC)
colab_diarize/diarize_canary.py
def _lock_exists_remote(wav_file, input_dir, rclone_dest):
    """Provjeri postoji li .lock ili .diarized.srt na remote-u (rclone).
    Vraća: 'diarized', 'locked', ili None.
    """
    import subprocess

    basename = os.path.basename(wav_file)
    rel_dir = os.path.relpath(os.path.dirname(wav_file), input_dir)
    remote_dir = f"{rclone_dest}/{rel_dir}" if rel_dir != "." else rclone_dest

    # Provjeri .canary.diarized.srt
    diarized_remote = f"{remote_dir}/{basename}{DIARIZED_SRT_SUFFIX}"
    ret = subprocess.run(
        ["rclone", "ls", diarized_remote, "--max-depth", "1"],
        capture_output=True, text=True, timeout=30
    )
    if ret.returncode == 0 and ret.stdout.strip():
        return "diarized"

    # Provjeri .canary.lock
    lock_remote = f"{remote_dir}/{basename}{LOCK_SUFFIX}"
    ret = subprocess.run(
        ["rclone", "ls", lock_remote, "--max-depth", "1"],
        capture_output=True, text=True, timeout=30
    )
    if ret.returncode == 0 and ret.stdout.stri
_lock_exists_mount function · python · L493-L520 (28 LOC)
colab_diarize/diarize_canary.py
def _lock_exists_mount(wav_file, drive_mount):
    """Provjeri postoji li .lock ili .diarized.srt na mountanom Drive-u.
    Vraća: 'diarized', 'locked', ili None.
    """
    basename = os.path.basename(wav_file)
    # Pronađi odgovarajući direktorij na mountu — koristi isti relativni path
    # wav_file path sadrži kanal/filename, trebamo mapirati to na drive_mount
    # drive_mount je root dir na Drive-u, wav_file struktura je input_dir/kanal/file.wav
    # Za mount mode: wav_file JE na Drive-u, pa lock i diarized su u istom direktoriju
    wav_dir = os.path.dirname(wav_file)

    diarized_path = os.path.join(wav_dir, basename + DIARIZED_SRT_SUFFIX)
    if os.path.exists(diarized_path):
        return "diarized"

    lock_path = os.path.join(wav_dir, basename + LOCK_SUFFIX)
    if os.path.exists(lock_path):
        try:
            with open(lock_path, "r") as f:
                lock_ts = float(f.readline().strip())
            if time.time() - lock_ts > LOCK_STALE_SECONDS:
         
_create_lock_remote function · python · L523-L544 (22 LOC)
colab_diarize/diarize_canary.py
def _create_lock_remote(wav_file, input_dir, rclone_dest):
    """Create the lock file for ``wav_file`` on the rclone remote.

    A local temporary file holding the current timestamp and the hostname
    (one per line) is copied with ``rclone copyto`` to
    ``<remote dir>/<wav file name><LOCK_SUFFIX>``. The temporary file is
    always deleted afterwards. rclone's exit status is not checked.
    """
    import subprocess
    import tempfile

    wav_name = os.path.basename(wav_file)
    sub_dir = os.path.relpath(os.path.dirname(wav_file), input_dir)
    # Files directly inside input_dir map to the remote root itself.
    if sub_dir == ".":
        remote_dir = rclone_dest
    else:
        remote_dir = f"{rclone_dest}/{sub_dir}"
    lock_remote = f"{remote_dir}/{wav_name}{LOCK_SUFFIX}"

    # delete=False: the file must outlive the ``with`` so rclone can read it.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".lock", delete=False) as tmp:
        tmp.write(f"{time.time()}\n{_get_hostname()}\n")
        tmp_path = tmp.name

    command = ["rclone", "copyto", tmp_path, lock_remote, "--quiet"]
    try:
        subprocess.run(command, capture_output=True, timeout=30)
    finally:
        os.unlink(tmp_path)
_create_lock_mount function · python · L547-L552 (6 LOC)
colab_diarize/diarize_canary.py
def _create_lock_mount(wav_file):
    """Create the lock file for ``wav_file`` next to it on the mounted Drive.

    The file is named ``<wav file name><LOCK_SUFFIX>`` and holds the current
    timestamp and the hostname, one per line. An existing lock file is
    overwritten.
    """
    folder, wav_name = os.path.split(wav_file)
    lock_path = os.path.join(folder, wav_name + LOCK_SUFFIX)
    payload = f"{time.time()}\n{_get_hostname()}\n"
    with open(lock_path, "w") as lock_file:
        lock_file.write(payload)
_remove_lock_remote function · python · L555-L567 (13 LOC)
colab_diarize/diarize_canary.py
def _remove_lock_remote(wav_file, input_dir, rclone_dest):
    """Delete the lock file for ``wav_file`` from the rclone remote.

    Runs ``rclone deletefile`` on ``<remote dir>/<wav file name><LOCK_SUFFIX>``
    with a 30 second timeout. rclone's exit status is not checked.
    """
    import subprocess

    wav_name = os.path.basename(wav_file)
    sub_dir = os.path.relpath(os.path.dirname(wav_file), input_dir)
    # Files directly inside input_dir map to the remote root itself.
    if sub_dir == ".":
        remote_dir = rclone_dest
    else:
        remote_dir = f"{rclone_dest}/{sub_dir}"
    lock_remote = f"{remote_dir}/{wav_name}{LOCK_SUFFIX}"

    command = ["rclone", "deletefile", lock_remote, "--quiet"]
    subprocess.run(command, capture_output=True, timeout=30)
_remove_lock_mount function · python · L570-L577 (8 LOC)
colab_diarize/diarize_canary.py
def _remove_lock_mount(wav_file):
    """Delete the lock file for ``wav_file`` from the mounted Drive.

    Any ``OSError`` from the removal (for example a lock that is already
    gone) is ignored on purpose.
    """
    folder, wav_name = os.path.split(wav_file)
    lock_path = os.path.join(folder, wav_name + LOCK_SUFFIX)
    try:
        os.remove(lock_path)
    except OSError:
        # Best effort: nothing to do if the lock cannot be removed.
        return
Repobility · code-quality intelligence · https://repobility.com
_bg_rclone_upload function · python · L582-L594 (13 LOC)
colab_diarize/diarize_canary.py
def _bg_rclone_upload(diarized_srt_path, rclone_dest, input_dir):
    """Start a background ``rclone copyto`` that uploads one diarized SRT.

    The remote target keeps the file's directory relative to ``input_dir``.
    The process is started and not waited for; its output is discarded.
    """
    import subprocess

    local_dir, file_name = os.path.split(diarized_srt_path)
    rel_path = os.path.relpath(local_dir, input_dir)
    # Files directly inside input_dir go to the remote root itself.
    if rel_path == ".":
        dest = rclone_dest
    else:
        dest = f"{rclone_dest}/{rel_path}"

    command = ["rclone", "copyto", diarized_srt_path, f"{dest}/{file_name}", "--quiet"]
    subprocess.Popen(command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
has_diarized_transcript function · python · L599-L603 (5 LOC)
colab_diarize/diarize_canary.py
def has_diarized_transcript(wav_file):
    """Report whether a diarized transcript already exists for ``wav_file``.

    Looks for ``<wav file name><DIARIZED_SRT_SUFFIX>`` in the WAV file's own
    directory.
    """
    folder, wav_name = os.path.split(wav_file)
    candidate = os.path.join(folder, wav_name + DIARIZED_SRT_SUFFIX)
    return os.path.exists(candidate)
diarize_single_file function · python · L606-L671 (66 LOC)
colab_diarize/diarize_canary.py
def diarize_single_file(pipeline, wav_file, min_speakers=None, max_speakers=None):
    """Diarizira jednu WAV datoteku. NIKADA ne prepisuje postojeće datoteke."""
    import torch
    import gc

    wav_dir = os.path.dirname(wav_file)
    basename = os.path.basename(wav_file)
    srt_input = os.path.join(wav_dir, basename + CANARY_SRT_SUFFIX)
    diarized_output = os.path.join(wav_dir, basename + DIARIZED_SRT_SUFFIX)

    # Sigurnosna provjera
    if os.path.exists(diarized_output):
        return {"status": "skipped", "reason": "diarized SRT already exists"}

    if not os.path.exists(srt_input):
        return {"status": "skipped", "reason": "no .canary.srt found"}

    file_size_mb = os.path.getsize(wav_file) / (1024 * 1024)
    print(f"      Diarizing ({file_size_mb:.1f} MB)...")

    start_time = time.time()

    try:
        # 1. Parsiraj .canary.srt
        srt_segments = parse_srt(srt_input)
        if not srt_segments:
            return {"status": "error", "reason": "empty .c
parse_args function · python · L676-L734 (59 LOC)
colab_diarize/diarize_canary.py
def parse_args():
    parser = argparse.ArgumentParser(
        description="pyannote speaker diarization za Canary transkripte na Colab GPU",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Primjeri:
  # Google Colab (nakon mount Google Drive)
  !python diarize_canary.py --input-dir /content/drive/MyDrive/domovina_fetch_data/canary_wav

  # Samo prikaz
  !python diarize_canary.py --input-dir /content/drive/MyDrive/domovina_fetch_data/canary_wav --dry-run

  # Lokalno
  python diarize_canary.py --input-dir ./wav_files --hf-token hf_xxx
"""
    )

    parser.add_argument(
        "--input-dir", required=True,
        help="Direktorij s WAV i .canary.srt datotekama (rekurzivno)"
    )
    parser.add_argument(
        "--hf-token", default=None,
        help="HuggingFace token za pyannote model (ili koristi Colab Secrets / env HF_TOKEN)"
    )
    parser.add_argument(
        "--dry-run", action="store_true",
        help="Samo prikaz datoteka, bez diariza
main function · python · L737-L967 (231 LOC)
colab_diarize/diarize_canary.py
def main():
    args = parse_args()
    input_dir = args.input_dir

    print("╔══════════════════════════════════════════════════╗")
    print("║   PYANNOTE DIARIZACIJA — CANARY TRANSKRIPTI     ║")
    print("╚══════════════════════════════════════════════════╝")
    print(f"   Input:  {input_dir}")
    if args.workers > 1:
        print(f"   Workers: {args.workers} (paralelno, ~{args.workers * 3} GB RAM)")
    if args.min_speakers:
        print(f"   Min govornika: {args.min_speakers}")
    if args.max_speakers:
        print(f"   Max govornika: {args.max_speakers}")
    if args.rclone_dest:
        print(f"   rclone upload: {args.rclone_dest} (background, nakon svakog fajla)")
    if args.drive_mount:
        print(f"   Drive mount:  {args.drive_mount} (distributed lock via mount)")
    use_distributed_lock = bool(args.rclone_dest or args.drive_mount)
    if use_distributed_lock:
        print(f"   Distributed lock: AKTIVAN (stale timeout: {LOCK_STALE_SECONDS//3600}h)")
    if args.
sanitizeDescription function · javascript · L41-L52 (12 LOC)
convert_to_wav.js
/**
 * Turn a title into a lowercase ASCII slug made of [a-z0-9_].
 *
 * Croatian diacritics are transliterated (č/ć -> c, ž -> z, š -> s, đ -> d),
 * every other run of characters becomes a single "_", and one leading and
 * one trailing "_" are stripped. Falsy input or an empty result yields
 * "nepoznat_naslov".
 */
function sanitizeDescription(str) {
    const FALLBACK = "nepoznat_naslov";
    if (!str) return FALLBACK;

    // Input is lowercased first, so only lowercase diacritics can occur here.
    const translit = { 'č': 'c', 'ć': 'c', 'ž': 'z', 'š': 's', 'đ': 'd' };
    const slug = str
        .toLowerCase()
        .replace(/[čćžšđ]/g, (ch) => translit[ch])
        .replace(/[^a-z0-9]+/g, '_')
        .replace(/^_|_$/g, '');

    return slug || FALLBACK;
}
extractVideoId function · javascript · L54-L59 (6 LOC)
convert_to_wav.js
/**
 * Extract the 11-character YouTube video id from a URL.
 *
 * Recognises the id after "youtu.be/" or "v=". Returns null when the
 * trimmed input is empty or contains no id.
 */
function extractVideoId(url) {
    const trimmed = url.trim();
    if (trimmed === "") return null;

    const found = /(?:youtu\.be\/|v=)([a-zA-Z0-9_-]{11})/.exec(trimmed);
    return found === null ? null : found[1];
}
extractDataFromLine function · javascript · L61-L78 (18 LOC)
convert_to_wav.js
/**
 * Parse one line of a list file into { url, title, date }.
 *
 * Accepted shapes:
 *   "url"                    -> default title and date
 *   "title | url"            -> default date
 *   "date | title ... | url" -> middle fields joined with a space as title
 * Blank lines and lines starting with "#" yield null.
 */
function extractDataFromLine(line) {
    const entry = line.trim();
    if (entry === "" || entry.startsWith("#")) return null;

    if (!entry.includes("|")) {
        return { url: entry, title: "nepoznat_naslov", date: "NA" };
    }

    // A "|" is present, so there are at least two fields.
    const fields = entry.split("|");
    const url = fields[fields.length - 1].trim();

    if (fields.length === 2) {
        return { url, title: fields[0].trim(), date: "NA" };
    }

    const title = fields.slice(1, -1).join(" ").trim();
    return { url, title, date: fields[0].trim() };
}
Repobility · severity-and-effort ranking · https://repobility.com
loadState function · javascript · L80-L89 (10 LOC)
convert_to_wav.js
/**
 * Load the saved state from a JSON file.
 *
 * Returns the parsed file content. When the file does not exist, or cannot
 * be read or parsed (an error is logged in that case), a fresh
 * { completed: [], failed: [] } object is returned instead.
 */
function loadState(stateFile) {
    const emptyState = { completed: [], failed: [] };
    if (!fs.existsSync(stateFile)) return emptyState;

    try {
        const raw = fs.readFileSync(stateFile, "utf-8");
        return JSON.parse(raw);
    } catch (e) {
        console.error(`[GREŠKA] Neispravan JSON stanja: ${stateFile}`);
        return emptyState;
    }
}
findAudioFile function · javascript · L97-L108 (12 LOC)
convert_to_wav.js
/**
 * Find the downloaded MP3 for a video id inside outputDir.
 *
 * Returns the full path of the first directory entry whose name contains
 * "_yt_<videoId>" and ends with ".mp3", or null when the directory does not
 * exist or holds no such file. macOS "._" resource-fork files are ignored.
 */
function findAudioFile(outputDir, videoId) {
    if (!fs.existsSync(outputDir)) return null;

    const marker = `_yt_${videoId}`;
    for (const name of fs.readdirSync(outputDir)) {
        if (name.startsWith("._")) continue;
        if (name.includes(marker) && name.endsWith(".mp3")) {
            return path.join(outputDir, name);
        }
    }
    return null;
}
convertToWav function · javascript · L114-L137 (24 LOC)
convert_to_wav.js
/**
 * Convert an MP3 file to WAV with ffmpeg.
 *
 * The target path is inputFile with a trailing ".mp3" replaced by ".wav".
 * If that file already exists, { wavFile, skipped: true } is returned
 * directly (a plain object, not a Promise). Otherwise a Promise is returned
 * that resolves to { wavFile, skipped: false } when ffmpeg exits with code 0
 * and rejects on a non-zero exit code or a spawn error.
 */
function convertToWav(inputFile) {
    const wavFile = inputFile.replace(/\.mp3$/, ".wav");

    // Nothing to do when the WAV is already there.
    if (fs.existsSync(wavFile)) return { wavFile, skipped: true };

    // "-y" lets ffmpeg overwrite without prompting.
    const ffmpegArgs = ["-i", inputFile, ...FFMPEG_WAV_ARGS, "-y", wavFile];

    return new Promise((resolve, reject) => {
        const child = spawn("ffmpeg", ffmpegArgs, { stdio: "inherit" });
        child.on("error", reject);
        child.on("close", (code) => {
            if (code !== 0) {
                reject(new Error(`ffmpeg exit code: ${code} za ${inputFile}`));
                return;
            }
            resolve({ wavFile, skipped: false });
        });
    });
}
main function · javascript · L141-L277 (137 LOC)
convert_to_wav.js
async function main() {
    const args = process.argv.slice(2);
    const outputDirIdx = args.indexOf("--output-dir");
    const baseOutputDir = outputDirIdx !== -1 ? args[outputDirIdx + 1] : DEFAULT_OUTPUT_DIR;
    const dryRun = args.includes("--dry-run");
    const channelIdx = args.indexOf("--channel");
    const channelFilter = channelIdx !== -1 ? args[channelIdx + 1] : null;

    if (!fs.existsSync(LISTS_DIR)) {
        console.error(`❌ Nema direktorija s listama: ${LISTS_DIR}`);
        process.exit(1);
    }

    if (!fs.existsSync(baseOutputDir)) {
        console.error(`❌ Output direktorij ne postoji: ${baseOutputDir}`);
        console.error(`   Je li disk DOMOVINA1TB mountan?`);
        process.exit(1);
    }

    // Pronađi sve liste
    let listFiles = fs.readdirSync(LISTS_DIR)
        .filter(f => f.endsWith("-lista.txt"))
        .map(f => path.join(LISTS_DIR, f));

    // Filtriraj po kanalu ako je zadan --channel
    if (channelFilter) {
        listFiles = listFiles.
parse_args function · python · L33-L42 (10 LOC)
diarize.py
def parse_args():
    """Parse the command-line arguments of the hybrid diarization script.

    ``--wav``, ``--srt``, ``--output`` and ``--hf-token`` are required;
    ``--device`` defaults to ``"auto"`` and the two speaker-count options
    default to ``None``.

    Returns:
        The ``argparse.Namespace`` produced from ``sys.argv``.
    """
    parser = argparse.ArgumentParser(description="Hibridna diarizacija: pyannote + whisper.cpp SRT")

    required_options = (
        ("--wav", "Putanja do WAV audio datoteke"),
        ("--srt", "Putanja do postojećeg SRT fajla (whisper.cpp)"),
        ("--output", "Putanja za izlazni diarized SRT"),
        ("--hf-token", "HuggingFace access token za pyannote modele"),
    )
    for flag, help_text in required_options:
        parser.add_argument(flag, required=True, help=help_text)

    parser.add_argument("--device", default="auto", help="PyTorch device: auto, mps, cpu (default: auto)")

    speaker_limits = (
        ("--min-speakers", "Minimalan broj govornika"),
        ("--max-speakers", "Maksimalan broj govornika"),
    )
    for flag, help_text in speaker_limits:
        parser.add_argument(flag, type=int, default=None, help=help_text)

    return parser.parse_args()
parse_srt function · python · L47-L77 (31 LOC)
diarize.py
def parse_srt(srt_path):
    """Parse an SRT file into a list of segment dicts.

    Each dict has ``"index"`` (int), ``"start"`` and ``"end"`` (seconds, via
    ``timestamp_to_seconds``) and ``"text"`` (stripped cue text). The file is
    read as UTF-8.
    """
    with open(srt_path, "r", encoding="utf-8") as handle:
        raw = handle.read()

    # Cue layout: index line, "start --> end" line, then text running up to
    # a blank line or the next index line.
    cue_re = re.compile(
        r"(\d+)\s*\n"
        r"(\d{2}:\d{2}:\d{2}[,.]\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2}[,.]\d{3})\s*\n"
        r"((?:(?!\n\n|\n\d+\s*\n).)*)",
        re.DOTALL
    )

    segments = []
    for found in cue_re.finditer(raw):
        number, begin, finish, body = found.groups()
        # SRT uses a comma before the milliseconds; the converter wants a dot.
        segments.append({
            "index": int(number),
            "start": timestamp_to_seconds(begin.replace(",", ".")),
            "end": timestamp_to_seconds(finish.replace(",", ".")),
            "text": body.strip(),
        })

    return segments
timestamp_to_seconds function · python · L80-L88 (9 LOC)
diarize.py
def timestamp_to_seconds(ts):
    """Convert an ``HH:MM:SS[.fff]`` timestamp to seconds.

    Args:
        ts: Timestamp string. The fractional part is optional, may be
            separated by ``.`` or ``,`` and may have any number of digits.

    Returns:
        The timestamp as a float number of seconds.

    Raises:
        ValueError: If a field is not a valid integer.
        IndexError: If fewer than three ``:``-separated fields are present.
    """
    parts = ts.split(":")
    hours = int(parts[0])
    minutes = int(parts[1])

    # Accept the SRT comma as well as a dot before the fraction.
    sec_fields = parts[2].replace(",", ".").split(".")
    whole = int(sec_fields[0])
    digits = sec_fields[1].strip() if len(sec_fields) > 1 else ""

    # Scale by the digit count so "1.5" means 1.5 s, not 1 s + 5 ms.
    fraction = int(digits) / 10 ** len(digits) if digits else 0.0
    return hours * 3600 + minutes * 60 + whole + fraction
seconds_to_timestamp function · python · L91-L99 (9 LOC)
diarize.py
def seconds_to_timestamp(sec):
    """Convert seconds to an SRT timestamp (``HH:MM:SS,mmm``).

    Args:
        sec: Time in seconds; negative values are clamped to zero, since an
            SRT timestamp cannot express them.

    Returns:
        The timestamp string, rounded to the nearest millisecond.
    """
    sec = max(0.0, sec)
    # Round once to whole milliseconds and derive every field from that
    # integer, so float error cannot drop a millisecond (1.001 -> ",000") or
    # make the seconds and millisecond fields disagree.
    total_ms = int(round(sec * 1000))
    total_seconds, millis = divmod(total_ms, 1000)
    minutes_total, secs = divmod(total_seconds, 60)
    hours, minutes = divmod(minutes_total, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
Repobility (the analyzer behind this table) · https://repobility.com
run_diarization function · python · L104-L168 (65 LOC)
diarize.py
def run_diarization(wav_path, hf_token, device="auto", min_speakers=None, max_speakers=None):
    """Pokreće pyannote diarizaciju na MPS (Metal GPU) ili CPU."""
    import torch
    import soundfile as sf
    from pyannote.audio import Pipeline

    # Automatski odabir uređaja
    if device == "auto":
        if torch.backends.mps.is_available():
            device = "mps"
            print(f"   🖥️  Koristim Metal GPU (MPS)")
        elif torch.cuda.is_available():
            device = "cuda"
            print(f"   🖥️  Koristim CUDA GPU")
        else:
            device = "cpu"
            print(f"   🖥️  Koristim CPU (nema GPU akceleracije)")
    else:
        print(f"   🖥️  Koristim: {device}")

    print(f"   📥 Učitavam pyannote model...")
    pipeline = Pipeline.from_pretrained(
        "pyannote/speaker-diarization-3.1",
        token=hf_token
    )
    pipeline.to(torch.device(device))

    # Učitaj audio putem soundfile (nativno čita WAV, ne treba FFmpeg)
    print(f"   🔊 Učitav
assign_speakers function · python · L171-L192 (22 LOC)
diarize.py
def assign_speakers(srt_segments, speaker_segments):
    """Label every SRT segment with the speaker that overlaps it the most.

    Each segment dict gets a ``"speaker"`` key set in place. Segments with no
    positive overlap are labelled ``"UNKNOWN"``; on a tie the earlier entry
    of ``speaker_segments`` wins. The same ``srt_segments`` list is returned.
    """
    for cue in srt_segments:
        chosen = "UNKNOWN"
        longest = 0.0

        for turn in speaker_segments:
            # Intersection of the two time ranges; negative when disjoint.
            latest_start = max(cue["start"], turn["start"])
            earliest_end = min(cue["end"], turn["end"])
            common = earliest_end - latest_start

            if common <= longest:
                continue
            longest = common
            chosen = turn["speaker"]

        cue["speaker"] = chosen

    return srt_segments
write_diarized_srt function · python · L195-L206 (12 LOC)
diarize.py
def write_diarized_srt(segments, output_path):
    """Write ``segments`` to ``output_path`` as an SRT with a speaker label.

    Cues are renumbered from 1. Each text line is prefixed with
    ``[<speaker>]``, using ``"UNKNOWN"`` when a segment has no ``"speaker"``
    key. The file is written as UTF-8.
    """
    with open(output_path, "w", encoding="utf-8") as out:
        for number, cue in enumerate(segments, start=1):
            begin = seconds_to_timestamp(cue["start"])
            finish = seconds_to_timestamp(cue["end"])
            label = cue.get("speaker", "UNKNOWN")

            # Index line and timing line.
            out.write(f"{number}\n{begin} --> {finish}\n")
            # Labelled text line and the blank separator line.
            out.write(f"[{label}] {cue['text']}\n\n")
page 1 / 4next ›