Function bodies 195 total
format_srt_time function · python · L47-L59 (13 LOC)colab_canary/transcribe_canary.py
def format_srt_time(seconds: float) -> str:
    """Convert a duration in seconds to an SRT timestamp (``HH:MM:SS,mmm``).

    Values that are not greater than zero are clamped to 0. The value goes
    through ``datetime.timedelta`` and anything below one millisecond is
    dropped (truncated, not rounded). Hours are not capped, so inputs of 100
    hours or more produce more than two hour digits.
    """
    span = datetime.timedelta(seconds=max(0.0, seconds))
    # Whole milliseconds in the span; floor division discards leftover microseconds.
    total_millis = span // datetime.timedelta(milliseconds=1)
    whole_seconds, milliseconds = divmod(total_millis, 1000)
    whole_minutes, secs = divmod(whole_seconds, 60)
    hours, minutes = divmod(whole_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{milliseconds:03d}"
# next listing entry: generate_srt_content function · python · L62-L73 (12 LOC)colab_canary/transcribe_canary.py
def generate_srt_content(segment_timestamps: list) -> str:
    """Build SRT-formatted text from a list of segment timestamp dicts.

    Each item must provide ``'start'`` and ``'end'`` (passed to
    ``format_srt_time``) and ``'segment'`` (the cue text). Cues are numbered
    from 1 and each cue is followed by an empty line. An empty input yields
    an empty string.
    """
    lines = []
    for number, entry in enumerate(segment_timestamps, start=1):
        begins = format_srt_time(entry['start'])
        finishes = format_srt_time(entry['end'])
        # One cue = counter line, timing line, text line, blank separator.
        lines.extend((str(number), f"{begins} --> {finishes}", entry['segment'], ""))
    return "\n".join(lines)
# next listing entry: sec_to_hms function · python · L76-L79 (4 LOC)colab_canary/transcribe_canary.py
def sec_to_hms(seconds: float) -> str:
    """Format a duration in seconds as a clock string for the CSV output.

    The value is rounded to a whole second with ``round()`` and rendered via
    ``str(datetime.timedelta)``. Hours are therefore not zero-padded
    (``0:01:05``), and durations of a day or more get a ``"N day(s), "``
    prefix.
    """
    return str(datetime.timedelta(seconds=round(seconds)))
# next listing entry: format_duration function · python · L82-L87 (6 LOC)colab_canary/transcribe_canary.py
def format_duration(seconds: float) -> str:
    """Render a duration in seconds as a human-readable ``"{h}h {m}m {s}s"`` string.

    Fractions of a second are dropped; hours are not wrapped into days.
    """
    raw_parts = (seconds // 3600, (seconds % 3600) // 60, seconds % 60)
    h, m, s = (int(part) for part in raw_parts)
    return f"{h}h {m}m {s}s"
# next listing entry: parse_args function · python · L92-L138 (47 LOC)colab_canary/transcribe_canary.py
def parse_args():
parser = argparse.ArgumentParser(
description="🐤 NVIDIA Canary 1B v2 — Transkripcija na Colab/Kaggle GPU",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Primjeri:
# Google Colab (nakon mount Google Drive)
!python transcribe_canary.py --input-dir /content/drive/MyDrive/wav_files
# Kaggle
!python transcribe_canary.py --input-dir /kaggle/input/my-dataset --output-dir /kaggle/working
# Lokalno
python transcribe_canary.py --input-dir ./wav_files --dry-run
"""
)
parser.add_argument(
"--input-dir", required=True,
help="Direktorij s WAV datotekama za transkripciju"
)
parser.add_argument(
"--output-dir", default=None,
help="Direktorij za output (default: isti kao input-dir)"
)
parser.add_argument(
"--source-lang", default="hr",
help="Izvorni jezik — ISO kod (default: hr za Hrvatski)"
)
parser.add_argument(
"--target-lang", defaulfind_wav_files function · python · L141-L147 (7 LOC)colab_canary/transcribe_canary.py
def find_wav_files(input_dir: str) -> list:
    """Return the sorted paths of all ``*.wav`` files under ``input_dir`` (recursive).

    Files whose name starts with ``._`` are excluded (presumably macOS
    resource-fork sidecar files). Paths are returned as strings.
    """
    candidates = Path(input_dir).rglob("*.wav")
    visible = (str(path) for path in candidates if not path.name.startswith("._"))
    return sorted(visible)
# next listing entry: has_canary_transcript function · python · L150-L156 (7 LOC)colab_canary/transcribe_canary.py
def has_canary_transcript(wav_file: str, output_dir: str) -> bool:
    """Report whether a Canary SRT transcript already exists for ``wav_file``.

    The SRT is looked up next to the WAV file (same directory, WAV file name
    plus ``CANARY_SRT_SUFFIX``). ``output_dir`` is accepted but not used by
    this check.
    """
    folder, wav_name = os.path.split(wav_file)
    return os.path.exists(os.path.join(folder, wav_name + CANARY_SRT_SUFFIX))
install_dependencies function · python · L159-L167 (9 LOC)colab_canary/transcribe_canary.py
def install_dependencies():
    """Make sure the NeMo ASR toolkit is importable, installing it with pip if not.

    Progress and the outcome are printed; nothing is returned. pip is run as
    ``<current interpreter> -m pip`` so the package is installed into the
    environment this script is running in, and success is only reported when
    pip actually exits with status 0.
    """
    try:
        import nemo.collections.asr  # noqa: F401
    except ImportError:
        import subprocess
        import sys

        print(" 📦 Instaliram NeMo toolkit...")
        # Argument list (no shell) + explicit interpreter; check=False because
        # a failed install is reported below instead of raising.
        completed = subprocess.run(
            [sys.executable, "-m", "pip", "install", "-U", "nemo_toolkit[asr]"],
            check=False,
        )
        if completed.returncode == 0:
            print(" ✅ NeMo instaliran")
        else:
            print(f" ❌ Instalacija NeMo toolkita nije uspjela (pip exit code {completed.returncode})")
    else:
        print(" ✅ NeMo toolkit je već instaliran")
# next listing entry: load_model function · python · L170-L192 (23 LOC)colab_canary/transcribe_canary.py
def load_model():
    """Load the pretrained ``nvidia/canary-1b-v2`` ASR model through NeMo.

    Prints the selected device (with a warning when no CUDA GPU is
    available), puts the model in eval mode and, on a CUDA device that
    reports BF16 support, casts the model to ``torch.bfloat16``.

    Returns:
        A ``(model, device)`` tuple where ``device`` is ``"cuda"`` or ``"cpu"``.
        NOTE(review): the model is not explicitly moved to ``device`` here;
        placement appears to be left to ``from_pretrained`` — confirm.
    """
    import torch
    from nemo.collections.asr.models import ASRModel

    has_gpu = torch.cuda.is_available()
    device = "cuda" if has_gpu else "cpu"
    print(f" 🖥️ Uređaj: {device.upper()}")
    if not has_gpu:
        print(" ⚠️ UPOZORENJE: GPU nije dostupan! Transkripcija će biti JAKO spora.")
        print(" 💡 Na Colab/Kaggle: Runtime → Change runtime type → T4 GPU")

    print(" 📥 Učitavam nvidia/canary-1b-v2 model (ovo traje ~1-2min prvi put)...")
    model = ASRModel.from_pretrained(model_name="nvidia/canary-1b-v2")
    model.eval()

    # BF16 halves the memory footprint and is faster on GPUs that support it.
    if has_gpu and torch.cuda.is_bf16_supported():
        model = model.to(torch.bfloat16)
        print(" ⚡ BF16 optimizacija aktivna")

    print(" ✅ Model učitan")
    return model, device
# next listing entry: transcribe_single_file function · python · L195-L272 (78 LOC)colab_canary/transcribe_canary.py
def transcribe_single_file(model, wav_file: str, output_dir: str,
source_lang: str, target_lang: str) -> dict:
"""
Transkribira jednu WAV datoteku i sprema SRT + CSV.
NIKADA ne prepisuje postojeće datoteke.
"""
import torch
wav_dir = os.path.dirname(wav_file)
basename = os.path.basename(wav_file)
srt_output = os.path.join(wav_dir, basename + CANARY_SRT_SUFFIX)
csv_output = os.path.join(wav_dir, basename + CANARY_CSV_SUFFIX)
# Sigurnosna provjera
if os.path.exists(srt_output):
return {"status": "skipped", "reason": "canary SRT already exists"}
file_size_mb = os.path.getsize(wav_file) / (1024 * 1024)
print(f" ⏳ Transkribriram ({file_size_mb:.1f} MB)...")
start_time = time.time()
try:
# Pokreni transkripciju s timestampovima (inference_mode smanjuje overhead)
with torch.inference_mode():
output = model.transcribe(
[wav_file],
main function · python · L275-L388 (114 LOC)colab_canary/transcribe_canary.py
def main():
args = parse_args()
input_dir = args.input_dir
output_dir = args.output_dir or input_dir
source_lang = args.source_lang
target_lang = args.target_lang
print("╔══════════════════════════════════════════════════╗")
print("║ 🐤 CANARY 1B v2 — DIRECT GPU TRANSKRIPCIJA ║")
print("║ Google Colab / Kaggle ║")
print("╚══════════════════════════════════════════════════╝")
print(f" 📂 Input: {input_dir}")
print(f" 💾 Output: {output_dir}")
print(f" 🗣️ Izvorni jezik: {source_lang}")
print(f" 💬 Ciljni jezik: {target_lang}")
if args.dry_run:
print(" ⚠️ DRY RUN — samo prikaz, bez transkripcije")
print("")
# Provjeri direktorije
if not os.path.isdir(input_dir):
print(f"❌ Input direktorij ne postoji: {input_dir}")
sys.exit(1)
os.makedirs(output_dir, exist_ok=True)
# Pronađi WAV datoteke
if args.file:
if not os.path.isfile(args.file):
parse_srt function · python · L44-L73 (30 LOC)colab_diarize/diarize_canary.py
def parse_srt(srt_path):
    """Parse an SRT file into a list of segment dicts.

    The file is read as UTF-8. Each cue becomes
    ``{"index": int, "start": float, "end": float, "text": str}`` with times
    in seconds (via ``timestamp_to_seconds``). Both ``,`` and ``.`` are
    accepted as the millisecond separator. Cue text ends at the first blank
    line or at a line that contains only digits.
    """
    with open(srt_path, "r", encoding="utf-8") as handle:
        raw = handle.read()

    cue_pattern = re.compile(
        r"(\d+)\s*\n"
        r"(\d{2}:\d{2}:\d{2}[,.]\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2}[,.]\d{3})\s*\n"
        r"((?:(?!\n\n|\n\d+\s*\n).)*)",
        re.DOTALL
    )
    # findall yields one (index, start, end, text) tuple per cue.
    return [
        {
            "index": int(number),
            "start": timestamp_to_seconds(begin.replace(",", ".")),
            "end": timestamp_to_seconds(finish.replace(",", ".")),
            "text": body.strip(),
        }
        for number, begin, finish, body in cue_pattern.findall(raw)
    ]
# next listing entry: timestamp_to_seconds function · python · L76-L84 (9 LOC)colab_diarize/diarize_canary.py
def timestamp_to_seconds(ts):
    """Convert a ``HH:MM:SS[.fraction]`` timestamp string to seconds (float).

    The fractional part is scaled by its own digit count, so ``"01.5"`` means
    1.5 s and ``"01.500"`` means the same; the usual three-digit millisecond
    form gives exactly the value it always did. A comma is accepted as the
    fraction separator as well (SRT style), and a missing fraction counts as 0.

    Raises:
        ValueError: if a component is not a valid integer.
        IndexError: if fewer than three ``:``-separated fields are present.
    """
    parts = ts.split(":")
    hours = int(parts[0])
    minutes = int(parts[1])
    whole_text, _, fraction_text = parts[2].replace(",", ".").partition(".")
    seconds = int(whole_text)
    # Scale by the number of digits actually present instead of assuming milliseconds.
    fraction = int(fraction_text) / 10 ** len(fraction_text) if fraction_text else 0.0
    return hours * 3600 + minutes * 60 + seconds + fraction
# next listing entry: seconds_to_srt_timestamp function · python · L87-L96 (10 LOC)colab_diarize/diarize_canary.py
def seconds_to_srt_timestamp(sec):
    """Convert seconds to an SRT timestamp (``HH:MM:SS,mmm``).

    Negative input is clamped to 0. All fields are derived from a single
    ``timedelta`` (which rounds to the nearest microsecond), so the
    millisecond field can no longer lose a unit to float error (4.35 s is
    ``,350``, not ``,349``) or disagree with the seconds field near a
    whole-second boundary. Sub-millisecond remainders are truncated.
    """
    td = timedelta(seconds=max(0.0, sec))
    # Integer count of whole milliseconds; everything below comes from this one value.
    total_millis = td // timedelta(milliseconds=1)
    total_seconds, millis = divmod(total_millis, 1000)
    total_minutes, secs = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
# next listing entry: format_duration function · python · L99-L104 (6 LOC)colab_diarize/diarize_canary.py
def format_duration(seconds):
    """Format a duration given in seconds as ``"{h}h {m}m {s}s"``.

    Each component is truncated to an integer; hours keep growing past 24.
    """
    hours = int(seconds // 3600)
    minutes = int(seconds % 3600 // 60)
    secs = int(seconds % 60)
    return "{}h {}m {}s".format(hours, minutes, secs)
install_dependencies function · python · L109-L117 (9 LOC)colab_diarize/diarize_canary.py
def install_dependencies():
    """Make sure ``pyannote.audio`` is importable, installing it with pip if not.

    Progress and the outcome are printed; nothing is returned. pip is run as
    ``<current interpreter> -m pip`` so the package lands in the environment
    this script is running in, and success is only reported when pip exits
    with status 0.
    """
    try:
        import pyannote.audio  # noqa: F401
    except ImportError:
        import subprocess
        import sys

        print(" Instaliram pyannote.audio...")
        # Argument list (no shell) + explicit interpreter; check=False because
        # a failed install is reported below instead of raising.
        completed = subprocess.run(
            [sys.executable, "-m", "pip", "install", "pyannote.audio"],
            check=False,
        )
        if completed.returncode == 0:
            print(" pyannote.audio instaliran")
        else:
            print(f" Instalacija pyannote.audio nije uspjela (pip exit code {completed.returncode})")
    else:
        print(" pyannote.audio je već instaliran")
# next listing entry: get_hf_token function · python · L120-L151 (32 LOC)colab_diarize/diarize_canary.py
def get_hf_token(args_token):
"""Dohvaća HuggingFace token iz argumenata, Colab secrets, ili env varijable."""
# 1. CLI argument
if args_token:
return args_token
# 2. Colab secrets (userdata)
try:
from google.colab import userdata
token = userdata.get("HF_TOKEN")
if token:
print(" HF token učitan iz Colab Secrets")
return token
except (ImportError, Exception):
pass
# 3. Environment varijabla
token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN")
if token:
print(" HF token učitan iz environment varijable")
return token
print(" HuggingFace token nije pronađen!")
print(" Opcije:")
print(" 1. Colab Secrets: dodaj HF_TOKEN u Secrets (lijevi panel)")
print(" 2. CLI: --hf-token TVOJ_TOKEN")
print(" 3. Environment: export HF_TOKEN=TVOJ_TOKEN")
print("")
print(" Token je potreban za pyannote/speaker-diarizatioload_diarization_pipeline function · python · L154-L181 (28 LOC)colab_diarize/diarize_canary.py
def load_diarization_pipeline(hf_token):
"""Učitava pyannote community-1 diarization pipeline na GPU (~9.5 GB VRAM)."""
import torch
from pyannote.audio import Pipeline
if torch.cuda.is_available():
device = "cuda"
vram_gb = torch.cuda.get_device_properties(0).total_memory / (1024**3)
gpu_name = torch.cuda.get_device_name(0)
print(f" Uređaj: CUDA — {gpu_name} ({vram_gb:.1f} GB VRAM)")
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
device = "mps"
print(f" Uređaj: MPS (Apple Silicon)")
else:
device = "cpu"
print(" Uređaj: CPU")
print(" UPOZORENJE: GPU nije dostupan! Diarizacija će biti spora.")
print(" Na Colabu: Runtime > Change runtime type > T4 GPU")
print(" Učitavam pyannote/speaker-diarization-community-1 model...")
pipeline = Pipeline.from_pretrained(
"pyannote/speaker-diarization-community-1",
token=hf_token
)run_diarization function · python · L184-L236 (53 LOC)colab_diarize/diarize_canary.py
def run_diarization(pipeline, wav_file, min_speakers=None, max_speakers=None):
"""Pokreće pyannote community-1 diarizaciju na jednom WAV fajlu.
Koristi exclusive_speaker_diarization mode koji daje točno jednog govornika
u svakom trenutku (bez overlapa), idealno za alignment s SRT titlovima.
"""
import torch
import soundfile as sf
# Učitaj audio
data, sample_rate = sf.read(wav_file)
waveform = torch.from_numpy(data).float().unsqueeze(0)
# Parametri
diarize_params = {}
if min_speakers is not None:
diarize_params["min_speakers"] = min_speakers
if max_speakers is not None:
diarize_params["max_speakers"] = max_speakers
# Pokreni diarizaciju
audio_input = {"waveform": waveform, "sample_rate": sample_rate}
result = pipeline(audio_input, **diarize_params)
# Oslobodi waveform iz memorije (100+ MB za velike fajlove)
del waveform, data, audio_input
# community-1 (pyannote 4.x) vraća DiarizeOutput datassign_speakers function · python · L239-L256 (18 LOC)colab_diarize/diarize_canary.py
def assign_speakers(srt_segments, speaker_segments):
    """Label every SRT segment with the speaker whose turn overlaps it the most.

    Each SRT segment dict gets a ``"speaker"`` key set in place: the
    ``"speaker"`` of the turn with the largest time overlap, or ``"UNKNOWN"``
    when no turn overlaps at all. On ties the earliest turn in
    ``speaker_segments`` wins. The same (mutated) list is returned.
    """
    def _shared_time(cue, turn):
        # Length of the intersection of the two [start, end] intervals, floored at 0.
        begin = max(cue["start"], turn["start"])
        finish = min(cue["end"], turn["end"])
        return max(0, finish - begin)

    for cue in srt_segments:
        winner, longest = "UNKNOWN", 0.0
        for turn in speaker_segments:
            shared = _shared_time(cue, turn)
            if shared > longest:
                winner, longest = turn["speaker"], shared
        cue["speaker"] = winner
    return srt_segments
# next listing entry: write_diarized_srt function · python · L259-L270 (12 LOC)colab_diarize/diarize_canary.py
def write_diarized_srt(segments, output_path):
    """Write segments to ``output_path`` as an SRT file with speaker tags.

    Cues are renumbered from 1. Each text line is written as
    ``[speaker] text``; a segment without a ``"speaker"`` key is tagged
    ``UNKNOWN``. The file is written as UTF-8 and overwritten if it exists.
    """
    def _cue_lines():
        # Lazily yields the four lines of every cue, in file order.
        for number, seg in enumerate(segments, 1):
            start_ts = seconds_to_srt_timestamp(seg["start"])
            end_ts = seconds_to_srt_timestamp(seg["end"])
            speaker = seg.get("speaker", "UNKNOWN")
            yield f"{number}\n"
            yield f"{start_ts} --> {end_ts}\n"
            yield f"[{speaker}] {seg['text']}\n"
            yield "\n"

    with open(output_path, "w", encoding="utf-8") as out:
        out.writelines(_cue_lines())
# next listing entry: _worker_init function · python · L283-L328 (46 LOC)colab_diarize/diarize_canary.py
def _worker_init(hf_token, min_speakers, max_speakers, threads_per_worker=2,
rclone_dest=None, drive_mount=None, input_dir=None):
"""Inicijalizacija worker procesa — svaki učitava vlastiti pyannote pipeline.
Na CPU-only stroju, ograničava PyTorch/MKL/OMP threadove po workeru
da spriječi oversubscription (npr. 40 workera × 80 threadova = 3200 threadova na 80 CPU).
"""
global _worker_pipeline, _worker_min_speakers, _worker_max_speakers
# Suppress torchcodec/pyannote warnings u worker procesima
import warnings
warnings.filterwarnings("ignore", message="torchcodec is not installed")
warnings.filterwarnings("ignore", message="std\\(\\): degrees of freedom")
# VAŽNO: env vars moraju biti postavljene PRIJE import torch,
# jer PyTorch/OMP/MKL čitaju ih pri inicijalizaciji
os.environ["OMP_NUM_THREADS"] = str(threads_per_worker)
os.environ["MKL_NUM_THREADS"] = str(threads_per_worker)
os.environ["OPENBLAS_NUM_THREADS"] = _worker_diarize function · python · L331-L428 (98 LOC)colab_diarize/diarize_canary.py
def _worker_diarize(wav_file):
"""Worker funkcija: diarizira jedan fajl. Vraća (wav_file, result)."""
global _worker_pipeline, _worker_min_speakers, _worker_max_speakers
global _worker_rclone_dest, _worker_drive_mount, _worker_input_dir
import threading
wav_dir = os.path.dirname(wav_file)
basename = os.path.basename(wav_file)
srt_input = os.path.join(wav_dir, basename + CANARY_SRT_SUFFIX)
diarized_output = os.path.join(wav_dir, basename + DIARIZED_SRT_SUFFIX)
pid = os.getpid()
if os.path.exists(diarized_output):
return wav_file, {"status": "skipped", "reason": "already exists"}
if not os.path.exists(srt_input):
return wav_file, {"status": "skipped", "reason": "no .canary.srt"}
# Distributed lock provjera u workeru
use_lock = bool(_worker_rclone_dest or _worker_drive_mount)
if use_lock:
lock_status = None
if _worker_rclone_dest:
lock_status = _lock_exists_remote(wav_file, _worker_inputMethodology: Repobility · https://repobility.com/research/state-of-ai-code-2026/
_get_hostname function · python · L437-L440 (4 LOC)colab_diarize/diarize_canary.py
def _get_hostname():
"""Vraća hostname za lock identifikaciju."""
import socket
return socket.gethostname()_remote_path_for_wav function · python · L443-L446 (4 LOC)colab_diarize/diarize_canary.py
def _remote_path_for_wav(wav_file, input_dir, rclone_dest):
"""Izračunaj rclone remote path za dati WAV fajl."""
rel = os.path.relpath(wav_file, input_dir)
return f"{rclone_dest}/{rel}"_lock_exists_remote function · python · L449-L490 (42 LOC)colab_diarize/diarize_canary.py
def _lock_exists_remote(wav_file, input_dir, rclone_dest):
"""Provjeri postoji li .lock ili .diarized.srt na remote-u (rclone).
Vraća: 'diarized', 'locked', ili None.
"""
import subprocess
basename = os.path.basename(wav_file)
rel_dir = os.path.relpath(os.path.dirname(wav_file), input_dir)
remote_dir = f"{rclone_dest}/{rel_dir}" if rel_dir != "." else rclone_dest
# Provjeri .canary.diarized.srt
diarized_remote = f"{remote_dir}/{basename}{DIARIZED_SRT_SUFFIX}"
ret = subprocess.run(
["rclone", "ls", diarized_remote, "--max-depth", "1"],
capture_output=True, text=True, timeout=30
)
if ret.returncode == 0 and ret.stdout.strip():
return "diarized"
# Provjeri .canary.lock
lock_remote = f"{remote_dir}/{basename}{LOCK_SUFFIX}"
ret = subprocess.run(
["rclone", "ls", lock_remote, "--max-depth", "1"],
capture_output=True, text=True, timeout=30
)
if ret.returncode == 0 and ret.stdout.stri_lock_exists_mount function · python · L493-L520 (28 LOC)colab_diarize/diarize_canary.py
def _lock_exists_mount(wav_file, drive_mount):
"""Provjeri postoji li .lock ili .diarized.srt na mountanom Drive-u.
Vraća: 'diarized', 'locked', ili None.
"""
basename = os.path.basename(wav_file)
# Pronađi odgovarajući direktorij na mountu — koristi isti relativni path
# wav_file path sadrži kanal/filename, trebamo mapirati to na drive_mount
# drive_mount je root dir na Drive-u, wav_file struktura je input_dir/kanal/file.wav
# Za mount mode: wav_file JE na Drive-u, pa lock i diarized su u istom direktoriju
wav_dir = os.path.dirname(wav_file)
diarized_path = os.path.join(wav_dir, basename + DIARIZED_SRT_SUFFIX)
if os.path.exists(diarized_path):
return "diarized"
lock_path = os.path.join(wav_dir, basename + LOCK_SUFFIX)
if os.path.exists(lock_path):
try:
with open(lock_path, "r") as f:
lock_ts = float(f.readline().strip())
if time.time() - lock_ts > LOCK_STALE_SECONDS:
_create_lock_remote function · python · L523-L544 (22 LOC)colab_diarize/diarize_canary.py
def _create_lock_remote(wav_file, input_dir, rclone_dest):
    """Create the ``.lock`` file for ``wav_file`` on the rclone remote.

    A local temporary file holding the current timestamp and this host's
    name (one per line) is uploaded with ``rclone copyto`` to the remote
    directory that mirrors the WAV's location relative to ``input_dir``.
    The temporary file is always deleted afterwards. rclone's exit status is
    not inspected.
    """
    import subprocess
    import tempfile

    wav_dir, wav_name = os.path.split(wav_file)
    rel_dir = os.path.relpath(wav_dir, input_dir)
    remote_dir = rclone_dest if rel_dir == "." else f"{rclone_dest}/{rel_dir}"
    lock_remote = f"{remote_dir}/{wav_name}{LOCK_SUFFIX}"

    # Lock payload: timestamp on the first line, hostname on the second.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".lock", delete=False) as staged:
        staged.write(f"{time.time()}\n{_get_hostname()}\n")
        tmp_path = staged.name

    upload_cmd = ["rclone", "copyto", tmp_path, lock_remote, "--quiet"]
    try:
        subprocess.run(upload_cmd, capture_output=True, timeout=30)
    finally:
        os.unlink(tmp_path)
# next listing entry: _create_lock_mount function · python · L547-L552 (6 LOC)colab_diarize/diarize_canary.py
def _create_lock_mount(wav_file):
    """Create the ``.lock`` file next to ``wav_file`` on the mounted Drive.

    The lock holds the current timestamp on the first line and this host's
    name on the second; an existing lock file is overwritten.
    """
    folder, wav_name = os.path.split(wav_file)
    lock_path = os.path.join(folder, wav_name + LOCK_SUFFIX)
    payload = f"{time.time()}\n{_get_hostname()}\n"
    with open(lock_path, "w") as lock_file:
        lock_file.write(payload)
# next listing entry: _remove_lock_remote function · python · L555-L567 (13 LOC)colab_diarize/diarize_canary.py
def _remove_lock_remote(wav_file, input_dir, rclone_dest):
    """Delete the ``.lock`` file for ``wav_file`` from the rclone remote.

    The remote location mirrors the WAV's directory relative to
    ``input_dir``. rclone's exit status is not inspected, so a missing lock
    is not reported.
    """
    import subprocess

    wav_dir, wav_name = os.path.split(wav_file)
    rel_dir = os.path.relpath(wav_dir, input_dir)
    remote_dir = rclone_dest if rel_dir == "." else f"{rclone_dest}/{rel_dir}"
    lock_remote = f"{remote_dir}/{wav_name}{LOCK_SUFFIX}"
    delete_cmd = ["rclone", "deletefile", lock_remote, "--quiet"]
    subprocess.run(delete_cmd, capture_output=True, timeout=30)
# next listing entry: _remove_lock_mount function · python · L570-L577 (8 LOC)colab_diarize/diarize_canary.py
def _remove_lock_mount(wav_file):
    """Delete the ``.lock`` file next to ``wav_file`` on the mounted Drive.

    Best effort: any ``OSError`` (for example the lock no longer existing) is
    ignored.
    """
    folder, wav_name = os.path.split(wav_file)
    lock_path = os.path.join(folder, wav_name + LOCK_SUFFIX)
    try:
        os.unlink(lock_path)
    except OSError:
        # Lock already gone or not removable; nothing more to do here.
        pass
_bg_rclone_upload function · python · L582-L594 (13 LOC)colab_diarize/diarize_canary.py
def _bg_rclone_upload(diarized_srt_path, rclone_dest, input_dir):
    """Start a background ``rclone copyto`` that uploads one diarized SRT file.

    The remote target mirrors the file's directory relative to ``input_dir``
    under ``rclone_dest``. The process is started with its output discarded
    and is not waited for; nothing is returned.
    """
    import subprocess

    srt_dir, srt_name = os.path.split(diarized_srt_path)
    # Directory of the SRT relative to input_dir decides the remote sub-folder.
    rel_path = os.path.relpath(srt_dir, input_dir)
    dest = rclone_dest if rel_path == "." else f"{rclone_dest}/{rel_path}"
    upload_cmd = ["rclone", "copyto", diarized_srt_path, f"{dest}/{srt_name}", "--quiet"]
    subprocess.Popen(upload_cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
# next listing entry: has_diarized_transcript function · python · L599-L603 (5 LOC)colab_diarize/diarize_canary.py
def has_diarized_transcript(wav_file):
    """Report whether a diarized transcript already exists next to ``wav_file``.

    The expected file is the WAV file name plus ``DIARIZED_SRT_SUFFIX`` in the
    WAV's own directory.
    """
    folder, wav_name = os.path.split(wav_file)
    diarized_path = os.path.join(folder, wav_name + DIARIZED_SRT_SUFFIX)
    return os.path.exists(diarized_path)
# next listing entry: diarize_single_file function · python · L606-L671 (66 LOC)colab_diarize/diarize_canary.py
def diarize_single_file(pipeline, wav_file, min_speakers=None, max_speakers=None):
"""Diarizira jednu WAV datoteku. NIKADA ne prepisuje postojeće datoteke."""
import torch
import gc
wav_dir = os.path.dirname(wav_file)
basename = os.path.basename(wav_file)
srt_input = os.path.join(wav_dir, basename + CANARY_SRT_SUFFIX)
diarized_output = os.path.join(wav_dir, basename + DIARIZED_SRT_SUFFIX)
# Sigurnosna provjera
if os.path.exists(diarized_output):
return {"status": "skipped", "reason": "diarized SRT already exists"}
if not os.path.exists(srt_input):
return {"status": "skipped", "reason": "no .canary.srt found"}
file_size_mb = os.path.getsize(wav_file) / (1024 * 1024)
print(f" Diarizing ({file_size_mb:.1f} MB)...")
start_time = time.time()
try:
# 1. Parsiraj .canary.srt
srt_segments = parse_srt(srt_input)
if not srt_segments:
return {"status": "error", "reason": "empty .cparse_args function · python · L676-L734 (59 LOC)colab_diarize/diarize_canary.py
def parse_args():
parser = argparse.ArgumentParser(
description="pyannote speaker diarization za Canary transkripte na Colab GPU",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Primjeri:
# Google Colab (nakon mount Google Drive)
!python diarize_canary.py --input-dir /content/drive/MyDrive/domovina_fetch_data/canary_wav
# Samo prikaz
!python diarize_canary.py --input-dir /content/drive/MyDrive/domovina_fetch_data/canary_wav --dry-run
# Lokalno
python diarize_canary.py --input-dir ./wav_files --hf-token hf_xxx
"""
)
parser.add_argument(
"--input-dir", required=True,
help="Direktorij s WAV i .canary.srt datotekama (rekurzivno)"
)
parser.add_argument(
"--hf-token", default=None,
help="HuggingFace token za pyannote model (ili koristi Colab Secrets / env HF_TOKEN)"
)
parser.add_argument(
"--dry-run", action="store_true",
help="Samo prikaz datoteka, bez diarizamain function · python · L737-L967 (231 LOC)colab_diarize/diarize_canary.py
def main():
args = parse_args()
input_dir = args.input_dir
print("╔══════════════════════════════════════════════════╗")
print("║ PYANNOTE DIARIZACIJA — CANARY TRANSKRIPTI ║")
print("╚══════════════════════════════════════════════════╝")
print(f" Input: {input_dir}")
if args.workers > 1:
print(f" Workers: {args.workers} (paralelno, ~{args.workers * 3} GB RAM)")
if args.min_speakers:
print(f" Min govornika: {args.min_speakers}")
if args.max_speakers:
print(f" Max govornika: {args.max_speakers}")
if args.rclone_dest:
print(f" rclone upload: {args.rclone_dest} (background, nakon svakog fajla)")
if args.drive_mount:
print(f" Drive mount: {args.drive_mount} (distributed lock via mount)")
use_distributed_lock = bool(args.rclone_dest or args.drive_mount)
if use_distributed_lock:
print(f" Distributed lock: AKTIVAN (stale timeout: {LOCK_STALE_SECONDS//3600}h)")
if args.sanitizeDescription function · javascript · L41-L52 (12 LOC)convert_to_wav.js
/**
 * Turn a free-form title into a lowercase ASCII slug.
 *
 * Croatian diacritics (č ć ž š đ) are mapped to their base letters, every run
 * of other non-alphanumeric characters becomes a single "_", and one leading
 * or trailing "_" is stripped.
 *
 * @param {*} str Title text; truthy non-string values are converted with String().
 * @returns {string} The slug, or "nepoznat_naslov" when the input is falsy
 *   or nothing usable is left.
 */
function sanitizeDescription(str) {
  if (!str) return "nepoznat_naslov";
  // The text is lowercased first, so only lowercase diacritics can reach the map.
  const diacritics = { 'č': 'c', 'ć': 'c', 'ž': 'z', 'š': 's', 'đ': 'd' };
  const slug = String(str)
    .toLowerCase()
    .replace(/[čćžšđ]/g, (char) => diacritics[char])
    .replace(/[^a-z0-9]+/g, '_')
    .replace(/^_|_$/g, '');
  return slug || "nepoznat_naslov";
}
// next listing entry: extractVideoId function · javascript · L54-L59 (6 LOC)convert_to_wav.js
/**
 * Extract the 11-character YouTube video id from a URL.
 *
 * Recognises `youtu.be/<id>` and `v=<id>` forms.
 *
 * @param {string} url URL text (surrounding whitespace is ignored).
 * @returns {string|null} The video id, or null for blank or unrecognised input.
 */
function extractVideoId(url) {
  const trimmed = url.trim();
  if (trimmed === "") return null;
  const found = /(?:youtu\.be\/|v=)([a-zA-Z0-9_-]{11})/.exec(trimmed);
  return found === null ? null : found[1];
}
// next listing entry: extractDataFromLine function · javascript · L61-L78 (18 LOC)convert_to_wav.js
/**
 * Parse one line of a list file into `{ url, title, date }`.
 *
 * Accepted shapes (fields separated by "|", the URL always last):
 *   - `url`                      -> default title and date
 *   - `title | url`              -> default date
 *   - `date | title ... | url`   -> middle fields are joined with a space
 *
 * @param {string} line Raw line from the list file.
 * @returns {{url: string, title: string, date: string}|null} Parsed entry,
 *   or null for blank lines and lines starting with "#".
 */
function extractDataFromLine(line) {
  const entry = line.trim();
  if (entry === "" || entry.startsWith("#")) return null;

  if (!entry.includes("|")) {
    return { url: entry, title: "nepoznat_naslov", date: "NA" };
  }

  const fields = entry.split("|");
  const url = fields[fields.length - 1].trim();
  if (fields.length === 2) {
    return { url, title: fields[0].trim(), date: "NA" };
  }
  // Three or more fields: first is the date, everything before the URL is the title.
  const title = fields.slice(1, -1).join(" ").trim();
  return { url, title, date: fields[0].trim() };
}
loadState function · javascript · L80-L89 (10 LOC)convert_to_wav.js
/**
 * Load the saved progress state from a JSON file.
 *
 * @param {string} stateFile Path to the state file.
 * @returns {object} The parsed JSON, or a fresh `{ completed: [], failed: [] }`
 *   when the file does not exist or cannot be read/parsed (the failure is
 *   logged to stderr in that case).
 */
function loadState(stateFile) {
  const emptyState = { completed: [], failed: [] };
  if (!fs.existsSync(stateFile)) return emptyState;
  try {
    const raw = fs.readFileSync(stateFile, "utf-8");
    return JSON.parse(raw);
  } catch (e) {
    console.error(`[GREŠKA] Neispravan JSON stanja: ${stateFile}`);
    return emptyState;
  }
}
// next listing entry: findAudioFile function · javascript · L97-L108 (12 LOC)convert_to_wav.js
/**
 * Find the downloaded MP3 for a video inside a directory.
 *
 * A file matches when its name contains `_yt_<videoId>`, ends with ".mp3"
 * and does not start with "._" (macOS resource-fork files).
 *
 * @param {string} outputDir Directory to search (not recursive).
 * @param {string} videoId YouTube video id.
 * @returns {string|null} Full path of the first match, or null when the
 *   directory does not exist or nothing matches.
 */
function findAudioFile(outputDir, videoId) {
  if (!fs.existsSync(outputDir)) return null;
  const marker = `_yt_${videoId}`;
  for (const name of fs.readdirSync(outputDir)) {
    if (name.startsWith("._")) continue;
    if (name.includes(marker) && name.endsWith(".mp3")) {
      return path.join(outputDir, name);
    }
  }
  return null;
}
// next listing entry: convertToWav function · javascript · L114-L137 (24 LOC)convert_to_wav.js
/**
 * Convert an MP3 file to WAV with ffmpeg, unless the WAV already exists.
 *
 * The WAV path is the input path with a trailing ".mp3" replaced by ".wav".
 *
 * @param {string} inputFile Path to the source MP3.
 * @returns {{wavFile: string, skipped: true}|Promise<{wavFile: string, skipped: false}>}
 *   A plain object (not a Promise) when the WAV already exists; otherwise a
 *   Promise that resolves once ffmpeg exits with code 0 and rejects on a
 *   non-zero exit code or a spawn error.
 */
function convertToWav(inputFile) {
  const wavFile = inputFile.replace(/\.mp3$/, ".wav");

  // Nothing to do when the WAV is already there.
  if (fs.existsSync(wavFile)) return { wavFile, skipped: true };

  // "-y" lets ffmpeg overwrite without prompting.
  const ffmpegArgs = ["-i", inputFile, ...FFMPEG_WAV_ARGS, "-y", wavFile];

  return new Promise((resolve, reject) => {
    const child = spawn("ffmpeg", ffmpegArgs, { stdio: "inherit" });
    child.on("error", reject);
    child.on("close", (code) => {
      if (code !== 0) {
        reject(new Error(`ffmpeg exit code: ${code} za ${inputFile}`));
        return;
      }
      resolve({ wavFile, skipped: false });
    });
  });
}
// next listing entry: main function · javascript · L141-L277 (137 LOC)convert_to_wav.js
async function main() {
const args = process.argv.slice(2);
const outputDirIdx = args.indexOf("--output-dir");
const baseOutputDir = outputDirIdx !== -1 ? args[outputDirIdx + 1] : DEFAULT_OUTPUT_DIR;
const dryRun = args.includes("--dry-run");
const channelIdx = args.indexOf("--channel");
const channelFilter = channelIdx !== -1 ? args[channelIdx + 1] : null;
if (!fs.existsSync(LISTS_DIR)) {
console.error(`❌ Nema direktorija s listama: ${LISTS_DIR}`);
process.exit(1);
}
if (!fs.existsSync(baseOutputDir)) {
console.error(`❌ Output direktorij ne postoji: ${baseOutputDir}`);
console.error(` Je li disk DOMOVINA1TB mountan?`);
process.exit(1);
}
// Pronađi sve liste
let listFiles = fs.readdirSync(LISTS_DIR)
.filter(f => f.endsWith("-lista.txt"))
.map(f => path.join(LISTS_DIR, f));
// Filtriraj po kanalu ako je zadan --channel
if (channelFilter) {
listFiles = listFiles.parse_args function · python · L33-L42 (10 LOC)diarize.py
def parse_args():
    """Parse the command-line arguments of the hybrid diarization script.

    Required: ``--wav``, ``--srt``, ``--output``, ``--hf-token``.
    Optional: ``--device`` (default ``"auto"``) and the integer options
    ``--min-speakers`` / ``--max-speakers`` (default ``None``).

    Returns:
        The ``argparse.Namespace`` built from ``sys.argv``.
    """
    parser = argparse.ArgumentParser(description="Hibridna diarizacija: pyannote + whisper.cpp SRT")

    required_options = (
        ("--wav", "Putanja do WAV audio datoteke"),
        ("--srt", "Putanja do postojećeg SRT fajla (whisper.cpp)"),
        ("--output", "Putanja za izlazni diarized SRT"),
        ("--hf-token", "HuggingFace access token za pyannote modele"),
    )
    for flag, help_text in required_options:
        parser.add_argument(flag, required=True, help=help_text)

    parser.add_argument("--device", default="auto", help="PyTorch device: auto, mps, cpu (default: auto)")

    speaker_bounds = (
        ("--min-speakers", "Minimalan broj govornika"),
        ("--max-speakers", "Maksimalan broj govornika"),
    )
    for flag, help_text in speaker_bounds:
        parser.add_argument(flag, type=int, default=None, help=help_text)

    return parser.parse_args()
# next listing entry: parse_srt function · python · L47-L77 (31 LOC)diarize.py
def parse_srt(srt_path):
    """Read an SRT file (UTF-8) and return its cues as a list of dicts.

    Every cue is ``{"index": int, "start": float, "end": float, "text": str}``
    with times in seconds. Expected layout per cue:
    ``index``, ``timestamp --> timestamp``, text, blank line. Either ``,`` or
    ``.`` may separate the milliseconds. Text stops at the first blank line or
    at a line made up of digits only.
    """
    with open(srt_path, "r", encoding="utf-8") as srt_file:
        content = srt_file.read()

    cue_re = re.compile(
        r"(\d+)\s*\n"
        r"(\d{2}:\d{2}:\d{2}[,.]\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2}[,.]\d{3})\s*\n"
        r"((?:(?!\n\n|\n\d+\s*\n).)*)",
        re.DOTALL
    )

    def _to_segment(match):
        # Normalise the SRT comma to a dot before converting to seconds.
        number, begin, finish, body = match.groups()
        return {
            "index": int(number),
            "start": timestamp_to_seconds(begin.replace(",", ".")),
            "end": timestamp_to_seconds(finish.replace(",", ".")),
            "text": body.strip(),
        }

    return [_to_segment(match) for match in cue_re.finditer(content)]
# next listing entry: timestamp_to_seconds function · python · L80-L88 (9 LOC)diarize.py
def timestamp_to_seconds(ts):
    """Convert a ``HH:MM:SS[.fraction]`` timestamp string to seconds (float).

    The fraction is scaled by the number of digits it has, so ``"01.5"`` is
    1.5 s rather than 1.005 s; the standard three-digit millisecond form
    yields exactly the same value as before. A comma fraction separator (SRT
    style) is accepted too, and a missing fraction counts as 0.

    Raises:
        ValueError: if a component is not a valid integer.
        IndexError: if fewer than three ``:``-separated fields are present.
    """
    fields = ts.split(":")
    h = int(fields[0])
    m = int(fields[1])
    seconds_text, _, fraction_text = fields[2].replace(",", ".").partition(".")
    s = int(seconds_text)
    # Divide by 10^digits so the fraction is read as a decimal, not as a millisecond count.
    fraction = int(fraction_text) / 10 ** len(fraction_text) if fraction_text else 0.0
    return h * 3600 + m * 60 + s + fraction
# next listing entry: seconds_to_timestamp function · python · L91-L99 (9 LOC)diarize.py
return h * 3600 + m * 60 + s + ms / 1000.0seconds_to_timestamp function · python · L91-L99 (9 LOC)diarize.py
def seconds_to_timestamp(sec):
    """Convert seconds to an SRT timestamp (``HH:MM:SS,mmm``).

    Negative input is clamped to 0 instead of producing fields such as
    ``-1:59:59,-500``. Every field is derived from one ``timedelta`` (rounded
    to the nearest microsecond), so the millisecond field is not off by one
    due to float error (4.35 s is ``,350``) and cannot disagree with the
    seconds field. Sub-millisecond remainders are truncated.
    """
    td = timedelta(seconds=max(0.0, sec))
    # Integer count of whole milliseconds; all fields below split this single value.
    total_millis = td // timedelta(milliseconds=1)
    total_seconds, millis = divmod(total_millis, 1000)
    total_minutes, secs = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
run_diarization function · python · L104-L168 (65 LOC)diarize.py
def run_diarization(wav_path, hf_token, device="auto", min_speakers=None, max_speakers=None):
"""Pokreće pyannote diarizaciju na MPS (Metal GPU) ili CPU."""
import torch
import soundfile as sf
from pyannote.audio import Pipeline
# Automatski odabir uređaja
if device == "auto":
if torch.backends.mps.is_available():
device = "mps"
print(f" 🖥️ Koristim Metal GPU (MPS)")
elif torch.cuda.is_available():
device = "cuda"
print(f" 🖥️ Koristim CUDA GPU")
else:
device = "cpu"
print(f" 🖥️ Koristim CPU (nema GPU akceleracije)")
else:
print(f" 🖥️ Koristim: {device}")
print(f" 📥 Učitavam pyannote model...")
pipeline = Pipeline.from_pretrained(
"pyannote/speaker-diarization-3.1",
token=hf_token
)
pipeline.to(torch.device(device))
# Učitaj audio putem soundfile (nativno čita WAV, ne treba FFmpeg)
print(f" 🔊 Učitavassign_speakers function · python · L171-L192 (22 LOC)diarize.py
def assign_speakers(srt_segments, speaker_segments):
    """Attach a ``"speaker"`` to each SRT segment based on time overlap.

    For every SRT segment the speaker turn with the largest overlap of the
    ``[start, end]`` ranges is chosen; with no overlapping turn the speaker is
    ``"UNKNOWN"``. When several turns tie, the first one in
    ``speaker_segments`` is kept. Segments are modified in place and the same
    list is returned.
    """
    for segment in srt_segments:
        chosen = "UNKNOWN"
        chosen_overlap = 0.0
        seg_start, seg_end = segment["start"], segment["end"]
        for turn in speaker_segments:
            # Intersection length of the two time ranges, never below zero.
            shared = max(0, min(seg_end, turn["end"]) - max(seg_start, turn["start"]))
            if shared <= chosen_overlap:
                continue
            chosen_overlap = shared
            chosen = turn["speaker"]
        segment["speaker"] = chosen
    return srt_segments
# next listing entry: write_diarized_srt function · python · L195-L206 (12 LOC)diarize.py
def write_diarized_srt(segments, output_path):
    """Write segments to ``output_path`` as a speaker-tagged SRT file (UTF-8).

    Cues are numbered from 1 regardless of any stored index. The text line of
    each cue is ``[speaker] text``; segments lacking a ``"speaker"`` key are
    tagged ``UNKNOWN``. An existing file is overwritten.
    """
    def _cue_lines():
        # Generator: produces the lines of each cue one at a time, in order.
        for position, seg in enumerate(segments, 1):
            start_ts = seconds_to_timestamp(seg["start"])
            end_ts = seconds_to_timestamp(seg["end"])
            speaker = seg.get("speaker", "UNKNOWN")
            yield f"{position}\n"
            yield f"{start_ts} --> {end_ts}\n"
            yield f"[{speaker}] {seg['text']}\n"
            yield "\n"

    with open(output_path, "w", encoding="utf-8") as srt_file:
        srt_file.writelines(_cue_lines())