Function bodies 195 total
findInfoJson function · javascript · L196-L205 (10 LOC)generate_whisper_prompt.js
/**
 * Find the `.info.json` file that belongs to a video inside one directory.
 *
 * @param {string} outputDir - Directory to scan (top level only).
 * @param {string} videoId - Video ID; a file matches when its name contains `_yt_<videoId>`.
 * @returns {string|null} `outputDir` joined with the first matching file name, or null
 *   when the directory does not exist or no file matches.
 */
function findInfoJson(outputDir, videoId) {
  if (!fs.existsSync(outputDir)) {
    return null;
  }

  const idMarker = `_yt_${videoId}`;
  for (const name of fs.readdirSync(outputDir)) {
    // Names starting with "._" are never accepted (presumably macOS resource-fork files).
    if (name.startsWith("._")) continue;
    if (name.includes(idMarker) && name.endsWith(".info.json")) {
      return path.join(outputDir, name);
    }
  }
  return null;
}
// main function · javascript · L209-L372 (164 LOC)generate_whisper_prompt.js
async function main() {
const args = process.argv.slice(2);
const outputDirIdx = args.indexOf("--output-dir");
const baseOutputDir = outputDirIdx !== -1 ? args[outputDirIdx + 1] : DEFAULT_OUTPUT_DIR;
const dryRun = args.includes("--dry-run");
const channelIdx = args.indexOf("--channel");
const channelFilter = channelIdx !== -1 ? args[channelIdx + 1] : null;
if (!fs.existsSync(LISTS_DIR)) {
console.error(`❌ Nema direktorija s listama: ${LISTS_DIR}`);
process.exit(1);
}
if (!fs.existsSync(baseOutputDir)) {
console.error(`❌ Output direktorij ne postoji: ${baseOutputDir}`);
console.error(` Je li disk DOMOVINA1TB mountan?`);
process.exit(1);
}
// Testiraj LM Studio konekciju
try {
console.log("🔌 Testiram LM Studio konekciju...");
await callLLM("Test", "Test konekcije");
console.log("✅ LM Studio je dostupan!\n");
} catch (err) {
console.error(`❌ ${err.message}`);
parseArgs function · javascript · L53-L66 (14 LOC)inspect_pipeline.js
/**
 * Read the command-line options of the pipeline inspection script from `process.argv`.
 *
 * @returns {{inputDir: string, channel: (string|null), verbose: boolean, fixSuggestions: boolean}}
 *   `inputDir` falls back to DEFAULT_OUTPUT_DIR when `--input-dir` is absent or has an empty value.
 */
function parseArgs() {
  const cliArgs = process.argv.slice(2);

  // Value that follows a flag, or null when the flag is absent or is the last argument.
  const valueAfter = (flag) => {
    const position = cliArgs.indexOf(flag);
    if (position === -1 || position + 1 >= cliArgs.length) {
      return null;
    }
    return cliArgs[position + 1];
  };

  const inputDir = valueAfter("--input-dir") || DEFAULT_OUTPUT_DIR;
  const channel = valueAfter("--channel");
  const verbose = cliArgs.includes("--verbose");
  const fixSuggestions = cliArgs.includes("--fix-suggestions");

  return { inputDir, channel, verbose, fixSuggestions };
}
// getArg function · javascript · L55-L58 (4 LOC)inspect_pipeline.js
/**
 * Return the value that follows a command-line flag.
 * Reads `args` from the enclosing scope.
 *
 * @param {string} name - Flag to look for, e.g. "--channel".
 * @returns {string|null} The element right after the first occurrence of the flag,
 *   or null when the flag is absent or is the last element.
 */
function getArg(name) {
  const position = args.indexOf(name);
  if (position === -1) return null;
  if (position + 1 >= args.length) return null;
  return args[position + 1];
}
// inspectChannel function · javascript · L70-L307 (238 LOC)inspect_pipeline.js
function inspectChannel(channelDir, channelName, verbose) {
const files = fs.readdirSync(channelDir);
const anomalies = [];
// Grupiraj datoteke po base imenu (prije .wav.canary...)
const bases = new Set();
for (const f of files) {
if (f.startsWith("._")) continue;
if (f.endsWith(".mp3")) {
bases.add(f.replace(/\.mp3$/, ""));
}
}
for (const base of [...bases].sort()) {
const videoAnomalies = [];
const fileMap = {};
// Provjeri sve korake pipeline-a
const checks = [
{ key: "mp3", file: `${base}.mp3` },
{ key: "wav", file: `${base}.wav` },
{ key: "whisperPrompt", file: `${base}_whisper_prompt.txt` },
{ key: "whisperSrt", file: `${base}.wav.srt` },
{ key: "canarySrt", file: `${base}.wav.canary.srt` },
{ key: "canaryDiarized", file: `${base}.wav.canary.diarized.srt` },
{ key: "summary", file: `${base}.wav.canamain function · javascript · L311-L467 (157 LOC)inspect_pipeline.js
function main() {
const { inputDir, channel, verbose, fixSuggestions } = parseArgs();
if (!fs.existsSync(inputDir)) {
console.error(`❌ Direktorij ne postoji: ${inputDir}`);
process.exit(1);
}
console.log("");
console.log("╔══════════════════════════════════════════════════╗");
console.log("║ 🔍 PIPELINE INSPEKCIJA — DETEKCIJA ANOMALIJA ║");
console.log("╚══════════════════════════════════════════════════╝");
console.log(` 📂 Input: ${inputDir}`);
if (channel) console.log(` 🎯 Kanal: ${channel}`);
console.log("");
const entries = fs.readdirSync(inputDir, { withFileTypes: true });
const allAnomalies = [];
let totalVideos = 0;
// Brojači po tipu anomalije
const severityCounts = { error: 0, warn: 0, info: 0 };
const typeCounts = {};
for (const entry of entries) {
if (!entry.isDirectory()) continue;
if (entry.name.startsWith(".")) continue;
if (channel && entry.name !== channparseSrt function · javascript · L117-L152 (36 LOC)prepare_rag_combined.js
function parseSrt(srtContent) {
const segments = [];
const blocks = srtContent.split(/\n\n+/);
for (const block of blocks) {
const lines = block.trim().split("\n");
if (lines.length < 3) continue;
const index = parseInt(lines[0], 10);
if (isNaN(index)) continue;
const timeMatch = lines[1].match(
/(\d{2}:\d{2}:\d{2})[,.](\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2})[,.](\d{3})/
);
if (!timeMatch) continue;
const startTime = timeMatch[1];
const endTime = timeMatch[3];
const startSec = timestampToSeconds(timeMatch[1], timeMatch[2]);
const endSec = timestampToSeconds(timeMatch[3], timeMatch[4]);
const textLines = lines.slice(2).join(" ");
const speakerMatch = textLines.match(/^\[(\w+)\]\s*/);
const speaker = speakerMatch ? speakerMatch[1] : "UNKNOWN";
const text = speakerMatch
? textLines.replace(/^\[\w+\]\s*/, "").trim()
: textLPowered by Repobility — scan your code at https://repobility.com
timestampToSeconds function · javascript · L154-L157 (4 LOC)prepare_rag_combined.js
/**
 * Convert a timestamp given as a clock part plus a millisecond part into seconds.
 *
 * @param {string} hms - Clock part in "HH:MM:SS" form.
 * @param {string} ms - Millisecond digits, e.g. "280".
 * @returns {number} Seconds, including the fractional millisecond part.
 */
function timestampToSeconds(hms, ms) {
  const fields = hms.split(":").map(Number);
  const hours = fields[0];
  const minutes = fields[1];
  const seconds = fields[2];
  const wholeSeconds = hours * 3600 + minutes * 60 + seconds;
  return wholeSeconds + parseInt(ms, 10) / 1000;
}
// timeToSeconds function · javascript · L159-L162 (4 LOC)prepare_rag_combined.js
/**
 * Convert a colon-separated clock string into whole seconds.
 *
 * "HH:MM:SS" behaves as before. "MM:SS" and bare "SS" are now also accepted;
 * previously any input with fewer than three fields produced NaN, which then
 * corrupted sorting and range comparisons on the resulting seconds.
 *
 * @param {string} hhmmss - Clock string such as "01:02:03", "12:34" or "45".
 * @returns {number} Whole seconds, or NaN when a field does not start with digits.
 */
function timeToSeconds(hhmmss) {
  // Explicit radix 10 so fields are always read as decimal.
  const fields = hhmmss.split(":").map((field) => parseInt(field, 10));

  if (fields.length >= 3) {
    // Only the first three fields are used, exactly as in the original.
    return fields[0] * 3600 + fields[1] * 60 + fields[2];
  }
  if (fields.length === 2) {
    return fields[0] * 60 + fields[1];
  }
  return fields[0];
}
// secondsToTime function · javascript · L164-L169 (6 LOC)prepare_rag_combined.js
/**
 * Format a number of seconds as "HH:MM:SS".
 * Each field is floored and left-padded with "0" to at least two characters.
 *
 * @param {number} sec - Seconds to format; any fractional part is dropped.
 * @returns {string} Clock string, e.g. 3723 -> "01:02:03".
 */
function secondsToTime(sec) {
  const twoDigits = (value) => Math.floor(value).toString().padStart(2, "0");
  const fields = [sec / 3600, (sec % 3600) / 60, sec % 60];
  return fields.map((value) => twoDigits(value)).join(":");
}
// loadSummary function · javascript · L178-L210 (33 LOC)prepare_rag_combined.js
function loadSummary(srtFilePath) {
const dir = path.dirname(srtFilePath);
const base = path.basename(srtFilePath).replace(/\.canary\.diarized\.srt$/, "");
const summaryPath = path.join(dir, base + SUMMARY_JSON_SUFFIX);
if (!fs.existsSync(summaryPath)) return null;
try {
const data = JSON.parse(fs.readFileSync(summaryPath, "utf-8"));
const speakerMap = {};
if (data.summary?.speakers) {
for (const sp of data.summary.speakers) {
if (sp.id && sp.suggested_name) {
speakerMap[sp.id] = sp.suggested_name;
}
}
}
return {
speakerMap,
topics: data.summary?.key_topics || [],
title: data.summary?.title_hr || data.source?.title || "",
channel: data.source?.channel || "",
youtubeId: data.source?.youtube_id || "",
uploadDate: data.source?.upload_date || "",
durationSeconds: data.sextractVideoIdFromFilename function · javascript · L214-L217 (4 LOC)prepare_rag_combined.js
/**
 * Pull the 11-character video ID that follows `_yt_` out of a file name.
 *
 * @param {string} filename - File name to inspect.
 * @returns {string|null} The ID (letters, digits, "_" or "-"), or null when no `_yt_<id>` part is present.
 */
function extractVideoIdFromFilename(filename) {
  const found = filename.match(/_yt_([a-zA-Z0-9_-]{11})/);
  if (found === null) {
    return null;
  }
  return found[1];
}
// extractDateFromFilename function · javascript · L219-L222 (4 LOC)prepare_rag_combined.js
/**
 * Read a leading `YYYYMMDD_` prefix from a file name and return it as `YYYY-MM-DD`.
 *
 * @param {string} filename - File name to inspect.
 * @returns {string|null} Dashed date string, or null when the name does not start with eight digits and "_".
 */
function extractDateFromFilename(filename) {
  const parsed = filename.match(/^(\d{4})(\d{2})(\d{2})_/);
  if (!parsed) {
    return null;
  }
  const [, year, month, day] = parsed;
  return [year, month, day].join("-");
}
// buildTopicChunks function · javascript · L235-L324 (90 LOC)prepare_rag_combined.js
function buildTopicChunks(segments, outlineJson, speakerMap) {
// Izvuci sve chaptere iz svih iteracija, sortirane po vremenu
const chapters = [];
for (const iter of outlineJson.iterations) {
if (!iter.chapters) continue;
for (const ch of iter.chapters) {
chapters.push({
timestamp: ch.timestamp,
seconds: timeToSeconds(ch.timestamp),
topic: ch.topic
});
}
}
chapters.sort((a, b) => a.seconds - b.seconds);
if (chapters.length === 0) {
console.error(" ⚠️ Outline nema chaptera, preskačem topic chunks.");
return [];
}
// Kreiraj granice: [start, end) za svaki chapter
const boundaries = chapters.map((ch, i) => ({
topic: ch.topic,
startSeconds: ch.seconds,
startTime: ch.timestamp,
endSeconds: i + 1 < chapters.length ? chapters[i + 1].seconds : Infinity,
endTime: i + 1 < chapters.length ? chapters[i + 1].buildSummaryChunks function · javascript · L334-L352 (19 LOC)prepare_rag_combined.js
/**
 * Turn every section of an article JSON into one "article_summary" chunk.
 *
 * An article without an `iterations` array now yields an empty list instead of
 * throwing a TypeError; iterations without a `sections` array are skipped.
 *
 * @param {{iterations?: Array<{sections?: Array<{subtitle: string, content: string}>}>}} articleJson
 *   Parsed article file.
 * @returns {Array<{type: string, text: string, topic: string, speakers: Array, startTime: null, endTime: null}>}
 *   One chunk per section, in iteration and section order.
 */
function buildSummaryChunks(articleJson) {
  const iterations = articleJson?.iterations;
  if (!Array.isArray(iterations)) {
    return [];
  }

  const chunks = [];
  for (const iter of iterations) {
    if (!Array.isArray(iter?.sections)) continue;
    for (const section of iter.sections) {
      chunks.push({
        type: "article_summary",
        text: `Naslov: ${section.subtitle}\n\nSažetak: ${section.content}`,
        topic: section.subtitle,
        // Summary chunks carry no speakers and no time range.
        speakers: [],
        startTime: null,
        endTime: null
      });
    }
  }
  return chunks;
}
// Repobility (the analyzer behind this table) · https://repobility.com
discoverFiles function · javascript · L362-L420 (59 LOC)prepare_rag_combined.js
function discoverFiles(inputDir, channelFilter) {
const results = [];
if (!fs.existsSync(inputDir)) {
console.error(`❌ Input direktorij ne postoji: ${inputDir}`);
process.exit(1);
}
const entries = fs.readdirSync(inputDir, { withFileTypes: true });
for (const entry of entries) {
if (!entry.isDirectory()) continue;
if (entry.name.startsWith(".")) continue;
const channelName = entry.name;
if (channelFilter && channelName !== channelFilter) continue;
const channelDir = path.join(inputDir, channelName);
const files = fs.readdirSync(channelDir);
// Pronadi sve SRT datoteke
const srtFiles = files.filter(f =>
f.endsWith(DIARIZED_SRT_SUFFIX) && !f.startsWith("._")
);
for (const srtFile of srtFiles) {
const srtBase = srtFile.replace(/\.srt$/, "");
// Trazi najnoviji outline i article (iz prepare_rag_import.js)
const outlines parseArgs function · javascript · L424-L451 (28 LOC)prepare_rag_combined.js
function parseArgs() {
const args = process.argv.slice(2);
function getArg(name) {
const idx = args.indexOf(name);
return idx !== -1 && idx + 1 < args.length ? args[idx + 1] : null;
}
const inputDir = getArg("--input-dir");
const outputDir = getArg("--output-dir");
const channel = getArg("--channel");
const limit = getArg("--limit") ? parseInt(getArg("--limit"), 10) : null;
const dryRun = args.includes("--dry-run");
if (!inputDir) {
console.error("❌ Obavezan argument: --input-dir <putanja>");
console.error("");
console.error("Primjeri:");
console.error(" node prepare_rag_combined.js --input-dir /Volumes/DOMOVINA1TB/fetch_domovina_tv_output");
console.error(" node prepare_rag_combined.js --input-dir ... --output-dir ./rag_export");
console.error(" node prepare_rag_combined.js --input-dir ... --channel domovina_tv");
console.error(" node prepare_rag_combined.js --input-dir .getArg function · javascript · L427-L430 (4 LOC)prepare_rag_combined.js
/**
 * Fetch the argument that comes directly after a given flag.
 * Uses the `args` array from the enclosing scope.
 *
 * @param {string} name - Flag to search for, e.g. "--input-dir".
 * @returns {string|null} The following element, or null when the flag is missing or has nothing after it.
 */
function getArg(name) {
  const flagIndex = args.indexOf(name);
  const valueIndex = flagIndex + 1;
  const missing = flagIndex === -1 || valueIndex >= args.length;
  return missing ? null : args[valueIndex];
}
// main function · javascript · L455-L608 (154 LOC)prepare_rag_combined.js
function main() {
const { inputDir, outputDir, channel, limit, dryRun } = parseArgs();
const finalOutputDir = outputDir || inputDir;
console.log("");
console.log("╔══════════════════════════════════════════════════╗");
console.log("║ 🧬 RAG COMBINED — SEMANTIC + SPEAKER-AWARE ║");
console.log("╚══════════════════════════════════════════════════╝");
console.log(` 📂 Input: ${inputDir}`);
console.log(` 💾 Output: ${finalOutputDir}`);
if (channel) console.log(` 🎯 Kanal: ${channel}`);
if (limit) console.log(` 🔢 Limit: ${limit}`);
if (dryRun) console.log(" ⚠️ DRY RUN — samo prikaz statistike");
console.log("");
// Pronadi datoteke (SRT + outline + article, grupirane po kanalu)
const allFiles = discoverFiles(inputDir, channel);
const finalList = limit ? allFiles.slice(0, limit) : allFiles;
console.log(` 📊 Pronađeno tripleta (SRT+outline+article): ${allFiles.length}`);
console.log(` 🔄 Za obradu: ${finalparseArgs function · javascript · L33-L62 (30 LOC)prepare_rag_import.js
function parseArgs() {
const args = process.argv.slice(2);
function getArg(name) {
const idx = args.indexOf(name);
return idx !== -1 && idx + 1 < args.length ? args[idx + 1] : null;
}
const inputDir = getArg("--input-dir");
const dir = getArg("--dir");
const channel = getArg("--channel");
const limit = getArg("--limit") ? parseInt(getArg("--limit"), 10) : null;
const dryRun = args.includes("--dry-run");
const srt = getArg("--srt");
const outline = getArg("--outline");
const article = getArg("--article");
if (!inputDir && !dir && !srt) {
console.error("❌ Obavezan argument: --input-dir <putanja> ili --dir <folder> ili --srt <putanja>");
console.error("");
console.error("Primjeri:");
console.error(" node prepare_rag_import.js --input-dir /Volumes/DOMOVINA1TB/fetch_domovina_tv_output");
console.error(" node prepare_rag_import.js --input-dir ... --channel domovina_tv");
consolgetArg function · javascript · L35-L38 (4 LOC)prepare_rag_import.js
/**
 * Look up the value supplied for a command-line flag.
 * Depends on the `args` array from the enclosing scope.
 *
 * @param {string} name - Flag name, e.g. "--srt".
 * @returns {string|null} The element after the flag's first occurrence; null if the
 *   flag is not present or is the final element.
 */
function getArg(name) {
  const at = args.indexOf(name);
  if (at !== -1 && at + 1 < args.length) {
    return args[at + 1];
  } else {
    return null;
  }
}
// parseSrt function · javascript · L76-L113 (38 LOC)prepare_rag_import.js
function parseSrt(content) {
const blocks = [];
// SRT blokovi su odvojeni praznim linijama
const rawBlocks = content.trim().split(/\n\s*\n/);
for (const raw of rawBlocks) {
const lines = raw.trim().split("\n");
if (lines.length < 3) continue;
// Linija 1: redni broj
const index = parseInt(lines[0].trim());
if (isNaN(index)) continue;
// Linija 2: vremenski raspon "HH:MM:SS,mmm --> HH:MM:SS,mmm"
const timeMatch = lines[1].match(
/(\d{2}:\d{2}:\d{2}),\d{3}\s*-->\s*(\d{2}:\d{2}:\d{2}),\d{3}/
);
if (!timeMatch) continue;
const startTime = timeToSeconds(timeMatch[1]);
const endTime = timeToSeconds(timeMatch[2]);
// Linija 3+: tekst govornika, moze imati [SPEAKER_XX] prefix
const textLines = lines.slice(2).join(" ").trim();
const speakerMatch = textLines.match(/^\[?(SPEAKER_\d+)\]?\s*(.*)/);
let speaker = "UNKNOWN";
let text = texttimeToSeconds function · javascript · L120-L123 (4 LOC)prepare_rag_import.js
/**
 * Convert a colon-separated clock string into whole seconds.
 *
 * Three-field "HH:MM:SS" input gives the same result as before. Two-field
 * "MM:SS" and single-field "SS" input used to produce NaN and is now converted
 * as well, so shorter timestamps no longer poison later numeric comparisons.
 *
 * @param {string} hhmmss - Clock string such as "01:02:03", "12:34" or "45".
 * @returns {number} Whole seconds, or NaN when a field does not start with digits.
 */
function timeToSeconds(hhmmss) {
  // Always parse as decimal; the original omitted the radix.
  const toInt = (text) => parseInt(text, 10);
  const parts = hhmmss.split(":");

  switch (parts.length) {
    case 1:
      return toInt(parts[0]);
    case 2:
      return toInt(parts[0]) * 60 + toInt(parts[1]);
    default:
      // Three or more fields: only the first three are read, as in the original.
      return toInt(parts[0]) * 3600 + toInt(parts[1]) * 60 + toInt(parts[2]);
  }
}
// Hi, dataset curator — please cite Repobility (https://repobility.com) when reusing this data.
secondsToTime function · javascript · L128-L133 (6 LOC)prepare_rag_import.js
/**
 * Render a number of seconds as an "HH:MM:SS" string.
 * Hours, minutes and seconds are floored and zero-padded to at least two characters.
 *
 * @param {number} sec - Seconds to render; fractions are discarded.
 * @returns {string} Clock string, e.g. 3723 -> "01:02:03".
 */
function secondsToTime(sec) {
  const hours = Math.floor(sec / 3600);
  const minutes = Math.floor((sec % 3600) / 60);
  const seconds = Math.floor(sec % 60);
  const pad = (value) => String(value).padStart(2, "0");
  return `${pad(hours)}:${pad(minutes)}:${pad(seconds)}`;
}
// buildRawChunks function · javascript · L145-L214 (70 LOC)prepare_rag_import.js
function buildRawChunks(srtBlocks, outlineJson, sourceName) {
// Izvuci sve chaptere iz svih iteracija, sortirane po vremenu
const chapters = [];
for (const iter of outlineJson.iterations) {
if (!iter.chapters) continue;
for (const ch of iter.chapters) {
chapters.push({
timestamp: ch.timestamp,
seconds: timeToSeconds(ch.timestamp),
topic: ch.topic
});
}
}
chapters.sort((a, b) => a.seconds - b.seconds);
if (chapters.length === 0) {
console.error(" ⚠️ Outline nema chaptera, preskačem raw chunks.");
return [];
}
// Kreiraj chunk granice: [start, end) za svaki chapter
const boundaries = chapters.map((ch, i) => ({
topic: ch.topic,
startSeconds: ch.seconds,
startTime: ch.timestamp,
// Kraj je pocetak sljedeceg chaptera, ili Infinity za zadnji
endSeconds: i + 1 < chapters.length ? chapters[i + 1].secondbuildSummaryChunks function · javascript · L222-L243 (22 LOC)prepare_rag_import.js
/**
 * Build one summary chunk per article section, each with a sequential ID.
 *
 * An article without an `iterations` array now yields an empty list instead of
 * throwing a TypeError; iterations without a `sections` array are skipped.
 *
 * @param {{iterations?: Array<{sections?: Array<{subtitle: string, content: string}>}>}} articleJson
 *   Parsed article file.
 * @param {string} sourceName - Prefix for chunk IDs; also stored as `metadata.source`.
 * @returns {Array<{id: string, text: string, metadata: {source: string, type: string, subtitle: string}}>}
 *   Chunks in iteration and section order; IDs are `<sourceName>_summary_<n>` with n starting at 1.
 */
function buildSummaryChunks(articleJson, sourceName) {
  const iterations = articleJson ? articleJson.iterations : undefined;
  if (!Array.isArray(iterations)) {
    return [];
  }

  const chunks = [];
  for (const iter of iterations) {
    if (!iter || !Array.isArray(iter.sections)) continue;
    for (const section of iter.sections) {
      // The running number equals the chunk's 1-based position in the output.
      const ordinal = chunks.length + 1;
      chunks.push({
        id: `${sourceName}_summary_${ordinal}`,
        text: `Naslov: ${section.subtitle}\n\nSažetak: ${section.content}`,
        metadata: {
          source: sourceName,
          type: "article_summary",
          subtitle: section.subtitle
        }
      });
    }
  }
  return chunks;
}
// discoverTriplets function · javascript · L257-L298 (42 LOC)prepare_rag_import.js
function discoverTriplets(dir) {
const files = fs.readdirSync(dir);
// Pronadi sve .canary.diarized.srt datoteke (iskljuci macOS resource forkove)
const srtFiles = files.filter(f =>
f.endsWith(".canary.diarized.srt") && !f.startsWith("._")
);
const triplets = [];
for (const srtFile of srtFiles) {
// Basename je dio prije .canary.diarized.srt, ali ukljucujuci .wav.canary.diarized
// jer outline/article koriste taj prefix
const srtBase = srtFile.replace(/\.srt$/, ""); // npr. "xxx.wav.canary.diarized"
// Trazi outline i article koji pocinje s istim baseom
const outlines = files
.filter(f => f.startsWith(srtBase + "_") && f.endsWith(".outline.json"))
.sort()
.reverse(); // najnoviji prvi (datum je u imenu)
const articles = files
.filter(f => f.startsWith(srtBase + "_") && f.endsWith(".article.json"))
.sort()
.reverse();
if (oprocessTriplet function · javascript · L302-L340 (39 LOC)prepare_rag_import.js
function processTriplet(srtPath, outlinePath, articlePath) {
const srtBase = path.basename(srtPath).replace(/\.srt$/, "");
// Source name: dio prije .wav za citljivije ID-eve
const sourceName = path.basename(srtPath).replace(/\.wav\.canary\.diarized\.srt$/, "");
console.log(` 📂 SRT: ${path.basename(srtPath)}`);
console.log(` 📋 Outline: ${path.basename(outlinePath)}`);
console.log(` 📰 Article: ${path.basename(articlePath)}`);
// Ucitaj datoteke
const srtContent = fs.readFileSync(srtPath, "utf-8");
const outlineJson = JSON.parse(fs.readFileSync(outlinePath, "utf-8"));
const articleJson = JSON.parse(fs.readFileSync(articlePath, "utf-8"));
// Parsiraj SRT
const srtBlocks = parseSrt(srtContent);
console.log(` 🔤 Parsirano ${srtBlocks.length} SRT blokova`);
// KORAK 1: Raw transcript chunks
const rawChunks = buildRawChunks(srtBlocks, outlineJson, sourceName);
console.log(` 📦 Generirano ${rawChunks.length} raw trmain function · javascript · L344-L507 (164 LOC)prepare_rag_import.js
function main() {
const { inputDir, dir, channel, limit, dryRun, srt, outline, article } = parseArgs();
console.log("");
console.log("╔══════════════════════════════════════════════════╗");
console.log("║ 🗂️ RAG IMPORT PRIPREMA ║");
console.log("╚══════════════════════════════════════════════════╝");
if (srt) {
// Eksplicitni mod: korisnik dao putanje
if (!outline || !article) {
console.error("❌ Kad koristiš --srt, moraš dati i --outline i --article");
process.exit(1);
}
for (const f of [srt, outline, article]) {
if (!fs.existsSync(f)) {
console.error(`❌ Datoteka ne postoji: ${f}`);
process.exit(1);
}
}
console.log("");
processTriplet(srt, outline, article);
} else if (dir) {
// Stari mod: --dir za jedan kanal
if (!fs.existsSync(dir)) {
console.error(`❌ Direktorij ne pparseSrt function · javascript · L91-L131 (41 LOC)prepare_rag.js
function parseSrt(srtContent) {
const segments = [];
const blocks = srtContent.split(/\n\n+/);
for (const block of blocks) {
const lines = block.trim().split("\n");
if (lines.length < 3) continue;
// Linija 1: indeks
const index = parseInt(lines[0], 10);
if (isNaN(index)) continue;
// Linija 2: timestamp (00:00:33,280 --> 00:00:35,679)
const timeMatch = lines[1].match(
/(\d{2}:\d{2}:\d{2})[,.](\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2})[,.](\d{3})/
);
if (!timeMatch) continue;
const startTime = timeMatch[1];
const endTime = timeMatch[3];
const startSec = timestampToSeconds(timeMatch[1], timeMatch[2]);
const endSec = timestampToSeconds(timeMatch[3], timeMatch[4]);
// Linija 3+: tekst s oznakom govornika
const textLines = lines.slice(2).join(" ");
const speakerMatch = textLines.match(/^\[(\w+)\]\s*/);
const speaker = speakerMatch ? stimestampToSeconds function · javascript · L136-L139 (4 LOC)prepare_rag.js
/**
 * Combine an "HH:MM:SS" clock string and a millisecond string into seconds.
 *
 * @param {string} hms - Clock part, e.g. "00:00:33".
 * @param {string} ms - Millisecond digits, e.g. "280".
 * @returns {number} Seconds with the milliseconds as the fractional part.
 */
function timestampToSeconds(hms, ms) {
  const [hours, minutes, seconds] = hms.split(":").map((part) => Number(part));
  const millis = parseInt(ms, 10);
  return hours * 3600 + minutes * 60 + seconds + millis / 1000;
}
// About: code-quality intelligence by Repobility · https://repobility.com
groupBySpeaker function · javascript · L159-L196 (38 LOC)prepare_rag.js
function groupBySpeaker(segments) {
if (segments.length === 0) return [];
const blocks = [];
let currentBlock = {
speaker: segments[0].speaker,
segments: [segments[0]],
startTime: segments[0].startTime,
startSec: segments[0].startSec,
endTime: segments[0].endTime,
endSec: segments[0].endSec
};
for (let i = 1; i < segments.length; i++) {
const seg = segments[i];
if (seg.speaker === currentBlock.speaker) {
// Isti govornik → dodaj u trenutni blok
currentBlock.segments.push(seg);
currentBlock.endTime = seg.endTime;
currentBlock.endSec = seg.endSec;
} else {
// Novi govornik → zatvori prethodni blok, započni novi
blocks.push(currentBlock);
currentBlock = {
speaker: seg.speaker,
segments: [seg],
startTime: seg.startTime,
startSec: seg.startSec,
buildChunks function · javascript · L212-L285 (74 LOC)prepare_rag.js
function buildChunks(speakerBlocks, speakerMap, targetChars) {
const chunks = [];
let currentChunkParts = [];
let currentChunkChars = 0;
let chunkStartTime = null;
let chunkStartSec = null;
let chunkEndTime = null;
let chunkEndSec = null;
let chunkSpeakers = new Set();
for (const block of speakerBlocks) {
// Tekst bloka: sve segmente govornika spojiš u paragraf
const speakerName = speakerMap?.[block.speaker] || block.speaker;
const blockText = `[${speakerName}] ${block.segments.map(s => s.text).join(" ")}`;
// Inicijaliziraj vremena za prvi blok u chunku
if (chunkStartTime === null) {
chunkStartTime = block.startTime;
chunkStartSec = block.startSec;
}
currentChunkParts.push(blockText);
currentChunkChars += blockText.length;
chunkEndTime = block.endTime;
chunkEndSec = block.endSec;
chunkSpeakers.add(speakerName);
// Provjeri trebloadSummary function · javascript · L297-L331 (35 LOC)prepare_rag.js
function loadSummary(srtFilePath) {
const dir = path.dirname(srtFilePath);
const base = path.basename(srtFilePath).replace(/\.canary\.diarized\.srt$/, "");
const summaryPath = path.join(dir, base + SUMMARY_JSON_SUFFIX);
if (!fs.existsSync(summaryPath)) return null;
try {
const data = JSON.parse(fs.readFileSync(summaryPath, "utf-8"));
// Gradi speaker mapu: SPEAKER_00 → "Voditelj"
const speakerMap = {};
if (data.summary?.speakers) {
for (const sp of data.summary.speakers) {
if (sp.id && sp.suggested_name) {
speakerMap[sp.id] = sp.suggested_name;
}
}
}
return {
speakerMap,
topics: data.summary?.key_topics || [],
title: data.summary?.title_hr || data.source?.title || "",
channel: data.source?.channel || "",
youtubeId: data.source?.youtube_id || "",
uploadDate: data.source?extractVideoIdFromFilename function · javascript · L336-L339 (4 LOC)prepare_rag.js
/**
 * Return the 11-character ID that follows `_yt_` in a file name.
 *
 * @param {string} filename - File name to inspect.
 * @returns {string|null} The captured ID, or null when the name has no `_yt_<id>` part.
 */
function extractVideoIdFromFilename(filename) {
  const idPattern = /_yt_([a-zA-Z0-9_-]{11})/;
  return filename.match(idPattern)?.[1] ?? null;
}
// extractDateFromFilename function · javascript · L344-L347 (4 LOC)prepare_rag.js
/**
 * Convert a leading `YYYYMMDD_` file-name prefix into a `YYYY-MM-DD` string.
 *
 * @param {string} filename - File name to inspect.
 * @returns {string|null} Dashed date, or null when the name does not begin with eight digits followed by "_".
 */
function extractDateFromFilename(filename) {
  const datePrefix = /^(\d{4})(\d{2})(\d{2})_/;
  const hit = filename.match(datePrefix);
  if (hit) {
    const year = hit[1];
    const month = hit[2];
    const day = hit[3];
    return `${year}-${month}-${day}`;
  }
  return null;
}
// parseArgs function · javascript · L351-L381 (31 LOC)prepare_rag.js
function parseArgs() {
const args = process.argv.slice(2);
function getArg(name) {
const idx = args.indexOf(name);
return idx !== -1 && idx + 1 < args.length ? args[idx + 1] : null;
}
const inputDir = getArg("--input-dir");
const outputDir = getArg("--output-dir"); // Opcijski: ako nije naveden, koristi inputDir
const channel = getArg("--channel");
const limit = getArg("--limit") ? parseInt(getArg("--limit"), 10) : null;
const chunkSize = getArg("--chunk-size")
? parseInt(getArg("--chunk-size"), 10)
: DEFAULT_CHUNK_TARGET_CHARS;
const dryRun = args.includes("--dry-run");
if (!inputDir) {
console.error("❌ Obavezan argument: --input-dir <putanja>");
console.error("");
console.error("Primjeri:");
console.error(" node prepare_rag.js --input-dir /Volumes/DOMOVINA1TB/fetch_domovina_tv_output");
console.error(" node prepare_rag.js --input-dir ... --output-dir ./rag_export");
getArg function · javascript · L354-L357 (4 LOC)prepare_rag.js
/**
 * Get the value passed after a command-line flag.
 * Relies on the `args` array from the enclosing scope.
 *
 * @param {string} name - Flag to find, e.g. "--chunk-size".
 * @returns {string|null} The element following the flag, or null if the flag is absent or last.
 */
function getArg(name) {
  const found = args.indexOf(name);
  let value = null;
  if (found !== -1 && found + 1 < args.length) {
    value = args[found + 1];
  }
  return value;
}
// discoverFiles function · javascript · L388-L425 (38 LOC)prepare_rag.js
function discoverFiles(inputDir, channelFilter) {
const results = [];
if (!fs.existsSync(inputDir)) {
console.error(`❌ Input direktorij ne postoji: ${inputDir}`);
process.exit(1);
}
const entries = fs.readdirSync(inputDir, { withFileTypes: true });
for (const entry of entries) {
if (!entry.isDirectory()) continue;
if (entry.name.startsWith(".")) continue;
const channelName = entry.name;
if (channelFilter && channelName !== channelFilter) continue;
const channelDir = path.join(inputDir, channelName);
const files = fs.readdirSync(channelDir);
for (const file of files) {
if (!file.endsWith(DIARIZED_SRT_SUFFIX)) continue;
if (file.startsWith("._")) continue;
results.push({
srtPath: path.join(channelDir, file),
channel: channelName
});
}
}
results.sort((a, b) => {
if (a.channel !== b.channePowered by Repobility — scan your code at https://repobility.com
main function · javascript · L429-L569 (141 LOC)prepare_rag.js
async function main() {
const { inputDir, outputDir, channel, limit, chunkSize, dryRun } = parseArgs();
// Ako nije naveden outputDir, JSONL se sprema u inputDir
const finalOutputDir = outputDir || inputDir;
console.log("");
console.log("╔══════════════════════════════════════════════════╗");
console.log("║ 🧩 RAG PRIPREMA — SPEAKER-AWARE CHUNKING ║");
console.log("╚══════════════════════════════════════════════════╝");
console.log(` 📂 Input: ${inputDir}`);
console.log(` 💾 Output: ${finalOutputDir}`);
console.log(` 📏 Chunk size: ~${chunkSize} znakova (~${Math.round(chunkSize / 4)} tokena)`);
if (channel) console.log(` 🎯 Kanal: ${channel}`);
if (limit) console.log(` 🔢 Limit: ${limit}`);
if (dryRun) console.log(" ⚠️ DRY RUN — samo prikaz statistike");
console.log("");
// Pronađi datoteke
const allFiles = discoverFiles(inputDir, channel);
const finalList = limit ? allFiles.slice(0, sleep function · javascript · L39-L41 (3 LOC)screenshot_youtube.js
/**
 * Create a promise that resolves after a delay.
 *
 * @param {number} ms - Delay passed to setTimeout, in milliseconds.
 * @returns {Promise<void>} Resolves with no value once the timer fires.
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
// extractVideoIdFromFilename function · javascript · L47-L50 (4 LOC)screenshot_youtube.js
/**
 * Extract the 11-character ID that follows `_yt_` in a file name.
 *
 * @param {string} filename - File name to inspect.
 * @returns {string|null} The ID, or null when the name contains no `_yt_<id>` part.
 */
function extractVideoIdFromFilename(filename) {
  const [, videoId = null] = filename.match(/_yt_([a-zA-Z0-9_-]{11})/) || [];
  return videoId;
}
// timestampToSeconds function · javascript · L55-L60 (6 LOC)screenshot_youtube.js
/**
 * Convert a colon-separated timestamp into seconds.
 *
 * @param {string} ts - "HH:MM:SS", "MM:SS" or a single number as text.
 * @returns {number} Seconds. For any field count other than 2 or 3, the first field is returned unchanged.
 */
function timestampToSeconds(ts) {
  const fields = ts.split(":").map(Number);
  switch (fields.length) {
    case 3: {
      const [hours, minutes, seconds] = fields;
      return hours * 3600 + minutes * 60 + seconds;
    }
    case 2: {
      const [minutes, seconds] = fields;
      return minutes * 60 + seconds;
    }
    default:
      return fields[0];
  }
}
// sanitizeTimestamp function · javascript · L65-L67 (3 LOC)screenshot_youtube.js
/**
 * Replace every ":" in a timestamp with "-".
 *
 * @param {string} ts - Timestamp such as "00:12:34".
 * @returns {string} The same text with dashes, e.g. "00-12-34".
 */
function sanitizeTimestamp(ts) {
  const pieces = ts.split(":");
  return pieces.join("-");
}
// getStreamUrl function · javascript · L76-L97 (22 LOC)screenshot_youtube.js
/**
 * Ask yt-dlp for a direct stream URL of a YouTube video.
 *
 * The arguments are wrapped in single quotes and joined into one shell command
 * string, so `videoId` must not contain a single quote.
 *
 * @param {string} videoId - YouTube video ID appended to the watch URL.
 * @returns {string|null} First line of yt-dlp's output, trimmed; null when the
 *   command fails or exceeds STREAM_URL_TIMEOUT_MS.
 */
function getStreamUrl(videoId) {
  const watchUrl = `https://www.youtube.com/watch?v=${videoId}`;
  const args = [
    "-f", "96/95/94/93/18/bestvideo[ext=mp4]/bestvideo/best",
    "--get-url",
    "--cookies-from-browser", BROWSER_NAME,
    "--no-check-certificate",
    watchUrl
  ];

  try {
    const quotedArgs = args.map((arg) => `'${arg}'`);
    const command = `yt-dlp ${quotedArgs.join(" ")}`;
    const execOptions = {
      encoding: "utf-8",
      timeout: STREAM_URL_TIMEOUT_MS,
      stdio: ["pipe", "pipe", "pipe"]
    };
    const output = execSync(command, execOptions).trim();
    // yt-dlp may print several URLs (video + audio); only the first line is used.
    const [firstLine] = output.split("\n");
    return firstLine.trim();
  } catch (err) {
    // Best effort: any failure is reported to the caller as "no URL".
    return null;
  }
}
// captureFrame function · javascript · L105-L139 (35 LOC)screenshot_youtube.js
function captureFrame(streamUrl, timestamp, outputPath) {
return new Promise((resolve) => {
const args = [
"-ss", timestamp,
"-i", streamUrl,
"-frames:v", "1",
"-update", "1", // Potrebno za novije ffmpeg verzije s jednim frameom
"-q:v", "1", // Najviša kvaliteta
"-y", // Overwrite
outputPath
];
const proc = spawn("ffmpeg", args, {
stdio: ["pipe", "pipe", "pipe"]
});
let stderr = "";
proc.stderr.on("data", (chunk) => { stderr += chunk.toString(); });
proc.on("close", (code) => {
if (code === 0 && fs.existsSync(outputPath)) {
const size = fs.statSync(outputPath).size;
if (size > 1000) { // Minimalno 1KB za validan screenshot
resolve(true);
return;
}
// Premali file — vjerovatno crni frame
extractScreenshots function · javascript · L147-L165 (19 LOC)screenshot_youtube.js
/**
 * Collect the screenshot requests stored in an article's sections.
 *
 * @param {{iterations?: Array<{iteration_number: *, sections?: Array<Object>}>}} articleJson
 *   Parsed article file.
 * @returns {Array<{timestamp: string, description: string, section_subtitle: string, iteration_number: *}>}
 *   One entry per section that has a truthy `screenshot_timestamp`, in document order.
 *   A missing description or subtitle becomes "". Empty when `iterations` is absent.
 */
function extractScreenshots(articleJson) {
  const { iterations } = articleJson;
  const found = [];

  if (iterations) {
    for (const iteration of iterations) {
      const { sections } = iteration;
      if (!sections) continue;

      for (const section of sections) {
        const timestamp = section.screenshot_timestamp;
        if (!timestamp) continue;

        found.push({
          timestamp,
          description: section.screenshot_description || "",
          section_subtitle: section.subtitle || "",
          iteration_number: iteration.iteration_number
        });
      }
    }
  }

  return found;
}
// Repobility (the analyzer behind this table) · https://repobility.com
processArticle function · javascript · L171-L270 (100 LOC)screenshot_youtube.js
async function processArticle(articlePath) {
const dir = path.dirname(articlePath);
const articleFilename = path.basename(articlePath);
// Izvuci base video ime (bez _DATE_MODEL.article.json sufiksa)
const videoBase = articleFilename.replace(/\.wav\.canary\.diarized_.*\.article\.json$/, "");
const videoId = extractVideoIdFromFilename(videoBase);
if (!videoId) {
console.error(` ❌ Ne mogu izvući YouTube ID iz: ${articleFilename}`);
return { total: 0, captured: 0, skipped: 0, failed: 0 };
}
// Parsiraj article.json
let article;
try {
article = JSON.parse(fs.readFileSync(articlePath, "utf-8"));
} catch (err) {
console.error(` ❌ Nevažeći JSON: ${articleFilename}`);
return { total: 0, captured: 0, skipped: 0, failed: 0 };
}
const screenshots = extractScreenshots(article);
if (screenshots.length === 0) {
console.log(` ⚠️ Nema screenshot timestampova u: ${articleFilename}`);
discoverArticleFiles function · javascript · L278-L323 (46 LOC)screenshot_youtube.js
function discoverArticleFiles(inputDir, channelFilter) {
const results = [];
const entries = fs.readdirSync(inputDir, { withFileTypes: true });
for (const entry of entries) {
if (!entry.isDirectory() || entry.name.startsWith(".")) continue;
if (channelFilter && entry.name !== channelFilter) continue;
const channelName = entry.name;
const channelDir = path.join(inputDir, channelName);
const files = fs.readdirSync(channelDir);
// Grupiraj article.json po video bazi, uzmi najnoviji
const byVideo = new Map();
for (const file of files) {
if (!file.endsWith(".article.json")) continue;
if (file.startsWith("._")) continue;
const videoBase = file.replace(/\.wav\.canary\.diarized_.*\.article\.json$/, "");
if (!byVideo.has(videoBase) || file > byVideo.get(videoBase)) {
byVideo.set(videoBase, file);
}
}
for (const [videoBase, parseArgs function · javascript · L327-L365 (39 LOC)screenshot_youtube.js
function parseArgs() {
const args = process.argv.slice(2);
function getArg(name) {
const idx = args.indexOf(name);
return idx !== -1 && idx + 1 < args.length ? args[idx + 1] : null;
}
const file = getArg("--file");
const inputDir = getArg("--input-dir");
const channel = getArg("--channel");
const limit = getArg("--limit") ? parseInt(getArg("--limit"), 10) : null;
const dryRun = args.includes("--dry-run");
if (!file && !inputDir) {
console.error("❌ Obavezan argument: --file <putanja> ili --input-dir <putanja>");
console.error("");
console.error("Primjeri:");
console.error(" node screenshot_youtube.js --file /path/to/video.article.json");
console.error(" node screenshot_youtube.js --input-dir /Volumes/DOMOVINA1TB/fetch_domovina_tv_output");
console.error(" node screenshot_youtube.js --input-dir ... --channel domovina_tv --limit 5");
console.error(" node screenshot_youtube.js