Function bodies 161 total
weighted_query_with_provenance function · python · L171-L185 (15 LOC) · dex_weights.py
def weighted_query_with_provenance(
    query_text: str,
    n_results: int = 5,
    include_external: bool = False
) -> tuple[list[dict], str]:
    """Run a weighted query and summarise which labels the hits carry.

    The three arguments are forwarded unchanged to ``weighted_query``. The
    ``"label"`` value of every returned result is then tallied.

    Returns:
        A ``(results, provenance)`` pair. ``results`` is the list returned by
        ``weighted_query``; ``provenance`` is a string such as
        ``[Sources: 2×a | 1×b]`` with the labels in sorted order.
    """
    hits = weighted_query(query_text, n_results, include_external)

    # Count how many hits carry each label.
    tally: dict[str, int] = {}
    for hit in hits:
        label = hit["label"]
        tally[label] = tally.get(label, 0) + 1

    summary = " | ".join(f"{tally[label]}×{label}" for label in sorted(tally))
    return hits, f"[Sources: {summary}]"


# print_weight_stats function · python · L189-L200 (12 LOC) · dex_weights.py
def print_weight_stats() -> None:
    """Print a table of effective source weights to stdout.

    For every collection in ``COLLECTIONS`` and every file type in
    ``FILE_TYPE_WEIGHTS``, the effective weight is the collection's
    ``"base_weight"`` multiplied by the file-type multiplier, rounded to
    four decimal places.
    """
    print("\n DEX JR. SOURCE WEIGHT TABLE")
    print(" " + "="*55)
    print(f" {'Collection':<15} {'File Type':<20} {'Weight':>8}")
    print(" " + "-"*55)
    for coll, cconf in COLLECTIONS.items():
        base = cconf["base_weight"]
        # Largest multiplier first within each collection.
        for ftype, mult in sorted(FILE_TYPE_WEIGHTS.items(), key=lambda x: -x[1]):
            effective = round(base * mult, 4)
            print(f" {coll:<15} {ftype:<20} {effective:>8.4f}")
    # NOTE(review): the listing this was recovered from had no indentation, so
    # the nesting of this trailing print() is reconstructed — confirm whether
    # the blank line belongs after the whole table or after each collection.
    print()


# build_header function · python · L35-L56 (22 LOC) · dex-whisper.py
def build_header(filename, language, duration_seconds, model_used):
    """Build the metadata banner written at the top of a transcript.

    Args:
        filename: Value shown on the ``source_file`` line.
        language: Value shown on the ``language`` line.
        duration_seconds: Number formatted with one decimal place.
        model_used: Value shown on the ``whisper_model`` line.

    Returns:
        A multi-line string of ``#``-prefixed lines ending in a newline. The
        ``transcribed_at`` line holds the current local time in ISO format;
        all remaining fields are fixed text.
    """
    stamp = datetime.datetime.now().isoformat()
    rule = "# ====================================================================="
    lines = (
        rule,
        "# DEX JR. VIDEO TRANSCRIPT",
        rule,
        f"# source_file: {filename}",
        "# source_type: video_transcript",
        "# origin: tiktok_public",
        "# platform: tiktok",
        "# account: @dropdownlogistics",
        "# era: pre_dexverse",
        "# confidence: whisper_auto",
        "# processing_stage: unstructured",
        f"# language: {language}",
        f"# duration_seconds: {duration_seconds:.1f}",
        f"# whisper_model: {model_used}",
        f"# transcribed_at: {stamp}",
        "# tier: foundation",
        "# status: pre_canonical",
        rule,
    )
    # The banner always ends with a newline so the transcript starts on its own line.
    return "\n".join(lines) + "\n"


# main function · python · L61-L159 (99 LOC) · dex-whisper.py
def main():
parser = argparse.ArgumentParser(description="Dex Jr Whisper Batch Transcription Pipeline")
parser.add_argument("--dir", default=DEFAULT_INPUT_DIR, help="Directory containing video files")
parser.add_argument("--output", default=DEFAULT_OUTPUT_DIR, help="Output directory for transcripts")
parser.add_argument("--model", default=DEFAULT_MODEL, help="Whisper model: tiny, base, small, medium, large")
parser.add_argument("--skip-existing", action="store_true", help="Skip files that already have a transcript")
parser.add_argument("--language", default=None, help="Force language (e.g. en). Auto-detect if not set.")
args = parser.parse_args()
input_dir = args.dir
output_dir = args.output
os.makedirs(output_dir, exist_ok=True)
# Scan for video files
files = [
f for f in os.listdir(input_dir)
if os.path.splitext(f)[1].lower() in SUPPORTED_EXTENSIONS
]
fxlsx_to_text function · python · L50-L123 (74 LOC)dex-xlsx.py
def xlsx_to_text(filepath, preview=False):
"""Convert an xlsx file to structured text."""
try:
wb = load_workbook(filepath, read_only=True, data_only=True)
except Exception as e:
print(f" ERROR reading {filepath}: {e}")
return None
filename = os.path.basename(filepath)
output_parts = []
# Header
output_parts.append(f"XLSX CONVERSION: {filename}")
output_parts.append(f"Source: {filepath}")
output_parts.append(f"Converted: {datetime.datetime.now().isoformat()}")
output_parts.append(f"Sheets: {len(wb.sheetnames)}")
output_parts.append(f"Sheet names: {', '.join(wb.sheetnames)}")
output_parts.append("=" * 60)
total_rows = 0
total_cells = 0
for sheet_name in wb.sheetnames:
ws = wb[sheet_name]
output_parts.append(f"\n{'─' * 60}")
output_parts.append(f"SHEET: {sheet_name}")
output_parts.append(f"{'─' * 60}\n")
row_count = 0
for row in ws.iter_rows(values_onlsave_text function · python · L128-L138 (11 LOC)dex-xlsx.py
def save_text(result, output_dir):
    """Write a conversion result's text into *output_dir*.

    The directory is created if needed. The output file is named after
    ``result["filename"]`` with its extension replaced by ``_xlsx.txt`` and
    is written as UTF-8.

    Returns:
        The path of the file that was written.
    """
    os.makedirs(output_dir, exist_ok=True)

    stem, _ext = os.path.splitext(result["filename"])
    destination = os.path.join(output_dir, f"{stem}_xlsx.txt")

    with open(destination, "w", encoding="utf-8") as handle:
        handle.write(result["text"])

    size = result["char_count"]
    print(f" Saved: {destination} ({size:,} chars)")
    return destination


# ingest_text function · python · L140-L151 (12 LOC) · dex-xlsx.py
def ingest_text(result):
    """Write a conversion result into the canon folder for corpus ingestion.

    ``CANON_DIR`` is created if needed. The file is named
    ``XLSX_<stem>_<YYYYmmdd_HHMMSS>.txt``, where ``<stem>`` is
    ``result["filename"]`` without its extension and the timestamp is the
    current local time. The text is written as UTF-8.

    Returns:
        The path of the file that was written.
    """
    os.makedirs(CANON_DIR, exist_ok=True)

    stem, _ext = os.path.splitext(result["filename"])
    stamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    destination = os.path.join(CANON_DIR, f"XLSX_{stem}_{stamp}.txt")

    with open(destination, "w", encoding="utf-8") as handle:
        handle.write(result["text"])

    size = result["char_count"]
    print(f" Ingested: {destination} ({size:,} chars)")
    return destination


# Repobility analyzer · published findings · https://repobility.com
find_xlsx_files function · python · L156-L162 (7 LOC) · dex-xlsx.py
def find_xlsx_files(folder):
    """Find all .xlsx files in a folder (non-recursive).

    The extension is matched case-insensitively, so ``REPORT.XLSX`` is found
    as well as ``report.xlsx``. Names starting with ``~$`` are skipped, as
    are entries that are not regular files (for example a directory whose
    name happens to end in ``.xlsx``).

    Args:
        folder: Directory to scan.

    Returns:
        Paths (``folder`` joined with each matching name), sorted by file
        name so repeated runs process files in the same order.

    Raises:
        OSError: If ``folder`` cannot be listed.
    """
    matches = []
    # os.listdir() returns entries in arbitrary order; sort for stable output.
    for filename in sorted(os.listdir(folder)):
        if filename.startswith("~$"):
            continue
        if not filename.lower().endswith(".xlsx"):
            continue
        path = os.path.join(folder, filename)
        if os.path.isfile(path):
            matches.append(path)
    return matches


# find_xlsx_recursive function · python · L164-L171 (8 LOC) · dex-xlsx.py
def find_xlsx_recursive(folder):
    """Find all .xlsx files recursively.

    The extension is matched case-insensitively, so ``REPORT.XLSX`` is found
    as well as ``report.xlsx``. Names starting with ``~$`` are skipped.

    Args:
        folder: Root directory of the search.

    Returns:
        Paths of matching files. Each directory's files are listed in sorted
        order before its subdirectories, which are visited in sorted order,
        so repeated runs process files in the same order.
    """
    matches = []
    for root, dirs, filenames in os.walk(folder):
        # Sorting in place makes os.walk descend in a stable order.
        dirs.sort()
        for filename in sorted(filenames):
            if filename.startswith("~$"):
                continue
            if filename.lower().endswith(".xlsx"):
                matches.append(os.path.join(root, filename))
    return matches


# log_conversion function · python · L176-L191 (16 LOC) · dex-xlsx.py
def log_conversion(filepath, result, action):
    """Append one JSON line describing a conversion to ``LOG_FILE``.

    Logging is best-effort: a failure to serialise or write the entry is
    reported on stdout and otherwise ignored, so it never aborts a
    conversion. Interpreter-level signals such as ``KeyboardInterrupt`` are
    not intercepted.

    Args:
        filepath: Source path recorded under ``"source"``.
        result: Conversion result; its ``"filename"``, ``"sheets"``,
            ``"total_rows"``, ``"total_cells"`` and ``"char_count"`` values
            are copied into the entry.
        action: Value recorded under ``"action"``.

    Raises:
        KeyError: If ``result`` lacks one of the keys listed above.
    """
    entry = {
        "timestamp": datetime.datetime.now().isoformat(),
        "source": filepath,
        "filename": result["filename"],
        "sheets": result["sheets"],
        "total_rows": result["total_rows"],
        "total_cells": result["total_cells"],
        "char_count": result["char_count"],
        "action": action,
    }
    try:
        # Serialise before opening so a bad entry never touches the log file.
        line = json.dumps(entry, ensure_ascii=False) + "\n"
        with open(LOG_FILE, "a", encoding="utf-8") as f:
            f.write(line)
    except (OSError, TypeError, ValueError) as e:
        # OSError: the log file cannot be opened or written.
        # TypeError/ValueError: the entry cannot be serialised as JSON.
        print(f" WARNING: could not write conversion log: {e}")


# main function · python · L196-L288 (93 LOC) · dex-xlsx.py
def main():
parser = argparse.ArgumentParser(description="Dex Jr XLSX Converter v1.0")
parser.add_argument("path", help="Path to .xlsx file or folder")
parser.add_argument("--all", action="store_true", help="Convert all .xlsx in folder")
parser.add_argument("--recursive", action="store_true", help="Search subfolders too")
parser.add_argument("--ingest", action="store_true", help="Save to canon for ingestion")
parser.add_argument("--save", default=None, help="Save to specified folder")
parser.add_argument("--preview", action="store_true", help="Show first 50 rows per sheet")
args = parser.parse_args()
# Single file mode
if not args.all and os.path.isfile(args.path):
print(f"\n Converting: {args.path}")
result = xlsx_to_text(args.path, preview=args.preview)
if not result:
return
print(f" Sheets: {result['sheets']} ({', '.join(result['sheet_names'])})")
print(f" Rows: {result['total_rows']:,}‹ prevpage 4 / 4