Function bodies 424 total
convert function · rust · L347-L433 (87 LOC)src/converters/docx.rs
async fn convert(
&self,
input: &Path,
output_format: OutputFormat,
options: ConversionOptions,
) -> Result<ConversionResult> {
let start_time = Instant::now();
// Get input file size
let input_size = tokio::fs::metadata(input).await?.len();
// Convert based on output format
let content = match output_format {
OutputFormat::Markdown { .. } => {
#[cfg(feature = "office")]
{
self.convert_to_markdown(input, &options).await?
}
#[cfg(not(feature = "office"))]
{
return Err(crate::TransmutationError::InvalidOptions(
"DOCX conversion requires office feature".to_string(),
));
}
}
OutputFormat::Image {
format: _format,
quality: _quality,
dpi: _dpi,
metadata function · rust · L434-L442 (9 LOC)src/converters/docx.rs
/// Describes this converter: display name, crate version (taken from
/// `CARGO_PKG_VERSION` at compile time), a short summary, and its list of
/// external dependencies.
fn metadata(&self) -> ConverterMetadata {
    let name = String::from("DOCX Converter");
    let version = String::from(env!("CARGO_PKG_VERSION"));
    let description = String::from("Pure Rust DOCX to Markdown converter");
    let external_deps = vec![String::from("docx-rs")];
    ConverterMetadata {
        name,
        version,
        description,
        external_deps,
    }
}test_docx_converter_metadata function · rust · L456-L461 (6 LOC)src/converters/docx.rs
// Checks that the DOCX converter reports its expected name and lists
// "docx-rs" among its external dependencies.
fn test_docx_converter_metadata() {
    let info = DocxConverter::new().metadata();
    assert_eq!(info.name, "DOCX Converter");
    let lists_docx_rs = info
        .external_deps
        .iter()
        .any(|dep| dep.as_str() == "docx-rs");
    assert!(lists_docx_rs);
}html_to_markdown function · rust · L35-L68 (34 LOC)src/converters/html.rs
fn html_to_markdown(&self, html: &str) -> Result<String> {
use scraper::{Html, Selector};
let document = Html::parse_document(html);
let mut markdown = String::new();
// Extract title
if let Ok(selector) = Selector::parse("title") {
if let Some(title) = document.select(&selector).next() {
markdown.push_str(&format!("# {}\n\n", title.inner_html().trim()));
}
}
// Extract main content (try multiple selectors)
let content_selectors = vec!["main", "article", "body"];
let mut content_found = false;
for sel_str in content_selectors {
if let Ok(selector) = Selector::parse(sel_str) {
if let Some(main_content) = document.select(&selector).next() {
markdown.push_str(&self.process_element(&main_content));
content_found = true;
break;
}
}
}
process_element function · rust · L71-L148 (78 LOC)src/converters/html.rs
fn process_element(&self, element: &scraper::ElementRef) -> String {
use scraper::Node;
let mut result = String::new();
for child in element.children() {
match child.value() {
Node::Text(text) => {
let text_content = text.trim();
if !text_content.is_empty() {
result.push_str(text_content);
result.push(' ');
}
}
Node::Element(elem) => {
let tag_name = elem.name();
// Create ElementRef for child
if let Some(child_elem) = scraper::ElementRef::wrap(child) {
match tag_name {
"h1" => result
.push_str(&format!("# {}\n\n", child_elem.inner_html().trim())),
"h2" => result
.push_str(&format!("## {}\nprocess_list function · rust · L151-L172 (22 LOC)src/converters/html.rs
/// Renders every `li` matched under `element` as one Markdown list line.
///
/// Ordered lists get `1. `, `2. `, ... markers numbered by match order;
/// unordered lists get `- `. Each item's text is concatenated and trimmed.
/// Returns an empty string if the `li` selector cannot be parsed.
///
/// NOTE(review): `select` appears to match all descendants, so items of
/// nested lists would be emitted (and numbered) in the same flat sequence —
/// confirm whether that is intended.
fn process_list(&self, element: &scraper::ElementRef, ordered: bool) -> String {
    use scraper::Selector;
    let mut lines = String::new();
    let item_selector = match Selector::parse("li") {
        Ok(selector) => selector,
        Err(_) => return lines,
    };
    for (position, item) in element.select(&item_selector).enumerate() {
        let bullet = match ordered {
            true => format!("{}. ", position + 1),
            false => "- ".to_string(),
        };
        let item_text: String = item.text().collect();
        lines.push_str(&format!("{}{}\n", bullet, item_text.trim()));
    }
    lines
}output_formats function · rust · L186-L198 (13 LOC)src/converters/html.rs
/// Lists the output formats this converter advertises: unsplit,
/// LLM-optimized Markdown, and structured JSON with metadata included.
fn output_formats(&self) -> Vec<OutputFormat> {
    let markdown = OutputFormat::Markdown {
        split_pages: false,
        optimize_for_llm: true,
    };
    let json = OutputFormat::Json {
        structured: true,
        include_metadata: true,
    };
    vec![markdown, json]
}Repobility · code-quality intelligence platform · https://repobility.com
convert function · rust · L199-L278 (80 LOC)src/converters/html.rs
async fn convert(
&self,
input: &Path,
output_format: OutputFormat,
_options: ConversionOptions,
) -> Result<ConversionResult> {
eprintln!("🔄 HTML Conversion (Pure Rust)");
eprintln!(" HTML → Semantic Parsing → {:?}", output_format);
eprintln!();
// Read HTML file
let html_content = fs::read_to_string(input).await?;
// Convert to requested format
let output_data = match output_format {
OutputFormat::Markdown { .. } => {
eprintln!("📝 Converting to Markdown...");
let markdown = self.html_to_markdown(&html_content)?;
markdown.into_bytes()
}
OutputFormat::Json { .. } => {
eprintln!("📝 Converting to JSON...");
// Simple JSON with raw HTML and extracted text
let markdown = self.html_to_markdown(&html_content)?;
let json = serde_json::json!({
metadata function · rust · L279-L288 (10 LOC)src/converters/html.rs
/// Describes this converter: display name, crate version (from
/// `CARGO_PKG_VERSION`), a short summary, and an empty external-dependency list.
fn metadata(&self) -> ConverterMetadata {
    let name = String::from("HTML Converter");
    let version = String::from(env!("CARGO_PKG_VERSION"));
    let description =
        String::from("HTML to Markdown converter using semantic parsing (pure Rust)");
    ConverterMetadata {
        name,
        version,
        description,
        external_deps: Vec::new(),
    }
}test_html_to_markdown_basic function · rust · L302-L310 (9 LOC)src/converters/html.rs
// Converts a minimal heading + paragraph snippet and checks that both show
// up in the generated Markdown.
fn test_html_to_markdown_basic() {
    let source = "<h1>Title</h1><p>Paragraph</p>";
    let outcome = HtmlConverter::new().html_to_markdown(source);
    assert!(outcome.is_ok());
    let markdown = outcome.unwrap();
    for expected in ["# Title", "Paragraph"].iter() {
        assert!(markdown.contains(*expected));
    }
}test_html_converter_metadata function · rust · L313-L318 (6 LOC)src/converters/html.rs
// Checks the HTML converter's reported name and that it declares no
// external dependencies.
fn test_html_converter_metadata() {
    let converter = HtmlConverter::new();
    let meta = converter.metadata();
    assert_eq!(meta.name, "HTML Converter");
    // The earlier check (`!is_empty() || is_empty()`) was always true and
    // verified nothing; the HTML converter's metadata sets an empty
    // `external_deps`, so assert exactly that.
    assert!(
        meta.external_deps.is_empty(),
        "HTML converter should report no external dependencies"
    );
}new function · rust · L31-L36 (6 LOC)src/converters/image.rs
/// Creates the converter. When the `tesseract` feature is enabled, the
/// `ocr_engine` field is set to `"tesseract"`.
pub fn new() -> Self {
    Self {
        #[cfg(feature = "tesseract")]
        ocr_engine: Some(String::from("tesseract")),
    }
}ocr_image function · rust · L40-L62 (23 LOC)src/converters/image.rs
/// Runs Tesseract (via `leptess`) over the image at `image_path` using the
/// given `language` and returns the recognized UTF-8 text.
///
/// # Errors
/// Returns a conversion-failed error if Tesseract cannot be initialized,
/// the image cannot be set, or text extraction fails.
async fn ocr_image(&self, image_path: &Path, language: &str) -> Result<String> {
    use leptess::LepTess;
    // Every failure below is reported through the same error constructor.
    let fail = |message: String| crate::TransmutationError::conversion_failed(&message);

    // Initialize Tesseract
    let mut engine = LepTess::new(None, language)
        .map_err(|e| fail(format!("Failed to initialize Tesseract: {}", e)))?;
    // Set image
    engine
        .set_image(image_path)
        .map_err(|e| fail(format!("Failed to set image: {}", e)))?;
    // Get text
    let recognized = engine
        .get_utf8_text()
        .map_err(|e| fail(format!("OCR failed: {}", e)))?;
    Ok(recognized)
}image_to_markdown function · rust · L65-L90 (26 LOC)src/converters/image.rs
/// OCRs the image and wraps the text in a Markdown document.
///
/// With the `tesseract` feature: emits a `# OCR Result` heading followed by
/// each non-empty, trimmed block of the OCR text (blocks are separated by
/// blank lines in the OCR output). Without the feature: always returns a
/// conversion-failed error.
async fn image_to_markdown(&self, image_path: &Path, language: &str) -> Result<String> {
    #[cfg(feature = "tesseract")]
    {
        let recognized = self.ocr_image(image_path, language).await?;
        let mut markdown = String::from("# OCR Result\n\n");
        // Add paragraphs
        let paragraphs = recognized
            .split("\n\n")
            .map(str::trim)
            .filter(|block| !block.is_empty());
        for paragraph in paragraphs {
            markdown.push_str(paragraph);
            markdown.push_str("\n\n");
        }
        Ok(markdown)
    }
    #[cfg(not(feature = "tesseract"))]
    {
        Err(crate::TransmutationError::conversion_failed(
            "OCR feature not enabled. Compile with --features tesseract",
        ))
    }
}supported_formats function · rust · L101-L110 (10 LOC)src/converters/image.rs
/// Lists the input image formats this converter accepts:
/// JPEG, PNG, TIFF, BMP, GIF and WebP.
fn supported_formats(&self) -> Vec<FileFormat> {
    Vec::from([
        FileFormat::Jpeg,
        FileFormat::Png,
        FileFormat::Tiff,
        FileFormat::Bmp,
        FileFormat::Gif,
        FileFormat::Webp,
    ])
}All rows above produced by Repobility · https://repobility.com
output_formats function · rust · L111-L123 (13 LOC)src/converters/image.rs
/// Lists the output formats this converter advertises: unsplit,
/// LLM-optimized Markdown, and structured JSON with metadata included.
fn output_formats(&self) -> Vec<OutputFormat> {
    let markdown = OutputFormat::Markdown {
        split_pages: false,
        optimize_for_llm: true,
    };
    let json = OutputFormat::Json {
        structured: true,
        include_metadata: true,
    };
    vec![markdown, json]
}convert function · rust · L124-L213 (90 LOC)src/converters/image.rs
async fn convert(
&self,
input: &Path,
output_format: OutputFormat,
_options: ConversionOptions,
) -> Result<ConversionResult> {
eprintln!("🔄 Image OCR (Tesseract)");
eprintln!(" Image → OCR → {:?}", output_format);
eprintln!();
let language = "eng"; // Default to English (can be made configurable later)
#[cfg(feature = "tesseract")]
{
eprintln!("📸 Running OCR (language: {})...", language);
// Convert image to text
let markdown = self.image_to_markdown(input, language).await?;
// Convert to requested format
let output_data = match output_format {
OutputFormat::Markdown { .. } => {
eprintln!("📝 Markdown generated!");
markdown.into_bytes()
}
OutputFormat::Json { .. } => {
eprintln!("📝 Converting to JSON...");
metadata function · rust · L214-L222 (9 LOC)src/converters/image.rs
/// Describes this converter: display name, crate version (from
/// `CARGO_PKG_VERSION`), a short summary, and `tesseract` as its single
/// external dependency.
fn metadata(&self) -> ConverterMetadata {
    let name = String::from("Image OCR Converter");
    let version = String::from(env!("CARGO_PKG_VERSION"));
    let description = String::from("Image to text converter using Tesseract OCR");
    let external_deps = vec![String::from("tesseract")];
    ConverterMetadata {
        name,
        version,
        description,
        external_deps,
    }
}extract_text_from_xml function · rust · L37-L115 (79 LOC)src/converters/odt.rs
fn extract_text_from_xml(&self, xml: &str) -> String {
let mut markdown = String::new();
markdown.push_str("# Document\n\n");
let mut reader = Reader::from_str(xml);
reader.config_mut().trim_text(true);
let mut buf = Vec::new();
let mut in_paragraph = false;
let mut in_heading = false;
let mut heading_level = 1;
let mut current_text = String::new();
loop {
match reader.read_event_into(&mut buf) {
Ok(Event::Start(e)) => {
let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
if name == "text:p" {
in_paragraph = true;
current_text.clear();
} else if name == "text:h" {
in_heading = true;
// Try to get outline-level attribute
for attr in e.attributes() {
if lodt_to_markdown function · rust · L118-L136 (19 LOC)src/converters/odt.rs
/// Reads the ODT file at `odt_path`, pulls `content.xml` out of its ZIP
/// container and converts that XML to Markdown.
///
/// If the archive has no `content.xml` entry, an `Ok` value holding a
/// Markdown error notice is returned instead of an `Err`.
///
/// # Errors
/// Propagates failures from reading the file, opening the ZIP archive, or
/// reading `content.xml` as a string.
async fn odt_to_markdown(&self, odt_path: &Path) -> Result<String> {
    // Read ODT file (it's a ZIP) fully into memory
    let bytes = fs::read(odt_path).await?;
    let mut archive = ZipArchive::new(Cursor::new(bytes))?;
    // Extract content.xml; the entry handle is scoped so it is released
    // before the XML is processed.
    let content_xml = {
        let mut entry = match archive.by_name("content.xml") {
            Ok(entry) => entry,
            Err(_) => {
                return Ok("# Error\n\n*Could not find content.xml in ODT file*\n".to_string());
            }
        };
        let mut xml = String::new();
        entry.read_to_string(&mut xml)?;
        xml
    };
    Ok(self.extract_text_from_xml(&content_xml))
}output_formats function · rust · L150-L162 (13 LOC)src/converters/odt.rs
/// Lists the output formats this converter advertises: unsplit,
/// LLM-optimized Markdown, and structured JSON with metadata included.
fn output_formats(&self) -> Vec<OutputFormat> {
    let markdown = OutputFormat::Markdown {
        split_pages: false,
        optimize_for_llm: true,
    };
    let json = OutputFormat::Json {
        structured: true,
        include_metadata: true,
    };
    vec![markdown, json]
}convert function · rust · L163-L238 (76 LOC)src/converters/odt.rs
async fn convert(
&self,
input: &Path,
output_format: OutputFormat,
_options: ConversionOptions,
) -> Result<ConversionResult> {
eprintln!("🔄 ODT Conversion (Pure Rust)");
eprintln!(" ODT → ZIP → XML → {:?}", output_format);
eprintln!();
// Convert ODT to Markdown
let markdown = self.odt_to_markdown(input).await?;
// Convert to requested format
let output_data = match output_format {
OutputFormat::Markdown { .. } => {
eprintln!("📝 Markdown extracted!");
markdown.into_bytes()
}
OutputFormat::Json { .. } => {
eprintln!("📝 Converting to JSON...");
let json = serde_json::json!({
"text": {
"content": markdown,
"format": "odt",
}
});
serde_json::to_string_pretty(&json)?.into_bmetadata function · rust · L239-L247 (9 LOC)src/converters/odt.rs
/// Describes this converter: display name, crate version (from
/// `CARGO_PKG_VERSION`), a short summary, and an empty external-dependency list.
fn metadata(&self) -> ConverterMetadata {
    let name = String::from("ODT Converter");
    let version = String::from(env!("CARGO_PKG_VERSION"));
    let description = String::from("ODT to Markdown converter (pure Rust, ZIP + XML parsing)");
    ConverterMetadata {
        name,
        version,
        description,
        external_deps: Vec::new(),
    }
}Powered by Repobility — scan your code at https://repobility.com
new function · rust · L55-L73 (19 LOC)src/converters/pdf.rs
fn new() -> Self {
Self {
sentence_break: Regex::new(r"([.!?]) ([A-Z])").unwrap(),
section_pattern: Regex::new(
r"\b(Abstract|Introduction|Background|Methods|Results|Discussion|Conclusion|References)([A-Z][a-z]+)"
).unwrap(),
title_author_pattern: Regex::new(
r"([A-Z][a-z]+ [A-Z][a-z]+(?: [A-Z][a-z]+)+)([A-Z][a-z]+ [A-Z]\.|[A-Z][a-z]+ [A-Z][a-z]+)"
).unwrap(),
page_number_figure: Regex::new(r"(\d+)(Figure|Table)").unwrap(),
math_var_number: Regex::new(r"\b([a-z])([0-9])\b").unwrap(),
math_var_letter: Regex::new(r"\b([a-z])([a-z])\b").unwrap(),
func_paren: Regex::new(r"([a-zA-Z])\(([a-z])").unwrap(),
plus_capital: Regex::new(r"([a-z])\+([A-Z])").unwrap(),
letter_symbol: Regex::new(r"([a-zA-Z])([∗†‡])").unwrap(),
symbol_capital: Regex::new(r"([∗†‡])([A-Z])").unwrap(),
single_letter_pair: Regbreak_long_text_into_paragraphs function · rust · L100-L124 (25 LOC)src/converters/pdf.rs
/// Inserts paragraph breaks into a long run of text and returns the
/// trimmed result.
///
/// Three generic rules are applied in order:
/// 1. every match of the cached `sentence_break` regex is rewritten as
///    `$1\n\n$2` (a blank line between its two captures);
/// 2. inline heading markers (` ## `, then ` # `) get a blank line in front;
/// 3. any run of three or more newlines is collapsed to exactly two.
///
/// Rule 3 previously ran at most twice, which still left `\n\n\n` in the
/// output for runs of five or more newlines; it now repeats until none
/// remain. The loop terminates because every pass shortens the string.
fn break_long_text_into_paragraphs(text: &str) -> String {
    let cache = regex_cache();
    // GENERIC RULE 1: Add line breaks after sentences
    let with_sentence_breaks = cache.sentence_break.replace_all(text, "$1\n\n$2");
    // GENERIC RULE 2: Add line breaks before headings.
    // `## ` is handled before `# ` exactly as before, so the order of the
    // two replacements is unchanged.
    let mut result = with_sentence_breaks
        .replace(" ## ", "\n\n## ")
        .replace(" # ", "\n\n# ");
    // GENERIC RULE 3: Clean up excessive newlines until fully collapsed
    while result.contains("\n\n\n") {
        result = result.replace("\n\n\n", "\n\n");
    }
    result.trim().to_string()
}convert_with_docling_style function · rust · L531-L587 (57 LOC)src/converters/pdf.rs
async fn convert_with_docling_style(
&self,
path: &Path,
options: &ConversionOptions,
) -> Result<Vec<ConversionOutput>> {
// Try docling-parse FFI first if enabled and use_ffi flag is set
#[cfg(feature = "docling-ffi")]
if options.use_ffi {
match self.convert_with_docling_ffi(path).await {
Ok(result) => return Ok(result),
Err(e) => {
eprintln!("⚠️ FFI conversion failed: {}", e);
eprintln!(" Falling back to Precision mode...");
// Fall through to precision mode
}
}
}
// Check if split_pages is enabled - if so, we need page info
if options.split_pages {
let parser = PdfParser::load(path)?;
let pages = parser.extract_all_pages()?;
eprintln!(
"📄 Splitting into {} individual pages (precision mode)",
pages.len()convert_to_images function · rust · L592-L690 (99 LOC)src/converters/pdf.rs
async fn convert_to_images(
&self,
path: &Path,
format: crate::types::ImageFormat,
_quality: u8,
dpi: u32,
_options: &ConversionOptions,
) -> Result<Vec<ConversionOutput>> {
use std::process::Command;
use tokio::fs;
eprintln!(
"🖼️ Rendering PDF to images (DPI: {}, Format: {:?})...",
dpi, format
);
eprintln!(" Using pdftoppm command-line tool...");
// Create temporary directory for images
let temp_dir = std::env::temp_dir().join(format!("transmutation_{}", std::process::id()));
fs::create_dir_all(&temp_dir).await?;
// Determine format flag for pdftoppm
let format_flag = match format {
crate::types::ImageFormat::Png => "png",
crate::types::ImageFormat::Jpeg => "jpeg",
crate::types::ImageFormat::Webp => "png", // pdftoppm doesn't support webp, use png
};
// Cross-platform convert_pages_individually function · rust · L696-L763 (68 LOC)src/converters/pdf.rs
async fn convert_pages_individually(
&self,
path: &Path,
pages: &[PdfPage],
_options: &ConversionOptions,
) -> Result<Vec<ConversionOutput>> {
use pdf_extract::extract_text_from_mem;
// Pre-allocate output vector with known size
let mut outputs = Vec::with_capacity(pages.len());
// Load PDF bytes once
let pdf_bytes = tokio::fs::read(path).await?;
// Extract text ONCE for the entire document (major memory optimization)
let full_text = extract_text_from_mem(&pdf_bytes).map_err(|e| {
crate::TransmutationError::engine_error(
"PDF Parser",
format!("pdf-extract failed: {:?}", e),
)
})?;
// Drop PDF bytes immediately to free memory
drop(pdf_bytes);
// Split by page markers (pdf-extract adds \f between pages)
let page_texts: Vec<&str> = full_text.split('\x0C').collect();
for (page_idx, page)convert_with_docling_ffi function · rust · L767-L870 (104 LOC)src/converters/pdf.rs
async fn convert_with_docling_ffi(&self, path: &Path) -> Result<Vec<ConversionOutput>> {
use crate::document::{
DoclingJsonParser, HierarchyBuilder, MarkdownSerializer, PageAssembler,
PageAssemblerOptions,
};
use crate::engines::docling_parse_ffi::DoclingParseEngine;
eprintln!("┌─────────────────────────────────────────┐");
eprintln!("│ 🚀 Docling FFI Pipeline (Full) │");
eprintln!("└─────────────────────────────────────────┘");
// Step 1: Extract cells from PDF via C++ FFI
eprintln!("\n[1/5] 📄 Extracting PDF cells via docling-parse FFI...");
let engine = DoclingParseEngine::open(path)?;
let json_output = engine.export_markdown()?; // Returns JSON with cells
eprintln!(" ✓ JSON size: {} KB", json_output.len() / 1024);
// Step 2: Parse JSON to normalized pages with cells
eprintln!("\n[2/5] 🔍 Parsing JSON structure...");
let doc = Doclijoin_paragraph_lines_enhanced function · rust · L875-L937 (63 LOC)src/converters/pdf.rs
fn join_paragraph_lines_enhanced(text: &str) -> String {
let cache = regex_cache();
// CRITICAL FIX: Remove unwanted spaces that pdf-extract introduces
// "i s" -> "is", "o n" -> "on", "t o" -> "to", "o f" -> "of", "a n" -> "an", etc.
// Pre-allocate with estimated capacity
let mut cleaned = String::with_capacity(text.len());
cleaned.push_str(text);
// Fix common two-letter words that got split
// Using static array to avoid allocation
const WORD_FIXES: [(&str, &str); 19] = [
(" i s ", " is "),
(" i n ", " in "),
(" o n ", " on "),
(" t o ", " to "),
(" o f ", " of "),
(" a n ", " an "),
(" a s ", " as "),
(" a t ", " at "),
(" b y ", " by "),
(" o r ", " or "),
(" w e ", " we "),
(" i t ", " it "),
(" b e ", " be "),
("o f ", "of "),
docling_style_markdown_from_blocks function · rust · L941-L1079 (139 LOC)src/converters/pdf.rs
fn docling_style_markdown_from_blocks(
blocks: &[crate::engines::pdf_parser::TextBlock],
_page_width: f32,
_page_height: f32,
) -> String {
if blocks.is_empty() {
return String::new();
}
// Step 1: Sort by reading order (top to bottom, then left to right)
let mut sorted_blocks = blocks.to_vec();
sorted_blocks.sort_by(|a, b| {
// Sort by Y (top to bottom - higher Y first in PDF coords), then X (left to right)
let y_cmp = b.y.partial_cmp(&a.y).unwrap_or(std::cmp::Ordering::Equal);
if y_cmp == std::cmp::Ordering::Equal {
a.x.partial_cmp(&b.x).unwrap_or(std::cmp::Ordering::Equal)
} else {
y_cmp
}
});
// Step 2: Calculate average font size for body text
let font_sizes: Vec<f32> = sorted_blocks.iter().map(|b| b.font_size).collect();
let avg_font_size = if !font_sizes.is_empty() {
Same scanner, your repo: https://repobility.com — Repobility
convert_to_markdown_pdf_extract function · rust · L1082-L1153 (72 LOC)src/converters/pdf.rs
async fn convert_to_markdown_pdf_extract(
&self,
path: &Path,
options: &ConversionOptions,
) -> Result<Vec<ConversionOutput>> {
use pdf_extract::extract_text;
if options.split_pages {
// For split pages: extract each PDF page individually using lopdf
// This accurately reflects the actual PDF page boundaries
let parser = PdfParser::load(path)?;
let pages = parser.extract_all_pages()?;
// Process each physical PDF page
let outputs: Vec<ConversionOutput> = pages
.iter()
.enumerate()
.map(|(i, page)| {
// lopdf returns text with few line breaks, need to add them
let page_markdown = if page.text.lines().count() > 20 {
// If text has many lines, use join algorithm (like pdf-extract)
Self::join_paragraph_lines(&page.text)
convert_to_markdown function · rust · L1156-L1262 (107 LOC)src/converters/pdf.rs
async fn convert_to_markdown(
&self,
parser: &PdfParser,
options: &ConversionOptions,
) -> Result<Vec<ConversionOutput>> {
let pages = parser.extract_all_pages()?;
// Use layout analysis if text blocks are available
let analyzer = LayoutAnalyzer::new();
let markdown_outputs: Vec<String> = if options.split_pages {
// Generate separate markdown for each page
pages
.iter()
.map(|page| {
if !page.text_blocks.is_empty() {
// Use semantic layout analysis
let analyzed = analyzer.analyze(&page.text_blocks);
MarkdownGenerator::from_analyzed_blocks(&analyzed, options.clone())
} else {
// Fallback to simple text extraction
let text = if options.optimize_for_llm {
self.text_optimizer.optimize(&convert_to_json function · rust · L1265-L1311 (47 LOC)src/converters/pdf.rs
async fn convert_to_json(
&self,
parser: &PdfParser,
options: &ConversionOptions,
) -> Result<Vec<ConversionOutput>> {
let pages = parser.extract_all_pages()?;
let metadata = parser.get_metadata();
// Create JSON structure
let json_data = serde_json::json!({
"format": "pdf",
"metadata": {
"title": metadata.title,
"author": metadata.author,
"created": metadata.created,
"modified": metadata.modified,
"page_count": metadata.page_count,
},
"pages": pages.iter().map(|page| serde_json::json!({
"number": page.number,
"text": if options.optimize_for_llm {
self.text_optimizer.optimize(&page.text)
} else {
page.text.clone()
},
"width": page.width,
"height": page.height,
build_metadata function · rust · L1314-L1326 (13 LOC)src/converters/pdf.rs
/// Builds a `DocumentMetadata` from the parser's PDF metadata, copying
/// title, author, timestamps and page count. `language` is left unset and
/// `custom` starts out empty.
fn build_metadata(&self, parser: &PdfParser) -> DocumentMetadata {
    let source = parser.get_metadata();
    let custom = std::collections::HashMap::new();
    DocumentMetadata {
        title: source.title,
        author: source.author,
        created: source.created,
        modified: source.modified,
        page_count: source.page_count,
        language: None, // TODO: Implement language detection
        custom,
    }
}output_formats function · rust · L1340-L1352 (13 LOC)src/converters/pdf.rs
/// Lists the output formats this converter advertises: unsplit,
/// LLM-optimized Markdown, and structured JSON with metadata included.
fn output_formats(&self) -> Vec<OutputFormat> {
    let markdown = OutputFormat::Markdown {
        split_pages: false,
        optimize_for_llm: true,
    };
    let json = OutputFormat::Json {
        structured: true,
        include_metadata: true,
    };
    vec![markdown, json]
}convert function · rust · L1353-L1445 (93 LOC)src/converters/pdf.rs
async fn convert(
&self,
input: &Path,
output_format: OutputFormat,
options: ConversionOptions,
) -> Result<ConversionResult> {
let start_time = Instant::now();
// Load PDF
let parser = PdfParser::load(input)?;
// Get input file size
let input_size = tokio::fs::metadata(input).await?.len();
// Convert based on output format
let content = match output_format {
OutputFormat::Markdown { .. } => {
// Use pdf-extract for best quality
if options.use_precision_mode || options.use_ffi {
// High-precision mode: Docling-style layout analysis for ~95% similarity
// Also used for FFI mode which tries docling-parse C++ first
self.convert_with_docling_style(input, &options).await?
} else {
// Fast mode: Pure Rust heuristics, ~81% similarity, much faster
metadata function · rust · L1446-L1454 (9 LOC)src/converters/pdf.rs
/// Describes this converter: display name, crate version (from
/// `CARGO_PKG_VERSION`), a short summary, and `lopdf` as its single
/// external dependency.
fn metadata(&self) -> ConverterMetadata {
    let name = String::from("PDF Converter");
    let version = String::from(env!("CARGO_PKG_VERSION"));
    let description = String::from("Pure Rust PDF to Markdown/JSON converter using lopdf");
    let external_deps = vec![String::from("lopdf")];
    ConverterMetadata {
        name,
        version,
        description,
        external_deps,
    }
}test_pdf_converter_metadata function · rust · L1468-L1473 (6 LOC)src/converters/pdf.rs
// Checks the PDF converter's reported name and that it declares at least
// one external dependency.
fn test_pdf_converter_metadata() {
    let info = PdfConverter::new().metadata();
    assert_eq!(info.name, "PDF Converter");
    let has_dependencies = !info.external_deps.is_empty();
    assert!(has_dependencies);
}Repobility · code-quality intelligence platform · https://repobility.com
test_join_paragraph_lines_utf8_boundary function · rust · L1476-L1493 (18 LOC)src/converters/pdf.rs
// Panic-regression test: German text with a 2-byte umlaut placed shortly
// after byte 500 of the input (495 ASCII bytes of padding, then
// "Elementare Gefährdungen", so 'ä' occupies bytes 509-510).
// Presumably `join_paragraph_lines` slices near byte 500 — the test only
// requires that the call returns without panicking.
fn test_join_paragraph_lines_utf8_boundary() {
    let mut input = "A".repeat(495); // 495 ASCII chars = 495 bytes
    input.push_str("Elementare Gefährdungen"); // Contains ä (2 bytes)
    input.push_str(" more text here for testing purposes");
    let joined = PdfConverter::join_paragraph_lines(&input);
    // The earlier three-way `||` (contains "Gefährdungen" / contains "Gef" /
    // non-empty) is logically the same as this single non-empty check,
    // because either `contains` already implies a non-empty result.
    assert!(!joined.is_empty());
}test_join_paragraph_lines_multibyte_at_boundary function · rust · L1496-L1511 (16 LOC)src/converters/pdf.rs
// Panic-regression test: a 3-byte Chinese character straddles byte 500.
// With 498 bytes of ASCII padding the first character occupies bytes
// 498-500, so byte offset 500 is not a char boundary.
fn test_join_paragraph_lines_multibyte_at_boundary() {
    let mut input = "x".repeat(498); // 498 bytes
    input.push_str("中文测试"); // 4 Chinese chars = 12 bytes
    input.push_str(" end");
    assert!(input.len() > 500);
    // Should not panic
    let joined = PdfConverter::join_paragraph_lines(&input);
    assert!(!joined.is_empty());
}test_join_paragraph_lines_emoji_at_boundary function · rust · L1514-L1526 (13 LOC)src/converters/pdf.rs
// Panic-regression test: a 4-byte emoji straddles byte 500.
// With 497 bytes of ASCII padding the first emoji occupies bytes 497-500.
fn test_join_paragraph_lines_emoji_at_boundary() {
    let mut input = "y".repeat(497); // 497 bytes
    input.push_str("🎉🎊🎈"); // 3 emojis = 12 bytes
    input.push_str(" celebration");
    assert!(input.len() > 500);
    // Should not panic
    let joined = PdfConverter::join_paragraph_lines(&input);
    assert!(!joined.is_empty());
}test_join_paragraph_lines_short_text function · rust · L1529-L1535 (7 LOC)src/converters/pdf.rs
// Input far shorter than 500 bytes, mixing accented letters and an emoji,
// must be processed and yield a non-empty result.
fn test_join_paragraph_lines_short_text() {
    let joined = PdfConverter::join_paragraph_lines("Short text with Ümläuts and émojis 🎉");
    assert!(!joined.is_empty());
}test_join_paragraph_lines_exactly_500_ascii function · rust · L1538-L1544 (7 LOC)src/converters/pdf.rs
// Input of exactly 500 single-byte ASCII characters must be processed and
// yield a non-empty result.
fn test_join_paragraph_lines_exactly_500_ascii() {
    let five_hundred_bytes = "a".repeat(500);
    let joined = PdfConverter::join_paragraph_lines(&five_hundred_bytes);
    assert!(!joined.is_empty());
}test_join_paragraph_lines_cyrillic_text function · rust · L1547-L1558 (12 LOC)src/converters/pdf.rs
// Panic-regression test: 2-byte Cyrillic characters start at byte 499, so
// the first one occupies bytes 499-500.
fn test_join_paragraph_lines_cyrillic_text() {
    let mut input = "z".repeat(499);
    input.push_str("Привет мир"); // Russian "Hello world"
    input.push_str(" end");
    // Should not panic
    let joined = PdfConverter::join_paragraph_lines(&input);
    assert!(!joined.is_empty());
}test_join_paragraph_lines_mixed_scripts function · rust · L1561-L1571 (11 LOC)src/converters/pdf.rs
// Panic-regression test: 450 ASCII bytes followed by text mixing scripts of
// different UTF-8 widths, so the input extends past byte 500 with
// multibyte characters on both sides of it.
fn test_join_paragraph_lines_mixed_scripts() {
    let mut input = "x".repeat(450);
    input.push_str("Latin äöü Ελληνικά 日本語 한국어 العربية 🌍🌎🌏");
    // Should not panic regardless of where the 500-byte boundary falls
    let joined = PdfConverter::join_paragraph_lines(&input);
    assert!(!joined.is_empty());
}extract_text_from_pptx function · rust · L40-L84 (45 LOC)src/converters/pptx.rs
fn extract_text_from_pptx(&self, path: &Path) -> Result<Vec<String>> {
use std::fs::File;
use zip::ZipArchive;
eprintln!("📝 Extracting text from PPTX (Direct XML parsing)...");
let file = File::open(path)?;
let mut archive = ZipArchive::new(file).map_err(|e| {
crate::TransmutationError::engine_error(
"zip",
format!("Failed to open PPTX as ZIP: {}", e),
)
})?;
let mut slides = Vec::new();
// Find all slide XML files
for i in 0..archive.len() {
let mut file = archive.by_index(i).map_err(|e| {
crate::TransmutationError::engine_error(
"zip",
format!("Failed to read file from PPTX: {}", e),
)
})?;
let name = file.name().to_string();
// Process slide files: ppt/slides/slide*.xml
if name.starts_with("ppt/slides/slide") && name.eAll rows above produced by Repobility · https://repobility.com
extract_text_from_xml function · rust · L87-L115 (29 LOC)src/converters/pptx.rs
/// Collects every non-empty text node from `xml` and joins them with single
/// spaces.
///
/// Text nodes that fail to unescape are skipped. Parsing stops at end of
/// input or at the first reader error; whatever was gathered up to that
/// point is still returned.
fn extract_text_from_xml(&self, xml: &str) -> String {
    use quick_xml::Reader;
    use quick_xml::events::Event;
    let mut reader = Reader::from_str(xml);
    reader.config_mut().trim_text(true);
    let mut fragments: Vec<String> = Vec::new();
    let mut buf = Vec::new();
    loop {
        match reader.read_event_into(&mut buf) {
            // End of document and parse errors both end the scan.
            Ok(Event::Eof) | Err(_) => break,
            Ok(Event::Text(raw)) => {
                if let Ok(decoded) = raw.unescape() {
                    let piece = decoded.trim();
                    if !piece.is_empty() {
                        fragments.push(piece.to_string());
                    }
                }
            }
            Ok(_) => {}
        }
        buf.clear();
    }
    fragments.join(" ")
}output_formats function · rust · L201-L214 (14 LOC)src/converters/pptx.rs
/// Lists the output formats this converter advertises: LLM-optimized
/// Markdown split per slide, and PNG images at quality 85 / 150 DPI.
fn output_formats(&self) -> Vec<OutputFormat> {
    let markdown = OutputFormat::Markdown {
        split_pages: true, // Split by slide
        optimize_for_llm: true,
    };
    let image = OutputFormat::Image {
        format: crate::types::ImageFormat::Png,
        quality: 85,
        dpi: 150,
    };
    vec![markdown, image]
}convert function · rust · L215-L348 (134 LOC)src/converters/pptx.rs
async fn convert(
&self,
input: &Path,
output_format: OutputFormat,
options: ConversionOptions,
) -> Result<ConversionResult> {
// For images, use LibreOffice → PDF → Images
// For Markdown, use direct XML parsing for better quality
match output_format {
OutputFormat::Image { .. } => {
eprintln!("🔄 PPTX → Images Pipeline");
eprintln!(" PPTX → PDF → Images (via LibreOffice)");
eprintln!();
// Use PDF pipeline for images
let pdf_path = self.pptx_to_pdf(input).await?;
let result = self
.pdf_converter
.convert(&pdf_path, output_format, options)
.await?;
// Cleanup
let temp_dir = pdf_path.parent().unwrap();
let _ = fs::remove_dir_all(temp_dir).await;
eprintln!(
"✅ PPTX → Ima