Function bodies 424 total

Name: Aljefra Mapper analysis
Creator: Repobility
License: https://repobility.com/legal/terms/

convert function · rust · L347-L433 (87 LOC)

src/converters/docx.rs

    async fn convert(
        &self,
        input: &Path,
        output_format: OutputFormat,
        options: ConversionOptions,
    ) -> Result<ConversionResult> {
        let start_time = Instant::now();

        // Get input file size
        let input_size = tokio::fs::metadata(input).await?.len();

        // Convert based on output format
        let content = match output_format {
            OutputFormat::Markdown { .. } => {
                #[cfg(feature = "office")]
                {
                    self.convert_to_markdown(input, &options).await?
                }
                #[cfg(not(feature = "office"))]
                {
                    return Err(crate::TransmutationError::InvalidOptions(
                        "DOCX conversion requires office feature".to_string(),
                    ));
                }
            }
            OutputFormat::Image {
                format: _format,
                quality: _quality,
                dpi: _dpi,

metadata function · rust · L434-L442 (9 LOC)

src/converters/docx.rs

    /// Describes the DOCX converter: display name, crate version, a short
    /// summary, and the external crates it declares as dependencies.
    fn metadata(&self) -> ConverterMetadata {
        let name = String::from("DOCX Converter");
        // Version is taken from the crate manifest at compile time.
        let version = String::from(env!("CARGO_PKG_VERSION"));
        let description = String::from("Pure Rust DOCX to Markdown converter");
        let external_deps = vec![String::from("docx-rs")];

        ConverterMetadata {
            name,
            version,
            description,
            external_deps,
        }
    }

test_docx_converter_metadata function · rust · L456-L461 (6 LOC)

src/converters/docx.rs

    /// The DOCX converter must report its expected name and list `docx-rs`
    /// among its external dependencies.
    fn test_docx_converter_metadata() {
        let meta = DocxConverter::new().metadata();
        let expected_dep = String::from("docx-rs");

        assert_eq!(meta.name, "DOCX Converter");
        assert!(meta.external_deps.iter().any(|dep| *dep == expected_dep));
    }

html_to_markdown function · rust · L35-L68 (34 LOC)

src/converters/html.rs

    fn html_to_markdown(&self, html: &str) -> Result<String> {
        use scraper::{Html, Selector};

        let document = Html::parse_document(html);
        let mut markdown = String::new();

        // Extract title
        if let Ok(selector) = Selector::parse("title") {
            if let Some(title) = document.select(&selector).next() {
                markdown.push_str(&format!("# {}\n\n", title.inner_html().trim()));
            }
        }

        // Extract main content (try multiple selectors)
        let content_selectors = vec!["main", "article", "body"];
        let mut content_found = false;

        for sel_str in content_selectors {
            if let Ok(selector) = Selector::parse(sel_str) {
                if let Some(main_content) = document.select(&selector).next() {
                    markdown.push_str(&self.process_element(&main_content));
                    content_found = true;
                    break;
                }
            }
        }

process_element function · rust · L71-L148 (78 LOC)

src/converters/html.rs

    fn process_element(&self, element: &scraper::ElementRef) -> String {
        use scraper::Node;

        let mut result = String::new();

        for child in element.children() {
            match child.value() {
                Node::Text(text) => {
                    let text_content = text.trim();
                    if !text_content.is_empty() {
                        result.push_str(text_content);
                        result.push(' ');
                    }
                }
                Node::Element(elem) => {
                    let tag_name = elem.name();

                    // Create ElementRef for child
                    if let Some(child_elem) = scraper::ElementRef::wrap(child) {
                        match tag_name {
                            "h1" => result
                                .push_str(&format!("# {}\n\n", child_elem.inner_html().trim())),
                            "h2" => result
                                .push_str(&format!("## {}\n

process_list function · rust · L151-L172 (22 LOC)

src/converters/html.rs

    /// Renders every `li` element matched beneath `element` as one Markdown
    /// list line.
    ///
    /// When `ordered` is true the items are numbered from 1 in match order;
    /// otherwise each line starts with `- `. An item's text is the trimmed
    /// concatenation of its text nodes. If the `li` selector cannot be built,
    /// an empty string is returned.
    fn process_list(&self, element: &scraper::ElementRef, ordered: bool) -> String {
        use scraper::Selector;

        let li_selector = match Selector::parse("li") {
            Ok(parsed) => parsed,
            Err(_) => return String::new(),
        };

        element
            .select(&li_selector)
            .enumerate()
            .map(|(position, item)| {
                let marker = match ordered {
                    true => format!("{}. ", position + 1),
                    false => "- ".to_string(),
                };
                let item_text: String = item.text().collect();
                format!("{}{}\n", marker, item_text.trim())
            })
            .collect()
    }

output_formats function · rust · L186-L198 (13 LOC)

src/converters/html.rs

    /// Lists the output formats this converter advertises: single-document
    /// Markdown with the LLM-optimisation flag set, and structured JSON with
    /// metadata included.
    fn output_formats(&self) -> Vec<OutputFormat> {
        let markdown = OutputFormat::Markdown {
            split_pages: false,
            optimize_for_llm: true,
        };
        let json = OutputFormat::Json {
            structured: true,
            include_metadata: true,
        };

        vec![markdown, json]
    }

Repobility · code-quality intelligence platform · https://repobility.com

convert function · rust · L199-L278 (80 LOC)

src/converters/html.rs

    async fn convert(
        &self,
        input: &Path,
        output_format: OutputFormat,
        _options: ConversionOptions,
    ) -> Result<ConversionResult> {
        eprintln!("🔄 HTML Conversion (Pure Rust)");
        eprintln!("   HTML → Semantic Parsing → {:?}", output_format);
        eprintln!();

        // Read HTML file
        let html_content = fs::read_to_string(input).await?;

        // Convert to requested format
        let output_data = match output_format {
            OutputFormat::Markdown { .. } => {
                eprintln!("📝 Converting to Markdown...");
                let markdown = self.html_to_markdown(&html_content)?;
                markdown.into_bytes()
            }
            OutputFormat::Json { .. } => {
                eprintln!("📝 Converting to JSON...");
                // Simple JSON with raw HTML and extracted text
                let markdown = self.html_to_markdown(&html_content)?;
                let json = serde_json::json!({

metadata function · rust · L279-L288 (10 LOC)

src/converters/html.rs

    /// Describes the HTML converter: display name, crate version, a short
    /// summary, and an empty external-dependency list.
    fn metadata(&self) -> ConverterMetadata {
        let name = String::from("HTML Converter");
        // Version is taken from the crate manifest at compile time.
        let version = String::from(env!("CARGO_PKG_VERSION"));
        let description =
            String::from("HTML to Markdown converter using semantic parsing (pure Rust)");

        ConverterMetadata {
            name,
            version,
            description,
            external_deps: Vec::new(),
        }
    }

test_html_to_markdown_basic function · rust · L302-L310 (9 LOC)

src/converters/html.rs

    /// A heading plus a paragraph must convert successfully, and the Markdown
    /// must contain both the `# Title` heading and the paragraph text.
    fn test_html_to_markdown_basic() {
        let source = "<h1>Title</h1><p>Paragraph</p>";
        let outcome = HtmlConverter::new().html_to_markdown(source);
        assert!(outcome.is_ok());

        let markdown = outcome.unwrap();
        for fragment in &["# Title", "Paragraph"] {
            assert!(markdown.contains(*fragment));
        }
    }

test_html_converter_metadata function · rust · L313-L318 (6 LOC)

src/converters/html.rs

    /// The HTML converter must report its expected name and declare no
    /// external dependencies.
    fn test_html_converter_metadata() {
        let converter = HtmlConverter::new();
        let meta = converter.metadata();
        assert_eq!(meta.name, "HTML Converter");
        // The previous check (`!is_empty() || is_empty()`) was a tautology and
        // could never fail. The converter's metadata declares an empty
        // dependency list, so assert exactly that.
        assert!(meta.external_deps.is_empty());
    }

new function · rust · L31-L36 (6 LOC)

src/converters/image.rs

    /// Creates the converter. When the `tesseract` feature is enabled the OCR
    /// engine name is preset to `"tesseract"`.
    pub fn new() -> Self {
        Self {
            #[cfg(feature = "tesseract")]
            ocr_engine: Some(String::from("tesseract")),
        }
    }

ocr_image function · rust · L40-L62 (23 LOC)

src/converters/image.rs

    /// Runs Tesseract (via `leptess`) over the image at `image_path` using the
    /// given `language` and returns the recognised UTF-8 text.
    ///
    /// # Errors
    ///
    /// Returns a conversion-failed error if the engine cannot be initialised,
    /// the image cannot be set, or text extraction fails.
    async fn ocr_image(&self, image_path: &Path, language: &str) -> Result<String> {
        use leptess::LepTess;

        // Step 1: bring up the OCR engine for the requested language.
        let mut engine = match LepTess::new(None, language) {
            Ok(engine) => engine,
            Err(e) => {
                return Err(crate::TransmutationError::conversion_failed(&format!(
                    "Failed to initialize Tesseract: {}",
                    e
                )));
            }
        };

        // Step 2: hand the image to the engine.
        if let Err(e) = engine.set_image(image_path) {
            return Err(crate::TransmutationError::conversion_failed(&format!(
                "Failed to set image: {}",
                e
            )));
        }

        // Step 3: extract the recognised text.
        engine.get_utf8_text().map_err(|e| {
            crate::TransmutationError::conversion_failed(&format!("OCR failed: {}", e))
        })
    }

image_to_markdown function · rust · L65-L90 (26 LOC)

src/converters/image.rs

    /// OCRs the image and wraps the recognised text in a Markdown document.
    ///
    /// The document starts with a `# OCR Result` heading. The OCR text is split
    /// on blank lines; each non-empty chunk is trimmed and emitted as its own
    /// paragraph.
    ///
    /// # Errors
    ///
    /// Propagates OCR failures. Without the `tesseract` feature this always
    /// returns a conversion-failed error.
    async fn image_to_markdown(&self, image_path: &Path, language: &str) -> Result<String> {
        #[cfg(feature = "tesseract")]
        {
            let recognized = self.ocr_image(image_path, language).await?;

            let mut markdown = String::from("# OCR Result\n\n");

            // One Markdown paragraph per non-blank chunk of OCR output.
            recognized
                .split("\n\n")
                .map(str::trim)
                .filter(|chunk| !chunk.is_empty())
                .for_each(|chunk| markdown.push_str(&format!("{}\n\n", chunk)));

            Ok(markdown)
        }

        #[cfg(not(feature = "tesseract"))]
        {
            Err(crate::TransmutationError::conversion_failed(
                "OCR feature not enabled. Compile with --features tesseract",
            ))
        }
    }

supported_formats function · rust · L101-L110 (10 LOC)

src/converters/image.rs

    /// Lists the raster image formats this converter accepts as input.
    fn supported_formats(&self) -> Vec<FileFormat> {
        let accepted = [
            FileFormat::Jpeg,
            FileFormat::Png,
            FileFormat::Tiff,
            FileFormat::Bmp,
            FileFormat::Gif,
            FileFormat::Webp,
        ];

        Vec::from(accepted)
    }

All rows above produced by Repobility · https://repobility.com

output_formats function · rust · L111-L123 (13 LOC)

src/converters/image.rs

    /// Lists the output formats advertised for images: single-document
    /// Markdown with the LLM-optimisation flag set, and structured JSON with
    /// metadata included.
    fn output_formats(&self) -> Vec<OutputFormat> {
        let mut formats = Vec::with_capacity(2);

        formats.push(OutputFormat::Markdown {
            split_pages: false,
            optimize_for_llm: true,
        });
        formats.push(OutputFormat::Json {
            structured: true,
            include_metadata: true,
        });

        formats
    }

convert function · rust · L124-L213 (90 LOC)

src/converters/image.rs

    async fn convert(
        &self,
        input: &Path,
        output_format: OutputFormat,
        _options: ConversionOptions,
    ) -> Result<ConversionResult> {
        eprintln!("🔄 Image OCR (Tesseract)");
        eprintln!("   Image → OCR → {:?}", output_format);
        eprintln!();

        let language = "eng"; // Default to English (can be made configurable later)

        #[cfg(feature = "tesseract")]
        {
            eprintln!("📸 Running OCR (language: {})...", language);

            // Convert image to text
            let markdown = self.image_to_markdown(input, language).await?;

            // Convert to requested format
            let output_data = match output_format {
                OutputFormat::Markdown { .. } => {
                    eprintln!("📝 Markdown generated!");
                    markdown.into_bytes()
                }
                OutputFormat::Json { .. } => {
                    eprintln!("📝 Converting to JSON...");

metadata function · rust · L214-L222 (9 LOC)

src/converters/image.rs

    /// Describes the image OCR converter: display name, crate version, a short
    /// summary, and `tesseract` as its declared external dependency.
    fn metadata(&self) -> ConverterMetadata {
        let name = String::from("Image OCR Converter");
        // Version is taken from the crate manifest at compile time.
        let version = String::from(env!("CARGO_PKG_VERSION"));
        let description = String::from("Image to text converter using Tesseract OCR");
        let external_deps = vec![String::from("tesseract")];

        ConverterMetadata {
            name,
            version,
            description,
            external_deps,
        }
    }

extract_text_from_xml function · rust · L37-L115 (79 LOC)

src/converters/odt.rs

    fn extract_text_from_xml(&self, xml: &str) -> String {
        let mut markdown = String::new();
        markdown.push_str("# Document\n\n");

        let mut reader = Reader::from_str(xml);
        reader.config_mut().trim_text(true);

        let mut buf = Vec::new();
        let mut in_paragraph = false;
        let mut in_heading = false;
        let mut heading_level = 1;
        let mut current_text = String::new();

        loop {
            match reader.read_event_into(&mut buf) {
                Ok(Event::Start(e)) => {
                    let name = String::from_utf8_lossy(e.name().as_ref()).to_string();

                    if name == "text:p" {
                        in_paragraph = true;
                        current_text.clear();
                    } else if name == "text:h" {
                        in_heading = true;
                        // Try to get outline-level attribute
                        for attr in e.attributes() {
                            if l

odt_to_markdown function · rust · L118-L136 (19 LOC)

src/converters/odt.rs

    async fn odt_to_markdown(&self, odt_path: &Path) -> Result<String> {
        // Read ODT file (it's a ZIP)
        let data = fs::read(odt_path).await?;
        let cursor = Cursor::new(data);
        let mut archive = ZipArchive::new(cursor)?;

        // Extract content.xml
        let mut content_xml = String::new();
        match archive.by_name("content.xml") {
            Ok(mut file) => {
                file.read_to_string(&mut content_xml)?;
            }
            Err(_) => {
                return Ok("# Error\n\n*Could not find content.xml in ODT file*\n".to_string());
            }
        }

        Ok(self.extract_text_from_xml(&content_xml))
    }

output_formats function · rust · L150-L162 (13 LOC)

src/converters/odt.rs

    /// Lists the output formats advertised for ODT input: single-document
    /// Markdown with the LLM-optimisation flag set, and structured JSON with
    /// metadata included.
    fn output_formats(&self) -> Vec<OutputFormat> {
        let as_markdown = OutputFormat::Markdown {
            split_pages: false,
            optimize_for_llm: true,
        };
        let as_json = OutputFormat::Json {
            structured: true,
            include_metadata: true,
        };

        Vec::from([as_markdown, as_json])
    }

convert function · rust · L163-L238 (76 LOC)

src/converters/odt.rs

    async fn convert(
        &self,
        input: &Path,
        output_format: OutputFormat,
        _options: ConversionOptions,
    ) -> Result<ConversionResult> {
        eprintln!("🔄 ODT Conversion (Pure Rust)");
        eprintln!("   ODT → ZIP → XML → {:?}", output_format);
        eprintln!();

        // Convert ODT to Markdown
        let markdown = self.odt_to_markdown(input).await?;

        // Convert to requested format
        let output_data = match output_format {
            OutputFormat::Markdown { .. } => {
                eprintln!("📝 Markdown extracted!");
                markdown.into_bytes()
            }
            OutputFormat::Json { .. } => {
                eprintln!("📝 Converting to JSON...");
                let json = serde_json::json!({
                    "text": {
                        "content": markdown,
                        "format": "odt",
                    }
                });
                serde_json::to_string_pretty(&json)?.into_b

metadata function · rust · L239-L247 (9 LOC)

src/converters/odt.rs

    /// Describes the ODT converter: display name, crate version, a short
    /// summary, and an empty external-dependency list.
    fn metadata(&self) -> ConverterMetadata {
        let name = String::from("ODT Converter");
        // Version is taken from the crate manifest at compile time.
        let version = String::from(env!("CARGO_PKG_VERSION"));
        let description =
            String::from("ODT to Markdown converter (pure Rust, ZIP + XML parsing)");

        ConverterMetadata {
            name,
            version,
            description,
            external_deps: Vec::new(),
        }
    }

new function · rust · L55-L73 (19 LOC)

src/converters/pdf.rs

    fn new() -> Self {
        Self {
            sentence_break: Regex::new(r"([.!?]) ([A-Z])").unwrap(),
            section_pattern: Regex::new(
                r"\b(Abstract|Introduction|Background|Methods|Results|Discussion|Conclusion|References)([A-Z][a-z]+)"
            ).unwrap(),
            title_author_pattern: Regex::new(
                r"([A-Z][a-z]+ [A-Z][a-z]+(?: [A-Z][a-z]+)+)([A-Z][a-z]+ [A-Z]\.|[A-Z][a-z]+ [A-Z][a-z]+)"
            ).unwrap(),
            page_number_figure: Regex::new(r"(\d+)(Figure|Table)").unwrap(),
            math_var_number: Regex::new(r"\b([a-z])([0-9])\b").unwrap(),
            math_var_letter: Regex::new(r"\b([a-z])([a-z])\b").unwrap(),
            func_paren: Regex::new(r"([a-zA-Z])\(([a-z])").unwrap(),
            plus_capital: Regex::new(r"([a-z])\+([A-Z])").unwrap(),
            letter_symbol: Regex::new(r"([a-zA-Z])([∗†‡])").unwrap(),
            symbol_capital: Regex::new(r"([∗†‡])([A-Z])").unwrap(),
            single_letter_pair: Reg

break_long_text_into_paragraphs function · rust · L100-L124 (25 LOC)

src/converters/pdf.rs

    /// Inserts paragraph breaks into a long run of text.
    ///
    /// Three passes are applied in order:
    /// 1. the cached `sentence_break` pattern's matches are rewritten so the
    ///    two captured groups are separated by a blank line;
    /// 2. inline ` ## ` and ` # ` heading markers are moved onto their own
    ///    paragraph;
    /// 3. triple newlines are collapsed to double newlines, at most twice.
    ///
    /// The result is trimmed of leading and trailing whitespace.
    fn break_long_text_into_paragraphs(text: &str) -> String {
        // Pass 1: sentence boundaries become paragraph boundaries.
        let sentences_split = regex_cache()
            .sentence_break
            .replace_all(text, "$1\n\n$2");

        // Pass 2: push headings onto their own paragraph (level 2 first, so the
        // level-1 pattern does not see its marker).
        let mut output = sentences_split
            .replace(" ## ", "\n\n## ")
            .replace(" # ", "\n\n# ");

        // Pass 3: collapse runs of blank lines, bounded to two sweeps.
        for _ in 0..2 {
            if !output.contains("\n\n\n") {
                break;
            }
            output = output.replace("\n\n\n", "\n\n");
        }

        output.trim().to_string()
    }

convert_with_docling_style function · rust · L531-L587 (57 LOC)

src/converters/pdf.rs

    async fn convert_with_docling_style(
        &self,
        path: &Path,
        options: &ConversionOptions,
    ) -> Result<Vec<ConversionOutput>> {
        // Try docling-parse FFI first if enabled and use_ffi flag is set
        #[cfg(feature = "docling-ffi")]
        if options.use_ffi {
            match self.convert_with_docling_ffi(path).await {
                Ok(result) => return Ok(result),
                Err(e) => {
                    eprintln!("⚠️  FFI conversion failed: {}", e);
                    eprintln!("   Falling back to Precision mode...");
                    // Fall through to precision mode
                }
            }
        }

        // Check if split_pages is enabled - if so, we need page info
        if options.split_pages {
            let parser = PdfParser::load(path)?;
            let pages = parser.extract_all_pages()?;
            eprintln!(
                "📄 Splitting into {} individual pages (precision mode)",
                pages.len()

convert_to_images function · rust · L592-L690 (99 LOC)

src/converters/pdf.rs

    async fn convert_to_images(
        &self,
        path: &Path,
        format: crate::types::ImageFormat,
        _quality: u8,
        dpi: u32,
        _options: &ConversionOptions,
    ) -> Result<Vec<ConversionOutput>> {
        use std::process::Command;

        use tokio::fs;

        eprintln!(
            "🖼️  Rendering PDF to images (DPI: {}, Format: {:?})...",
            dpi, format
        );
        eprintln!("   Using pdftoppm command-line tool...");

        // Create temporary directory for images
        let temp_dir = std::env::temp_dir().join(format!("transmutation_{}", std::process::id()));
        fs::create_dir_all(&temp_dir).await?;

        // Determine format flag for pdftoppm
        let format_flag = match format {
            crate::types::ImageFormat::Png => "png",
            crate::types::ImageFormat::Jpeg => "jpeg",
            crate::types::ImageFormat::Webp => "png", // pdftoppm doesn't support webp, use png
        };

        // Cross-platform

convert_pages_individually function · rust · L696-L763 (68 LOC)

src/converters/pdf.rs

    async fn convert_pages_individually(
        &self,
        path: &Path,
        pages: &[PdfPage],
        _options: &ConversionOptions,
    ) -> Result<Vec<ConversionOutput>> {
        use pdf_extract::extract_text_from_mem;

        // Pre-allocate output vector with known size
        let mut outputs = Vec::with_capacity(pages.len());

        // Load PDF bytes once
        let pdf_bytes = tokio::fs::read(path).await?;

        // Extract text ONCE for the entire document (major memory optimization)
        let full_text = extract_text_from_mem(&pdf_bytes).map_err(|e| {
            crate::TransmutationError::engine_error(
                "PDF Parser",
                format!("pdf-extract failed: {:?}", e),
            )
        })?;

        // Drop PDF bytes immediately to free memory
        drop(pdf_bytes);

        // Split by page markers (pdf-extract adds \f between pages)
        let page_texts: Vec<&str> = full_text.split('\x0C').collect();

        for (page_idx, page)

convert_with_docling_ffi function · rust · L767-L870 (104 LOC)

src/converters/pdf.rs

    async fn convert_with_docling_ffi(&self, path: &Path) -> Result<Vec<ConversionOutput>> {
        use crate::document::{
            DoclingJsonParser, HierarchyBuilder, MarkdownSerializer, PageAssembler,
            PageAssemblerOptions,
        };
        use crate::engines::docling_parse_ffi::DoclingParseEngine;

        eprintln!("┌─────────────────────────────────────────┐");
        eprintln!("│ 🚀 Docling FFI Pipeline (Full)         │");
        eprintln!("└─────────────────────────────────────────┘");

        // Step 1: Extract cells from PDF via C++ FFI
        eprintln!("\n[1/5] 📄 Extracting PDF cells via docling-parse FFI...");
        let engine = DoclingParseEngine::open(path)?;
        let json_output = engine.export_markdown()?; // Returns JSON with cells
        eprintln!("      ✓ JSON size: {} KB", json_output.len() / 1024);

        // Step 2: Parse JSON to normalized pages with cells
        eprintln!("\n[2/5] 🔍 Parsing JSON structure...");
        let doc = Docli

join_paragraph_lines_enhanced function · rust · L875-L937 (63 LOC)

src/converters/pdf.rs

    fn join_paragraph_lines_enhanced(text: &str) -> String {
        let cache = regex_cache();

        // CRITICAL FIX: Remove unwanted spaces that pdf-extract introduces
        // "i s" -> "is", "o n" -> "on", "t o" -> "to", "o f" -> "of", "a n" -> "an", etc.

        // Pre-allocate with estimated capacity
        let mut cleaned = String::with_capacity(text.len());
        cleaned.push_str(text);

        // Fix common two-letter words that got split
        // Using static array to avoid allocation
        const WORD_FIXES: [(&str, &str); 19] = [
            (" i s ", " is "),
            (" i n ", " in "),
            (" o n ", " on "),
            (" t o ", " to "),
            (" o f ", " of "),
            (" a n ", " an "),
            (" a s ", " as "),
            (" a t ", " at "),
            (" b y ", " by "),
            (" o r ", " or "),
            (" w e ", " we "),
            (" i t ", " it "),
            (" b e ", " be "),
            ("o f ", "of "),

docling_style_markdown_from_blocks function · rust · L941-L1079 (139 LOC)

src/converters/pdf.rs

    fn docling_style_markdown_from_blocks(
        blocks: &[crate::engines::pdf_parser::TextBlock],
        _page_width: f32,
        _page_height: f32,
    ) -> String {
        if blocks.is_empty() {
            return String::new();
        }

        // Step 1: Sort by reading order (top to bottom, then left to right)
        let mut sorted_blocks = blocks.to_vec();
        sorted_blocks.sort_by(|a, b| {
            // Sort by Y (top to bottom - higher Y first in PDF coords), then X (left to right)
            let y_cmp = b.y.partial_cmp(&a.y).unwrap_or(std::cmp::Ordering::Equal);
            if y_cmp == std::cmp::Ordering::Equal {
                a.x.partial_cmp(&b.x).unwrap_or(std::cmp::Ordering::Equal)
            } else {
                y_cmp
            }
        });

        // Step 2: Calculate average font size for body text
        let font_sizes: Vec<f32> = sorted_blocks.iter().map(|b| b.font_size).collect();
        let avg_font_size = if !font_sizes.is_empty() {

Same scanner, your repo: https://repobility.com — Repobility

convert_to_markdown_pdf_extract function · rust · L1082-L1153 (72 LOC)

src/converters/pdf.rs

    async fn convert_to_markdown_pdf_extract(
        &self,
        path: &Path,
        options: &ConversionOptions,
    ) -> Result<Vec<ConversionOutput>> {
        use pdf_extract::extract_text;

        if options.split_pages {
            // For split pages: extract each PDF page individually using lopdf
            // This accurately reflects the actual PDF page boundaries
            let parser = PdfParser::load(path)?;
            let pages = parser.extract_all_pages()?;

            // Process each physical PDF page
            let outputs: Vec<ConversionOutput> = pages
                .iter()
                .enumerate()
                .map(|(i, page)| {
                    // lopdf returns text with few line breaks, need to add them
                    let page_markdown = if page.text.lines().count() > 20 {
                        // If text has many lines, use join algorithm (like pdf-extract)
                        Self::join_paragraph_lines(&page.text)

convert_to_markdown function · rust · L1156-L1262 (107 LOC)

src/converters/pdf.rs

    async fn convert_to_markdown(
        &self,
        parser: &PdfParser,
        options: &ConversionOptions,
    ) -> Result<Vec<ConversionOutput>> {
        let pages = parser.extract_all_pages()?;

        // Use layout analysis if text blocks are available
        let analyzer = LayoutAnalyzer::new();
        let markdown_outputs: Vec<String> = if options.split_pages {
            // Generate separate markdown for each page
            pages
                .iter()
                .map(|page| {
                    if !page.text_blocks.is_empty() {
                        // Use semantic layout analysis
                        let analyzed = analyzer.analyze(&page.text_blocks);
                        MarkdownGenerator::from_analyzed_blocks(&analyzed, options.clone())
                    } else {
                        // Fallback to simple text extraction
                        let text = if options.optimize_for_llm {
                            self.text_optimizer.optimize(&

convert_to_json function · rust · L1265-L1311 (47 LOC)

src/converters/pdf.rs

    async fn convert_to_json(
        &self,
        parser: &PdfParser,
        options: &ConversionOptions,
    ) -> Result<Vec<ConversionOutput>> {
        let pages = parser.extract_all_pages()?;
        let metadata = parser.get_metadata();

        // Create JSON structure
        let json_data = serde_json::json!({
            "format": "pdf",
            "metadata": {
                "title": metadata.title,
                "author": metadata.author,
                "created": metadata.created,
                "modified": metadata.modified,
                "page_count": metadata.page_count,
            },
            "pages": pages.iter().map(|page| serde_json::json!({
                "number": page.number,
                "text": if options.optimize_for_llm {
                    self.text_optimizer.optimize(&page.text)
                } else {
                    page.text.clone()
                },
                "width": page.width,
                "height": page.height,

build_metadata function · rust · L1314-L1326 (13 LOC)

src/converters/pdf.rs

    /// Builds the document-level metadata from what the PDF parser reports.
    ///
    /// Title, author, timestamps and page count are copied across; `language`
    /// is left unset and `custom` starts out empty.
    fn build_metadata(&self, parser: &PdfParser) -> DocumentMetadata {
        let source = parser.get_metadata();
        let custom = std::collections::HashMap::new();

        DocumentMetadata {
            title: source.title,
            author: source.author,
            created: source.created,
            modified: source.modified,
            page_count: source.page_count,
            // TODO: Implement language detection
            language: None,
            custom,
        }
    }

output_formats function · rust · L1340-L1352 (13 LOC)

src/converters/pdf.rs

    /// Lists the output formats advertised for PDF input: single-document
    /// Markdown with the LLM-optimisation flag set, and structured JSON with
    /// metadata included.
    fn output_formats(&self) -> Vec<OutputFormat> {
        let markdown_format = OutputFormat::Markdown {
            split_pages: false,
            optimize_for_llm: true,
        };
        let json_format = OutputFormat::Json {
            structured: true,
            include_metadata: true,
        };

        let mut formats = Vec::with_capacity(2);
        formats.push(markdown_format);
        formats.push(json_format);
        formats
    }

convert function · rust · L1353-L1445 (93 LOC)

src/converters/pdf.rs

    async fn convert(
        &self,
        input: &Path,
        output_format: OutputFormat,
        options: ConversionOptions,
    ) -> Result<ConversionResult> {
        let start_time = Instant::now();

        // Load PDF
        let parser = PdfParser::load(input)?;

        // Get input file size
        let input_size = tokio::fs::metadata(input).await?.len();

        // Convert based on output format
        let content = match output_format {
            OutputFormat::Markdown { .. } => {
                // Use pdf-extract for best quality
                if options.use_precision_mode || options.use_ffi {
                    // High-precision mode: Docling-style layout analysis for ~95% similarity
                    // Also used for FFI mode which tries docling-parse C++ first
                    self.convert_with_docling_style(input, &options).await?
                } else {
                    // Fast mode: Pure Rust heuristics, ~81% similarity, much faster

metadata function · rust · L1446-L1454 (9 LOC)

src/converters/pdf.rs

    /// Describes the PDF converter: display name, crate version, a short
    /// summary, and `lopdf` as its declared external dependency.
    fn metadata(&self) -> ConverterMetadata {
        let name = String::from("PDF Converter");
        // Version is taken from the crate manifest at compile time.
        let version = String::from(env!("CARGO_PKG_VERSION"));
        let description = String::from("Pure Rust PDF to Markdown/JSON converter using lopdf");
        let external_deps = vec![String::from("lopdf")];

        ConverterMetadata {
            name,
            version,
            description,
            external_deps,
        }
    }

test_pdf_converter_metadata function · rust · L1468-L1473 (6 LOC)

src/converters/pdf.rs

    /// The PDF converter must report its expected name and declare at least
    /// one external dependency.
    fn test_pdf_converter_metadata() {
        let meta = PdfConverter::new().metadata();

        assert_eq!(meta.name, "PDF Converter");
        assert!(meta.external_deps.first().is_some());
    }

Repobility · code-quality intelligence platform · https://repobility.com

test_join_paragraph_lines_utf8_boundary function · rust · L1476-L1493 (18 LOC)

src/converters/pdf.rs

    /// Regression test: German text with a two-byte `ä` placed shortly after
    /// byte 500 must not make `join_paragraph_lines` panic on a byte-offset
    /// slice.
    fn test_join_paragraph_lines_utf8_boundary() {
        // 495 ASCII bytes, then "Elementare Gefährdungen": the `ä` begins at
        // byte offset 509 of the assembled input.
        let mut input = "A".repeat(495);
        input.push_str("Elementare Gefährdungen");
        input.push_str(" more text here for testing purposes");

        // Reaching the assertion at all means no char-boundary panic occurred.
        let result = PdfConverter::join_paragraph_lines(&input);

        // The text may be reformatted, so only require some output. This is
        // logically the same check as the earlier three-way `||` assertion.
        assert!(!result.is_empty());
    }

test_join_paragraph_lines_multibyte_at_boundary function · rust · L1496-L1511 (16 LOC)

src/converters/pdf.rs

    /// A three-byte Chinese character straddling byte 500 must not make
    /// `join_paragraph_lines` panic.
    fn test_join_paragraph_lines_multibyte_at_boundary() {
        // 498 ASCII bytes put the first Chinese character at bytes 498-500, so
        // byte offset 500 lands inside it.
        let mut input = "x".repeat(498);
        input += "中文测试";
        input += " end";
        assert!(input.len() > 500);

        // Reaching the assertion means no char-boundary panic occurred.
        let result = PdfConverter::join_paragraph_lines(&input);
        assert!(!result.is_empty());
    }

test_join_paragraph_lines_emoji_at_boundary function · rust · L1514-L1526 (13 LOC)

src/converters/pdf.rs

    fn test_join_paragraph_lines_emoji_at_boundary() {
        // Each emoji below is 4 bytes in UTF-8; after 497 bytes of ASCII
        // padding the first one occupies bytes 497-500.
        let padding = "y".repeat(497); // 497 bytes
        let emojis = "🎉🎊🎈"; // 3 emojis = 12 bytes
        let tail = " celebration";

        let text = [padding.as_str(), emojis, tail].concat();
        assert!(text.len() > 500);

        // The call must return without panicking and produce some output.
        let joined = PdfConverter::join_paragraph_lines(&text);
        assert!(!joined.is_empty());
    }

test_join_paragraph_lines_short_text function · rust · L1529-L1535 (7 LOC)

src/converters/pdf.rs

    fn test_join_paragraph_lines_short_text() {
        // Input well under 500 bytes that still mixes 2-byte and 4-byte
        // characters; it must be handled and yield non-empty output.
        let joined =
            PdfConverter::join_paragraph_lines("Short text with Ümläuts and émojis 🎉");

        assert!(!joined.is_empty());
    }

test_join_paragraph_lines_exactly_500_ascii function · rust · L1538-L1544 (7 LOC)

src/converters/pdf.rs

    fn test_join_paragraph_lines_exactly_500_ascii() {
        // Pure-ASCII input whose length is exactly 500 bytes, i.e. the input
        // ends right at the 500-byte mark.
        let ascii_only: String = std::iter::repeat('a').take(500).collect();

        let joined = PdfConverter::join_paragraph_lines(&ascii_only);
        assert!(!joined.is_empty());
    }

test_join_paragraph_lines_cyrillic_text function · rust · L1547-L1558 (12 LOC)

src/converters/pdf.rs

    fn test_join_paragraph_lines_cyrillic_text() {
        // Cyrillic letters are 2 bytes each in UTF-8; with 499 bytes of ASCII
        // padding the first letter occupies bytes 499-500.
        let padding = "z".repeat(499);
        let greeting = "Привет мир"; // Russian "Hello world"
        let tail = " end";

        let text = [padding.as_str(), greeting, tail].concat();

        // The call must return without panicking and produce some output.
        let joined = PdfConverter::join_paragraph_lines(&text);
        assert!(!joined.is_empty());
    }

test_join_paragraph_lines_mixed_scripts function · rust · L1561-L1571 (11 LOC)

src/converters/pdf.rs

    fn test_join_paragraph_lines_mixed_scripts() {
        // 450 ASCII bytes followed by a run of scripts whose characters take
        // 1, 2, 3 and 4 bytes in UTF-8, so the input extends past byte 500.
        let filler = "x".repeat(450);
        let scripts = "Latin äöü Ελληνικά 日本語 한국어 العربية 🌍🌎🌏";
        let text = [filler.as_str(), scripts].concat();

        // Must not panic wherever the 500-byte mark lands in the mixed text.
        let joined = PdfConverter::join_paragraph_lines(&text);
        assert!(!joined.is_empty());
    }

extract_text_from_pptx function · rust · L40-L84 (45 LOC)

src/converters/pptx.rs

    fn extract_text_from_pptx(&self, path: &Path) -> Result<Vec<String>> {
        use std::fs::File;

        use zip::ZipArchive;

        eprintln!("📝 Extracting text from PPTX (Direct XML parsing)...");

        let file = File::open(path)?;
        let mut archive = ZipArchive::new(file).map_err(|e| {
            crate::TransmutationError::engine_error(
                "zip",
                format!("Failed to open PPTX as ZIP: {}", e),
            )
        })?;
        let mut slides = Vec::new();

        // Find all slide XML files
        for i in 0..archive.len() {
            let mut file = archive.by_index(i).map_err(|e| {
                crate::TransmutationError::engine_error(
                    "zip",
                    format!("Failed to read file from PPTX: {}", e),
                )
            })?;
            let name = file.name().to_string();

            // Process slide files: ppt/slides/slide*.xml
            if name.starts_with("ppt/slides/slide") && name.e

All rows above produced by Repobility · https://repobility.com

extract_text_from_xml function · rust · L87-L115 (29 LOC)

src/converters/pptx.rs

    fn extract_text_from_xml(&self, xml: &str) -> String {
        use quick_xml::Reader;
        use quick_xml::events::Event;

        // Collects every non-empty text node of `xml` and returns them joined
        // by single spaces. Returns an empty string when no text is found.
        let mut xml_reader = Reader::from_str(xml);
        xml_reader.config_mut().trim_text(true);

        let mut fragments: Vec<String> = Vec::new();
        let mut scratch = Vec::new();

        loop {
            match xml_reader.read_event_into(&mut scratch) {
                // End of input and parse errors both stop the scan; whatever
                // text was gathered so far is still returned.
                Ok(Event::Eof) | Err(_) => break,
                Ok(Event::Text(raw)) => {
                    // Text that fails to unescape is skipped rather than
                    // aborting the whole extraction.
                    let decoded = raw.unescape();
                    let piece = decoded
                        .as_deref()
                        .ok()
                        .map(str::trim)
                        .filter(|candidate| !candidate.is_empty());
                    if let Some(piece) = piece {
                        fragments.push(piece.to_string());
                    }
                }
                // Tags, comments and other events carry no slide text.
                Ok(_) => {}
            }
            // The buffer is reused for the next event.
            scratch.clear();
        }

        fragments.join(" ")
    }

output_formats function · rust · L201-L214 (14 LOC)

src/converters/pptx.rs

    fn output_formats(&self) -> Vec<OutputFormat> {
        // Markdown output: pages are split, which for a presentation means
        // one section per slide, with LLM optimisation switched on.
        let markdown = OutputFormat::Markdown {
            split_pages: true,
            optimize_for_llm: true,
        };

        // Image output: PNG with quality 85 at 150 dpi.
        let image = OutputFormat::Image {
            format: crate::types::ImageFormat::Png,
            quality: 85,
            dpi: 150,
        };

        vec![markdown, image]
    }

convert function · rust · L215-L348 (134 LOC)

src/converters/pptx.rs

    async fn convert(
        &self,
        input: &Path,
        output_format: OutputFormat,
        options: ConversionOptions,
    ) -> Result<ConversionResult> {
        // For images, use LibreOffice → PDF → Images
        // For Markdown, use direct XML parsing for better quality
        match output_format {
            OutputFormat::Image { .. } => {
                eprintln!("🔄 PPTX → Images Pipeline");
                eprintln!("   PPTX → PDF → Images (via LibreOffice)");
                eprintln!();

                // Use PDF pipeline for images
                let pdf_path = self.pptx_to_pdf(input).await?;
                let result = self
                    .pdf_converter
                    .convert(&pdf_path, output_format, options)
                    .await?;

                // Cleanup
                let temp_dir = pdf_path.parent().unwrap();
                let _ = fs::remove_dir_all(temp_dir).await;

                eprintln!(
                    "✅ PPTX → Ima

‹ prev · page 2 / 9 · next ›