← back to hivellm__transmutation

Function bodies 424 total

All specs Real LLM only Function bodies
metadata function · rust · L349-L359 (11 LOC)
src/converters/pptx.rs
    /// Returns the descriptive metadata for the PPTX converter: display name,
    /// crate version (taken from `CARGO_PKG_VERSION` at compile time), a short
    /// description, and the list of external dependencies.
    fn metadata(&self) -> ConverterMetadata {
        let name = String::from("PPTX Converter");
        let version = String::from(env!("CARGO_PKG_VERSION"));
        let description = String::from(
            "PPTX converter: Direct XML parsing (Markdown) or LibreOffice pipeline (Images)",
        );
        let external_deps = vec![String::from("LibreOffice (images only)")];

        ConverterMetadata {
            name,
            version,
            description,
            external_deps,
        }
    }
test_extract_text_from_xml function · rust · L380-L385 (6 LOC)
src/converters/pptx.rs
    /// Text inside an `<a:t>` element must show up in the extracted output.
    fn test_extract_text_from_xml() {
        let slide_xml = "<a:t>Test Text</a:t>";
        let pptx = PptxConverter::new();

        let extracted = pptx.extract_text_from_xml(slide_xml);

        assert!(extracted.contains("Test Text"));
    }
rtf_to_markdown function · rust · L30-L141 (112 LOC)
src/converters/rtf.rs
    fn rtf_to_markdown(&self, rtf: &str) -> String {
        let mut markdown = String::new();
        markdown.push_str("# Document\n\n");

        let mut text_content = String::new();
        let mut in_control_word = false;
        let mut brace_depth = 0;
        let mut skip_next = false;

        let chars: Vec<char> = rtf.chars().collect();
        let mut i = 0;

        while i < chars.len() {
            let ch = chars[i];

            if skip_next {
                skip_next = false;
                i += 1;
                continue;
            }

            match ch {
                '{' => {
                    brace_depth += 1;
                }
                '}' => {
                    brace_depth -= 1;
                    in_control_word = false;
                }
                '\\' => {
                    // Check for control word
                    if i + 1 < chars.len() {
                        let next_ch = chars[i + 1];

                        // Escape 
output_formats function · rust · L155-L167 (13 LOC)
src/converters/rtf.rs
    /// Lists the output formats this converter advertises: Markdown
    /// (single document, LLM-optimized) and structured JSON with metadata.
    fn output_formats(&self) -> Vec<OutputFormat> {
        let markdown = OutputFormat::Markdown {
            split_pages: false,
            optimize_for_llm: true,
        };
        let json = OutputFormat::Json {
            structured: true,
            include_metadata: true,
        };

        vec![markdown, json]
    }
convert function · rust · L168-L245 (78 LOC)
src/converters/rtf.rs
    async fn convert(
        &self,
        input: &Path,
        output_format: OutputFormat,
        _options: ConversionOptions,
    ) -> Result<ConversionResult> {
        eprintln!("🔄 RTF Conversion (Pure Rust)");
        eprintln!("   RTF → Parsing → {:?}", output_format);
        eprintln!();

        // Read RTF file
        let rtf_content = fs::read_to_string(input).await?;

        // Convert to requested format
        let output_data = match output_format {
            OutputFormat::Markdown { .. } => {
                eprintln!("📝 Converting to Markdown...");
                let markdown = self.rtf_to_markdown(&rtf_content);
                markdown.into_bytes()
            }
            OutputFormat::Json { .. } => {
                eprintln!("📝 Converting to JSON...");
                let text = self.rtf_to_markdown(&rtf_content);
                let json = serde_json::json!({
                    "text": {
                        "content": text,
                     
metadata function · rust · L246-L254 (9 LOC)
src/converters/rtf.rs
    /// Returns the descriptive metadata for the RTF converter. The version is
    /// the crate version captured at compile time; there are no external
    /// dependencies.
    fn metadata(&self) -> ConverterMetadata {
        let name = String::from("RTF Converter");
        let version = String::from(env!("CARGO_PKG_VERSION"));
        let description = String::from("RTF to Markdown converter (pure Rust, simplified parser)");

        ConverterMetadata {
            name,
            version,
            description,
            external_deps: Vec::new(),
        }
    }
test_rtf_to_markdown_basic function · rust · L268-L277 (10 LOC)
src/converters/rtf.rs
    /// Smoke test: converting a small RTF document must yield non-empty output.
    fn test_rtf_to_markdown_basic() {
        // RTF with actual text commands
        let rtf = r"{\rtf1\ansi\deff0 {\fonttbl {\f0 Times New Roman;}}
\f0\fs24 Hello World\par
}";

        let rendered = RtfConverter::new().rtf_to_markdown(rtf);

        // The parser is simplified, so only require that something was produced
        // (and that the call did not panic).
        assert!(!rendered.is_empty());
    }
All rows above produced by Repobility · https://repobility.com
default function · rust · L50-L57 (8 LOC)
src/converters/traits.rs
    /// Builds placeholder metadata: name `"Unknown"`, version `"0.0.0"`, an
    /// empty description and no external dependencies.
    fn default() -> Self {
        let name = String::from("Unknown");
        let version = String::from("0.0.0");

        Self {
            name,
            version,
            description: String::new(),
            external_deps: Vec::new(),
        }
    }
txt_to_markdown function · rust · L29-L56 (28 LOC)
src/converters/txt.rs
    /// Converts plain text into a simple Markdown document.
    ///
    /// The output always starts with a `# Document` title. The input is split
    /// into paragraphs on blank lines; each non-empty paragraph is trimmed and
    /// emitted followed by a blank line.
    ///
    /// A paragraph is promoted to a `##` heading when it is a single line of
    /// fewer than 80 characters and either ends with `:` or is written in all
    /// caps (contains at least one uppercase letter and no lowercase letter).
    ///
    /// # Parameters
    /// * `text` - raw text content; both `\n` and `\r\n` line endings are accepted.
    ///
    /// # Returns
    /// The generated Markdown as an owned `String`.
    fn txt_to_markdown(&self, text: &str) -> String {
        // Normalize Windows line endings first; otherwise "\r\n\r\n" never
        // matches the "\n\n" paragraph separator and the whole file collapses
        // into one paragraph.
        let normalized = text.replace("\r\n", "\n");

        let mut markdown = String::with_capacity(normalized.len() + 16);
        markdown.push_str("# Document\n\n");

        // Simple paragraph detection based on blank lines
        let paragraphs = normalized
            .split("\n\n")
            .map(str::trim)
            .filter(|para| !para.is_empty());

        for para in paragraphs {
            // A Markdown ATX heading is a single line, so a multi-line
            // paragraph must never be promoted.
            let is_single_line = !para.contains('\n');
            // Count characters rather than bytes so non-ASCII text gets the
            // same length limit as ASCII text.
            let is_short = para.chars().count() < 80;
            // "All caps" needs at least one uppercase letter; without this,
            // separators such as "-----" or bare numbers would become headings.
            let is_all_caps = para.chars().any(char::is_uppercase)
                && !para
                    .chars()
                    .any(|c| c.is_alphabetic() && c.is_lowercase());

            if is_single_line && is_short && (is_all_caps || para.ends_with(':')) {
                markdown.push_str("## ");
            }
            markdown.push_str(para);
            markdown.push_str("\n\n");
        }

        markdown
    }
output_formats function · rust · L70-L82 (13 LOC)
src/converters/txt.rs
    /// Lists the output formats this converter advertises: Markdown
    /// (single document, LLM-optimized) and structured JSON with metadata.
    fn output_formats(&self) -> Vec<OutputFormat> {
        let markdown = OutputFormat::Markdown {
            split_pages: false,
            optimize_for_llm: true,
        };
        let json = OutputFormat::Json {
            structured: true,
            include_metadata: true,
        };

        vec![markdown, json]
    }
convert function · rust · L83-L160 (78 LOC)
src/converters/txt.rs
    async fn convert(
        &self,
        input: &Path,
        output_format: OutputFormat,
        _options: ConversionOptions,
    ) -> Result<ConversionResult> {
        eprintln!("🔄 TXT Conversion (Pure Rust)");
        eprintln!("   TXT → Encoding Detection → {:?}", output_format);
        eprintln!();

        // Read text file with encoding detection
        let text_content = fs::read_to_string(input).await?;

        // Convert to requested format
        let output_data = match output_format {
            OutputFormat::Markdown { .. } => {
                eprintln!("📝 Converting to Markdown...");
                let markdown = self.txt_to_markdown(&text_content);
                markdown.into_bytes()
            }
            OutputFormat::Json { .. } => {
                eprintln!("📝 Converting to JSON...");
                let json = serde_json::json!({
                    "text": {
                        "content": text_content,
                        "lines": text_
metadata function · rust · L161-L169 (9 LOC)
src/converters/txt.rs
    /// Returns the descriptive metadata for the plain-text converter. The
    /// version is the crate version captured at compile time; there are no
    /// external dependencies.
    fn metadata(&self) -> ConverterMetadata {
        let name = String::from("TXT Converter");
        let version = String::from(env!("CARGO_PKG_VERSION"));
        let description = String::from("Plain text to Markdown converter (pure Rust)");

        ConverterMetadata {
            name,
            version,
            description,
            external_deps: Vec::new(),
        }
    }
test_txt_to_markdown_basic function · rust · L183-L188 (6 LOC)
src/converters/txt.rs
    /// The input text must survive the conversion to Markdown.
    fn test_txt_to_markdown_basic() {
        let input = "This is a test\nSecond line";

        let rendered = TxtConverter::new().txt_to_markdown(input);

        assert!(rendered.contains("This is a test"));
    }
check_whisper function · rust · L40-L63 (24 LOC)
src/converters/video.rs
    /// Reports whether a Whisper executable appears to be available.
    ///
    /// First tries to spawn `whisper --help` (resolved through `PATH`); if that
    /// cannot be spawned, falls back to checking a few common install
    /// locations on disk.
    ///
    /// # Returns
    /// `true` if the command could be spawned or one of the candidate paths
    /// exists, `false` otherwise.
    fn check_whisper() -> bool {
        // Try whisper in PATH
        if Command::new("whisper").arg("--help").output().is_ok() {
            return true;
        }

        // Per-user install location. Skipped when HOME is unset or empty so we
        // never probe the unrelated root-level path "/.local/bin/whisper".
        let user_install = std::env::var("HOME")
            .ok()
            .filter(|home| !home.is_empty())
            .map(|home| format!("{}/.local/bin/whisper", home));

        // System-wide install locations.
        let system_installs = ["/usr/local/bin/whisper", "/usr/bin/whisper"];

        user_install
            .as_deref()
            .into_iter()
            .chain(system_installs.iter().copied())
            .any(|candidate| std::path::Path::new(candidate).exists())
    }
get_whisper_cmd function · rust · L66-L85 (20 LOC)
src/converters/video.rs
    /// Resolves the command used to invoke Whisper.
    ///
    /// Returns the first existing path among the common install locations
    /// (`$HOME/.local/bin/whisper`, `/usr/local/bin/whisper`,
    /// `/usr/bin/whisper`). If none exists, returns the bare name `whisper`.
    ///
    /// # Returns
    /// An absolute path to an existing file, or the literal `"whisper"`.
    fn get_whisper_cmd() -> String {
        // System-wide install locations, checked after the per-user one.
        const SYSTEM_INSTALLS: [&str; 2] = ["/usr/local/bin/whisper", "/usr/bin/whisper"];

        // Per-user install location. Skipped when HOME is unset or empty so we
        // never return the unrelated root-level path "/.local/bin/whisper".
        let user_install = std::env::var("HOME")
            .ok()
            .filter(|home| !home.is_empty())
            .map(|home| format!("{}/.local/bin/whisper", home));

        user_install
            .into_iter()
            .chain(SYSTEM_INSTALLS.iter().map(|path| (*path).to_owned()))
            .find(|candidate| std::path::Path::new(candidate).exists())
            // Nothing found on disk: fall back to the bare command name.
            .unwrap_or_else(|| "whisper".to_string())
    }
Repobility · code-quality intelligence platform · https://repobility.com
extract_audio function · rust · L88-L137 (50 LOC)
src/converters/video.rs
    async fn extract_audio(&self, video_path: &Path) -> Result<PathBuf> {
        if !Self::check_ffmpeg() {
            return Err(crate::TransmutationError::conversion_failed(
                "FFmpeg not found. Install: sudo apt-get install ffmpeg",
            ));
        }

        // Create temporary audio file
        let temp_audio = NamedTempFile::new().map_err(|e| {
            crate::TransmutationError::conversion_failed(&format!(
                "Failed to create temp file: {}",
                e
            ))
        })?;

        let audio_path = temp_audio.path().with_extension("wav");

        eprintln!("🎬 Extracting audio with FFmpeg...");

        // Extract audio to WAV format
        let output = Command::new("ffmpeg")
            .arg("-i")
            .arg(video_path)
            .arg("-vn") // No video
            .arg("-acodec")
            .arg("pcm_s16le") // WAV format
            .arg("-ar")
            .arg("16000") // 16kHz sample rate (Whisper default)
  
transcribe_audio function · rust · L140-L195 (56 LOC)
src/converters/video.rs
    async fn transcribe_audio(&self, audio_path: &Path, language: Option<&str>) -> Result<String> {
        if !Self::check_whisper() {
            return Err(crate::TransmutationError::conversion_failed(
                "Whisper not found. Install: pip install openai-whisper",
            ));
        }

        eprintln!("🎤 Running Whisper transcription...");

        let whisper_cmd = Self::get_whisper_cmd();
        let mut cmd = Command::new(&whisper_cmd);
        cmd.arg(audio_path);
        cmd.arg("--model").arg("base"); // Use base model
        cmd.arg("--output_format").arg("txt");
        cmd.arg("--output_dir").arg("/tmp");

        if let Some(lang) = language {
            cmd.arg("--language").arg(lang);
        }

        let output = cmd.output().map_err(|e| {
            crate::TransmutationError::conversion_failed(&format!(
                "Whisper execution failed: {}",
                e
            ))
        })?;

        if !output.status.success() {
            
video_to_markdown function · rust · L198-L220 (23 LOC)
src/converters/video.rs
    /// Transcribes a video file and wraps the transcript in a Markdown document.
    ///
    /// The audio track is extracted with `extract_audio` and transcribed with
    /// `transcribe_audio`. The intermediate audio file is then removed on a
    /// best-effort basis, whether or not transcription succeeded.
    ///
    /// # Parameters
    /// * `video_path` - path of the video file to transcribe.
    /// * `language` - optional language passed on to the transcription step;
    ///   when present it is also written into the Markdown header.
    ///
    /// # Errors
    /// Propagates any error returned by `extract_audio` or `transcribe_audio`.
    async fn video_to_markdown(&self, video_path: &Path, language: Option<&str>) -> Result<String> {
        // Extract audio
        let audio_path = self.extract_audio(video_path).await?;

        // Transcribe. Keep the result instead of applying `?` right away so the
        // extracted audio file is deleted on failure too, not leaked.
        let transcription = self.transcribe_audio(&audio_path, language).await;

        // Clean up audio file (best effort: a failed delete must not mask the
        // transcription outcome).
        let _ = tokio::fs::remove_file(&audio_path).await;

        let transcript = transcription?;

        let mut markdown = String::new();
        markdown.push_str("# Video Transcription\n\n");

        if let Some(lang) = language {
            markdown.push_str(&format!("**Language**: {}\n\n", lang));
        }

        markdown.push_str("## Transcript\n\n");
        markdown.push_str(&transcript);
        markdown.push('\n');

        Ok(markdown)
    }
supported_formats function · rust · L231-L239 (9 LOC)
src/converters/video.rs
    /// Lists the video container formats this converter accepts:
    /// MP4, AVI, MKV, MOV and WebM.
    fn supported_formats(&self) -> Vec<FileFormat> {
        Vec::from([
            FileFormat::Mp4,
            FileFormat::Avi,
            FileFormat::Mkv,
            FileFormat::Mov,
            FileFormat::Webm,
        ])
    }
output_formats function · rust · L240-L252 (13 LOC)
src/converters/video.rs
    /// Lists the output formats this converter advertises: Markdown
    /// (single document, LLM-optimized) and structured JSON with metadata.
    fn output_formats(&self) -> Vec<OutputFormat> {
        let markdown = OutputFormat::Markdown {
            split_pages: false,
            optimize_for_llm: true,
        };
        let json = OutputFormat::Json {
            structured: true,
            include_metadata: true,
        };

        vec![markdown, json]
    }
convert function · rust · L253-L329 (77 LOC)
src/converters/video.rs
    async fn convert(
        &self,
        input: &Path,
        output_format: OutputFormat,
        _options: ConversionOptions,
    ) -> Result<ConversionResult> {
        eprintln!("🔄 Video Transcription (FFmpeg + Whisper)");
        eprintln!("   Video → Audio → Whisper → {:?}", output_format);
        eprintln!();

        let language = None; // Auto-detect

        // Convert video to text
        let markdown = self.video_to_markdown(input, language).await?;

        // Convert to requested format
        let output_data = match output_format {
            OutputFormat::Markdown { .. } => {
                eprintln!("✅ Transcription complete!");
                markdown.into_bytes()
            }
            OutputFormat::Json { .. } => {
                eprintln!("📝 Converting to JSON...");
                let json = serde_json::json!({
                    "transcription": {
                        "text": markdown,
                        "language": language.unwrap_or("a
metadata function · rust · L330-L338 (9 LOC)
src/converters/video.rs
    /// Returns the descriptive metadata for the video transcription converter,
    /// including its two external dependencies (`ffmpeg` and `whisper`).
    fn metadata(&self) -> ConverterMetadata {
        let name = String::from("Video Transcription Converter");
        let version = String::from(env!("CARGO_PKG_VERSION"));
        let description = String::from("Video to text converter using FFmpeg + Whisper ASR");
        let external_deps = vec![String::from("ffmpeg"), String::from("whisper")];

        ConverterMetadata {
            name,
            version,
            description,
            external_deps,
        }
    }
read_xlsx function · rust · L35-L47 (13 LOC)
src/converters/xlsx.rs
    /// Loads an XLSX workbook from `path` using `umya_spreadsheet`.
    ///
    /// Progress is reported on stderr, including the number of sheets found.
    ///
    /// # Errors
    /// Returns an `xlsx-parser` engine error carrying the reader's message when
    /// the file cannot be read.
    fn read_xlsx(&self, path: &Path) -> Result<umya_spreadsheet::Spreadsheet> {
        eprintln!("📊 Reading XLSX file (umya-spreadsheet)...");

        let workbook = umya_spreadsheet::reader::xlsx::read(path).map_err(|err| {
            let detail = format!("Failed to read XLSX: {}", err);
            crate::TransmutationError::engine_error("xlsx-parser", detail)
        })?;

        let sheet_count = workbook.get_sheet_count();
        eprintln!("      ✓ Found {} sheets", sheet_count);

        Ok(workbook)
    }
Repobility's GitHub App fixes findings like these · https://github.com/apps/repobility-bot
to_markdown function · rust · L50-L101 (52 LOC)
src/converters/xlsx.rs
    fn to_markdown(&self, book: &umya_spreadsheet::Spreadsheet) -> String {
        let mut markdown = String::new();
        markdown.push_str("# Spreadsheet\n\n");

        for (idx, sheet) in book.get_sheet_collection().iter().enumerate() {
            let sheet_name = sheet.get_name();
            markdown.push_str(&format!("## Sheet {}: {}\n\n", idx + 1, sheet_name));

            // Get sheet dimensions
            let highest_row = sheet.get_highest_row();
            let highest_col = sheet.get_highest_column();

            if highest_row == 0 || highest_col == 0 {
                markdown.push_str("*(Empty sheet)*\n\n");
                continue;
            }

            // Build table
            for row in 1..=highest_row {
                if row == 1 {
                    // Header row
                    markdown.push('|');
                    for col in 1..=highest_col {
                        let cell = sheet.get_cell((col, row));
                        let value = 
to_csv function · rust · L104-L131 (28 LOC)
src/converters/xlsx.rs
    fn to_csv(&self, book: &umya_spreadsheet::Spreadsheet, delimiter: char) -> String {
        let mut csv = String::new();

        // Get first sheet
        if let Some(sheet) = book.get_sheet_collection().first() {
            let highest_row = sheet.get_highest_row();
            let highest_col = sheet.get_highest_column();

            for row in 1..=highest_row {
                let mut values = Vec::new();
                for col in 1..=highest_col {
                    let cell = sheet.get_cell((col, row));
                    let value = cell.map(|c| c.get_value().to_string()).unwrap_or_default();

                    // Quote values with commas
                    if value.contains(delimiter) || value.contains('"') {
                        values.push(format!("\"{}\"", value.replace('"', "\"\"")));
                    } else {
                        values.push(value);
                    }
                }
                csv.push_str(&values.join(&delimiter.to_string(
to_json function · rust · L134-L172 (39 LOC)
src/converters/xlsx.rs
    fn to_json(&self, book: &umya_spreadsheet::Spreadsheet) -> Result<String> {
        use serde_json::json;

        let mut sheets_json = Vec::new();

        for sheet in book.get_sheet_collection() {
            let sheet_name = sheet.get_name();
            let highest_row = sheet.get_highest_row();
            let highest_col = sheet.get_highest_column();

            let mut rows = Vec::new();

            for row in 1..=highest_row {
                let mut row_data = Vec::new();
                for col in 1..=highest_col {
                    let cell = sheet.get_cell((col, row));
                    let value = cell.map(|c| c.get_value().to_string()).unwrap_or_default();
                    row_data.push(value);
                }
                rows.push(row_data);
            }

            sheets_json.push(json!({
                "name": sheet_name,
                "rows": rows,
                "row_count": highest_row,
                "col_count": highest_col,
          
output_formats function · rust · L186-L202 (17 LOC)
src/converters/xlsx.rs
    /// Lists the output formats this converter advertises: Markdown
    /// (single document, LLM-optimized), comma-delimited CSV with headers, and
    /// structured JSON with metadata.
    fn output_formats(&self) -> Vec<OutputFormat> {
        let markdown = OutputFormat::Markdown {
            split_pages: false,
            optimize_for_llm: true,
        };
        let csv = OutputFormat::Csv {
            delimiter: ',',
            include_headers: true,
        };
        let json = OutputFormat::Json {
            structured: true,
            include_metadata: true,
        };

        vec![markdown, csv, json]
    }
convert function · rust · L203-L273 (71 LOC)
src/converters/xlsx.rs
    async fn convert(
        &self,
        input: &Path,
        output_format: OutputFormat,
        _options: ConversionOptions,
    ) -> Result<ConversionResult> {
        eprintln!("🔄 XLSX Conversion (Pure Rust)");
        eprintln!("   XLSX (ZIP) → XML Parsing → {:?}", output_format);
        eprintln!();

        // Read XLSX file
        let book = self.read_xlsx(input)?;

        // Convert to requested format
        let output_data = match output_format {
            OutputFormat::Markdown { .. } => {
                eprintln!("📝 Converting to Markdown tables...");
                self.to_markdown(&book).into_bytes()
            }
            OutputFormat::Csv { delimiter, .. } => {
                eprintln!("📝 Converting to CSV (delimiter: '{}')...", delimiter);
                self.to_csv(&book, delimiter).into_bytes()
            }
            OutputFormat::Json { .. } => {
                eprintln!("📝 Converting to JSON...");
                self.to_json(&book)?.into_b
metadata function · rust · L274-L284 (11 LOC)
src/converters/xlsx.rs
    /// Returns the descriptive metadata for the XLSX converter. The version is
    /// the crate version captured at compile time; there are no external
    /// dependencies.
    ///
    /// NOTE(review): the description advertises XML output, while the
    /// `output_formats` list for this converter names only Markdown, CSV and
    /// JSON — confirm which is accurate.
    fn metadata(&self) -> ConverterMetadata {
        let name = String::from("XLSX Converter");
        let version = String::from(env!("CARGO_PKG_VERSION"));
        let description = String::from(
            "XLSX to Markdown/CSV/JSON/XML converter (pure Rust, no LibreOffice needed)",
        );

        ConverterMetadata {
            name,
            version,
            description,
            external_deps: Vec::new(),
        }
    }
xml_to_json function · rust · L34-L48 (15 LOC)
src/converters/xml.rs
    /// Converts an XML string into pretty-printed JSON.
    ///
    /// The XML is deserialized into a generic `serde_json::Value` through
    /// `quick_xml`'s serde support and then re-serialized as indented JSON.
    ///
    /// # Errors
    /// Returns an `xml-parser` engine error when the XML cannot be
    /// deserialized, and propagates any JSON serialization error.
    fn xml_to_json(&self, xml: &str) -> Result<String> {
        // Parse XML to generic Value
        let parsed = quick_xml::de::from_str::<serde_json::Value>(xml).map_err(|err| {
            let detail = format!("Failed to parse XML: {}", err);
            crate::TransmutationError::engine_error("xml-parser", detail)
        })?;

        // Convert to pretty JSON
        let pretty = serde_json::to_string_pretty(&parsed)?;
        Ok(pretty)
    }
xml_to_markdown function · rust · L51-L94 (44 LOC)
src/converters/xml.rs
    fn xml_to_markdown(&self, xml: &str) -> Result<String> {
        use quick_xml::Reader;
        use quick_xml::events::Event;

        let mut reader = Reader::from_str(xml);
        reader.config_mut().trim_text(true);

        let mut markdown = String::new();
        markdown.push_str("# XML Document\n\n");

        let mut current_element = String::new();
        let mut text_parts = Vec::new();
        let mut buf = Vec::new();

        loop {
            match reader.read_event_into(&mut buf) {
                Ok(Event::Start(e)) => {
                    current_element = String::from_utf8_lossy(e.name().as_ref()).to_string();
                }
                Ok(Event::Text(e)) => {
                    if let Ok(text) = e.unescape() {
                        let content = text.trim();
                        if !content.is_empty() && !current_element.is_empty() {
                            text_parts.push(format!("**{}**: {}", current_element, content));
                   
Source: Repobility analyzer · https://repobility.com
output_formats function · rust · L108-L120 (13 LOC)
src/converters/xml.rs
    /// Lists the output formats this converter advertises: Markdown
    /// (single document, LLM-optimized) and structured JSON without metadata.
    fn output_formats(&self) -> Vec<OutputFormat> {
        let markdown = OutputFormat::Markdown {
            split_pages: false,
            optimize_for_llm: true,
        };
        let json = OutputFormat::Json {
            structured: true,
            include_metadata: false,
        };

        vec![markdown, json]
    }
convert function · rust · L121-L192 (72 LOC)
src/converters/xml.rs
    async fn convert(
        &self,
        input: &Path,
        output_format: OutputFormat,
        _options: ConversionOptions,
    ) -> Result<ConversionResult> {
        eprintln!("🔄 XML Conversion (Pure Rust)");
        eprintln!("   XML → Parsing → {:?}", output_format);
        eprintln!();

        // Read XML file
        let xml_content = fs::read_to_string(input).await?;

        // Convert to requested format
        let output_data = match output_format {
            OutputFormat::Markdown { .. } => {
                eprintln!("📝 Converting to Markdown...");
                let markdown = self.xml_to_markdown(&xml_content)?;
                markdown.into_bytes()
            }
            OutputFormat::Json { .. } => {
                eprintln!("📝 Converting to JSON...");
                let json = self.xml_to_json(&xml_content)?;
                json.into_bytes()
            }
            _ => {
                return Err(crate::TransmutationError::UnsupportedFormat(fo
metadata function · rust · L193-L201 (9 LOC)
src/converters/xml.rs
    /// Returns the descriptive metadata for the XML converter. The version is
    /// the crate version captured at compile time; there are no external
    /// dependencies.
    fn metadata(&self) -> ConverterMetadata {
        let name = String::from("XML Converter");
        let version = String::from(env!("CARGO_PKG_VERSION"));
        let description = String::from("XML to Markdown/JSON converter (pure Rust)");

        ConverterMetadata {
            name,
            version,
            description,
            external_deps: Vec::new(),
        }
    }
test_xml_to_json_basic function · rust · L215-L220 (6 LOC)
src/converters/xml.rs
    /// A minimal well-formed document must convert to JSON without error.
    fn test_xml_to_json_basic() {
        let document = "<root><item>test</item></root>";

        let outcome = XmlConverter::new().xml_to_json(document);

        assert!(outcome.is_ok());
    }
new function · rust · L28-L34 (7 LOC)
src/document/hierarchy_builder.rs
    /// Creates a builder with every post-processing pass switched on:
    /// section tree, list grouping and caption pairing.
    pub fn new() -> Self {
        let enabled = true;

        Self {
            enable_section_tree: enabled,
            enable_list_grouping: enabled,
            enable_caption_pairing: enabled,
        }
    }
build function · rust · L37-L60 (24 LOC)
src/document/hierarchy_builder.rs
    /// Assembles a `DoclingDocument` named `filename` from `items`.
    ///
    /// The items go through up to three passes, each applied only when its
    /// flag is set and always in this order: section tree, list grouping,
    /// caption pairing. The resulting items are then added to a new document
    /// one by one.
    ///
    /// # Errors
    /// Propagates the first error returned by any enabled pass.
    pub fn build(&self, filename: String, items: Vec<DocItem>) -> Result<DoclingDocument> {
        // Build section tree if enabled
        let items = if self.enable_section_tree {
            self.build_section_tree(items)?
        } else {
            items
        };

        // Group consecutive list items if enabled
        let items = if self.enable_list_grouping {
            self.group_list_items(items)?
        } else {
            items
        };

        // Pair captions with figures/tables if enabled
        let items = if self.enable_caption_pairing {
            self.pair_captions(items)?
        } else {
            items
        };

        let mut doc = DoclingDocument::new(filename);
        items.into_iter().for_each(|item| {
            doc.add_item(item);
        });

        Ok(doc)
    }
build_section_tree function · rust · L65-L93 (29 LOC)
src/document/hierarchy_builder.rs
    /// Normalizes section-header levels so that no header is more than one
    /// level deeper than the header before it.
    ///
    /// Tracking starts at level 0. Each header's level is capped at
    /// `previous level + 1`, and the (possibly capped) level becomes the new
    /// reference. All other items pass through unchanged and in order.
    fn build_section_tree(&self, items: Vec<DocItem>) -> Result<Vec<DocItem>> {
        let mut current_level = 0;
        let mut normalized = Vec::with_capacity(items.len());

        for item in items {
            let item = match item {
                DocItem::SectionHeader(header) => {
                    // Ensure level progression is reasonable (no jumps > 1)
                    let level = header.level.min(current_level + 1);
                    current_level = level;

                    DocItem::SectionHeader(SectionHeaderItem { level, ..header })
                }
                other => other,
            };
            normalized.push(item);
        }

        Ok(normalized)
    }
group_list_items function · rust · L98-L137 (40 LOC)
src/document/hierarchy_builder.rs
    fn group_list_items(&self, items: Vec<DocItem>) -> Result<Vec<DocItem>> {
        let mut result = Vec::new();
        let mut current_list: Vec<ListItemData> = Vec::new();
        let mut prev_level = 0;

        for item in items {
            match item {
                DocItem::ListItem(ref list_item) => {
                    // Adjust level based on previous items
                    let adjusted_level = if list_item.level > prev_level + 1 {
                        prev_level + 1
                    } else {
                        list_item.level
                    };

                    prev_level = adjusted_level;

                    current_list.push(ListItemData {
                        level: adjusted_level,
                        ..list_item.clone()
                    });
                }
                _ => {
                    // Flush accumulated list items
                    for list_item in current_list.drain(..) {
                        result.push(Doc
All rows above produced by Repobility · https://repobility.com
pair_captions function · rust · L142-L244 (103 LOC)
src/document/hierarchy_builder.rs
    fn pair_captions(&self, items: Vec<DocItem>) -> Result<Vec<DocItem>> {
        if items.len() < 2 {
            return Ok(items);
        }

        let mut result = Vec::new();
        let mut i = 0;

        while i < items.len() {
            let item = &items[i];

            match item {
                &DocItem::Table(ref table) if table.caption.is_none() => {
                    // Check if next item is caption (caption after table)
                    if i + 1 < items.len() {
                        if let DocItem::Paragraph(ref text_item) = items[i + 1] {
                            if Self::is_likely_caption(&text_item.text) {
                                // Merge caption into table
                                let mut new_table = table.clone();
                                new_table.caption = Some(text_item.text.clone());
                                result.push(DocItem::Table(new_table));
                                i += 2; // Skip caption
              
is_likely_caption function · rust · L247-L258 (12 LOC)
src/document/hierarchy_builder.rs
    /// Heuristically decides whether `text` looks like a figure/table caption.
    ///
    /// Matching is case-insensitive. The text qualifies when it starts with a
    /// known caption prefix (`figure `, `fig. `, `table `, `image `, `chart `,
    /// `graph `), or when its lowercased form is under 100 bytes long and
    /// mentions `figure` or `table` anywhere.
    fn is_likely_caption(text: &str) -> bool {
        // Common caption patterns
        const PREFIXES: [&str; 6] = ["figure ", "fig. ", "table ", "image ", "chart ", "graph "];
        const KEYWORDS: [&str; 2] = ["figure", "table"];

        let lowered = text.to_lowercase();

        if PREFIXES.iter().any(|prefix| lowered.starts_with(*prefix)) {
            return true;
        }

        lowered.len() < 100 && KEYWORDS.iter().any(|keyword| lowered.contains(*keyword))
    }
add_relationship function · rust · L287-L293 (7 LOC)
src/document/hierarchy_builder.rs
    /// Records a relationship of type `rel_type` from `from` to `to`.
    ///
    /// Relationships are stored under the key `"{from}:{rel_type}"`; `to` is
    /// appended to that key's list, which is created on first use.
    pub fn add_relationship(&mut self, from: String, to: String, rel_type: &str) {
        // Reuse the owned `from` string as the start of the key.
        let mut key = from;
        key.push(':');
        key.push_str(rel_type);

        let targets = self.relationships.entry(key).or_default();
        targets.push(to);
    }
build_from_document function · rust · L302-L332 (31 LOC)
src/document/hierarchy_builder.rs
    pub fn build_from_document(&mut self, doc: &DoclingDocument) -> Result<()> {
        let mut current_section: Option<String> = None;
        let mut _current_figure: Option<String> = None;

        for (idx, item) in doc.items.iter().enumerate() {
            let item_ref = format!("item_{}", idx);

            match item {
                DocItem::SectionHeader(_) => {
                    current_section = Some(item_ref.clone());
                }
                DocItem::Paragraph(_) | DocItem::ListItem(_) => {
                    // Link to current section
                    if let Some(ref section) = current_section {
                        self.add_relationship(section.clone(), item_ref.clone(), "contains");
                    }
                }
                DocItem::Table(_) | DocItem::Picture(_) => {
                    _current_figure = Some(item_ref.clone());

                    // Link to current section
                    if let Some(ref section) = current_secti
test_is_likely_caption function · rust · L347-L356 (10 LOC)
src/document/hierarchy_builder.rs
    /// Typical caption openers are recognized; an ordinary sentence is not.
    fn test_is_likely_caption() {
        let captions = [
            "Figure 1: This is a test",
            "Table 2: Results",
            "Fig. 3: Sample data",
        ];
        for &caption in &captions {
            assert!(HierarchyBuilder::is_likely_caption(caption));
        }

        let plain_text = "This is a regular paragraph with no caption markers.";
        assert!(!HierarchyBuilder::is_likely_caption(plain_text));
    }
test_section_tree_validation function · rust · L359-L387 (29 LOC)
src/document/hierarchy_builder.rs
    /// A header that jumps from level 1 straight to level 5 must be pulled
    /// back to level 2.
    fn test_section_tree_validation() {
        let title = DocItem::Title(TextItem {
            text: "Title".to_string(),
            formatting: None,
            label: DocItemLabel::Title,
        });
        let first_section = DocItem::SectionHeader(SectionHeaderItem {
            text: "Section 1".to_string(),
            level: 1,
            formatting: None,
        });
        let jumping_section = DocItem::SectionHeader(SectionHeaderItem {
            text: "Section 1.1".to_string(),
            level: 5, // Invalid jump - should be corrected to 2
            formatting: None,
        });

        let normalized = HierarchyBuilder::new()
            .build_section_tree(vec![title, first_section, jumping_section])
            .unwrap();

        match &normalized[2] {
            // Should be corrected from 5 to 2
            DocItem::SectionHeader(header) => assert_eq!(header.level, 2),
            _ => panic!("Expected SectionHeader"),
        }
    }
test_caption_pairing function · rust · L390-L418 (29 LOC)
src/document/hierarchy_builder.rs
    /// A caption paragraph directly before a picture must be merged into the
    /// picture, leaving a single item.
    fn test_caption_pairing() {
        let caption = DocItem::Paragraph(TextItem {
            text: "Figure 1: A beautiful chart".to_string(),
            formatting: None,
            label: DocItemLabel::Caption,
        });
        let picture = DocItem::Picture(crate::document::types::PictureItem {
            caption: None,
            placeholder: "image".to_string(),
        });

        let paired = HierarchyBuilder::new()
            .pair_captions(vec![caption, picture])
            .unwrap();

        assert_eq!(paired.len(), 1); // Caption should be merged

        match &paired[0] {
            DocItem::Picture(picture) => {
                assert!(picture.caption.is_some());
                assert_eq!(
                    picture.caption.as_ref().unwrap(),
                    "Figure 1: A beautiful chart"
                );
            }
            _ => panic!("Expected Picture"),
        }
    }
default function · rust · L27-L34 (8 LOC)
src/document/page_assembler.rs
    /// Default options: text sanitization, heading detection, list detection
    /// and merging of adjacent text are all enabled.
    fn default() -> Self {
        let enabled = true;

        Self {
            enable_text_sanitization: enabled,
            enable_heading_detection: enabled,
            enable_list_detection: enabled,
            merge_adjacent_text: enabled,
        }
    }
Repobility · code-quality intelligence platform · https://repobility.com
new function · rust · L45-L50 (6 LOC)
src/document/page_assembler.rs
    /// Creates an assembler that uses `options` together with a freshly
    /// constructed `TextSanitizer`.
    pub fn new(options: PageAssemblerOptions) -> Self {
        let sanitizer = TextSanitizer::new();

        Self { options, sanitizer }
    }
assemble function · rust · L53-L67 (15 LOC)
src/document/page_assembler.rs
    /// Turns layout clusters into document items.
    ///
    /// Each cluster is processed in order and its items are appended to the
    /// result. When `merge_adjacent_text` is set in the options, the combined
    /// list is then passed through `merge_adjacent_text_items`.
    ///
    /// # Errors
    /// Propagates the first error from processing a cluster or from merging.
    pub fn assemble(&self, clusters: &[Cluster]) -> Result<Vec<DocItem>> {
        let mut collected: Vec<DocItem> = Vec::new();

        for cluster in clusters {
            collected.extend(self.process_cluster(cluster)?);
        }

        // Post-processing: merge adjacent text blocks if enabled
        let assembled = if self.options.merge_adjacent_text {
            self.merge_adjacent_text_items(collected)?
        } else {
            collected
        };

        Ok(assembled)
    }
process_cluster function · rust · L70-L89 (20 LOC)
src/document/page_assembler.rs
    fn process_cluster(&self, cluster: &Cluster) -> Result<Vec<DocItem>> {
        match cluster.label {
            DocItemLabel::Title => self.process_title(cluster),
            DocItemLabel::SectionHeader => self.process_section_header(cluster),
            DocItemLabel::Paragraph | DocItemLabel::Text => self.process_text(cluster),
            DocItemLabel::ListItem => self.process_list_item(cluster),
            DocItemLabel::Caption => self.process_caption(cluster),
            DocItemLabel::Footnote => self.process_footnote(cluster),
            DocItemLabel::PageHeader | DocItemLabel::PageFooter => {
                self.process_header_footer(cluster)
            }
            DocItemLabel::Table => self.process_table(cluster),
            DocItemLabel::Picture | DocItemLabel::Figure => self.process_picture(cluster),
            DocItemLabel::Code => self.process_code(cluster),
            DocItemLabel::Formula => self.process_formula(cluster),
            DocItemLabel::Checkbo
‹ prevpage 3 / 9next ›