← back to hivellm__transmutation

Function bodies 424 total

All specs Real LLM only Function bodies
test_character_normalization function · rust · L414-L419 (6 LOC)
src/document/text_utils.rs
    /// Checks that the fraction slash, the em dash and curly double quotes in
    /// the input come back as their plain ASCII counterparts.
    fn test_character_normalization() {
        let input = "Price: $100⁄month — \u{201C}special\u{201D} offer";
        let expected = "Price: $100/month - \"special\" offer";

        let cleaned = TextSanitizer::new().sanitize(input);

        assert_eq!(cleaned, expected);
    }
test_whitespace_normalization function · rust · L422-L427 (6 LOC)
src/document/text_utils.rs
    /// Checks that runs of several spaces collapse into a single space.
    fn test_whitespace_normalization() {
        let input = "Too    many     spaces";
        let expected = "Too many spaces";

        let cleaned = TextSanitizer::new().sanitize(input);

        assert_eq!(cleaned, expected);
    }
test_is_likely_heading function · rust · L430-L438 (9 LOC)
src/document/text_utils.rs
    /// Checks `is_likely_heading` against numbered/upper-case headings and
    /// against ordinary or overly long sentences.
    fn test_is_likely_heading() {
        let headings = vec!["1. Introduction", "CHAPTER 1", "1.2.3 Methods"];
        let non_headings = vec![
            "This is a regular sentence.",
            "This is a very long text that goes on and on and definitely should not be considered a heading because it's way too long.",
        ];

        for candidate in headings {
            assert!(is_likely_heading(candidate));
        }
        for candidate in non_headings {
            assert!(!is_likely_heading(candidate));
        }
    }
test_extract_section_number function · rust · L441-L451 (11 LOC)
src/document/text_utils.rs
    fn test_extract_section_number() {
        assert_eq!(
            extract_section_number("1.2.3 Methods"),
            Some("1.2.3".to_string())
        );
        assert_eq!(
            extract_section_number("1. Introduction"),
            Some("1".to_string())
        );
        assert_eq!(extract_section_number("No number here"), None);
    }
test_calculate_section_level function · rust · L454-L459 (6 LOC)
src/document/text_utils.rs
    fn test_calculate_section_level() {
        assert_eq!(calculate_section_level("1"), 1);
        assert_eq!(calculate_section_level("1.2"), 2);
        assert_eq!(calculate_section_level("1.2.3"), 3);
        assert_eq!(calculate_section_level("1.2.3.4"), 4);
    }
test_ligature_normalization function · rust · L469-L474 (6 LOC)
src/document/text_utils.rs
    /// Checks the sanitizer's output for a sentence listing `ff`, `fi`, `fl`.
    fn test_ligature_normalization() {
        // NOTE(review): as shown here the input and the expected output are the
        // same ASCII text; the input may have been meant to contain ligature
        // glyphs — confirm against the repository.
        let input = "file with ligatures: ff, fi, fl";
        let expected = "file with ligatures: ff, fi, fl";

        let cleaned = TextSanitizer::new().sanitize(input);

        assert_eq!(cleaned, expected);
    }
intersection_over_union function · rust · L49-L67 (19 LOC)
src/document/types_extended.rs
    /// Ratio of the overlap area to the area of the union of `self` and `other`.
    ///
    /// Returns `0.0` when the boxes do not overlap (touching edges count as no
    /// overlap) or when the union area is not positive.
    pub fn intersection_over_union(&self, other: &BoundingBox) -> f64 {
        // Edges of the overlapping rectangle, if there is one.
        let left = self.l.max(other.l);
        let top = self.t.max(other.t);
        let right = self.r.min(other.r);
        let bottom = self.b.min(other.b);

        if right <= left || bottom <= top {
            return 0.0;
        }

        let overlap = (right - left) * (bottom - top);
        // The overlap is counted in both areas, so subtract it once.
        let combined = self.area() + other.area() - overlap;

        if combined > 0.0 { overlap / combined } else { 0.0 }
    }
Repobility — same analyzer, your code, free for public repos · /scan/
intersection_over_self function · rust · L70-L88 (19 LOC)
src/document/types_extended.rs
    /// Fraction of `self`'s area that is covered by `other`.
    ///
    /// Returns `0.0` when the boxes do not overlap (touching edges count as no
    /// overlap) or when `self.area()` is not positive.
    pub fn intersection_over_self(&self, other: &BoundingBox) -> f64 {
        // Edges of the overlapping rectangle, if there is one.
        let left = self.l.max(other.l);
        let top = self.t.max(other.t);
        let right = self.r.min(other.r);
        let bottom = self.b.min(other.b);

        if right <= left || bottom <= top {
            return 0.0;
        }

        let overlap = (right - left) * (bottom - top);
        let own_area = self.area();

        if own_area > 0.0 { overlap / own_area } else { 0.0 }
    }
test_bbox_intersection function · rust · L183-L189 (7 LOC)
src/document/types_extended.rs
    /// Two partially overlapping boxes must have an IoU strictly between 0 and 1.
    fn test_bbox_intersection() {
        let first = BoundingBox::new(0.0, 0.0, 10.0, 10.0, CoordOrigin::TopLeft);
        let second = BoundingBox::new(5.0, 5.0, 15.0, 15.0, CoordOrigin::TopLeft);

        let ratio = first.intersection_over_union(&second);

        assert!(ratio > 0.0 && ratio < 1.0);
    }
new function · rust · L18-L24 (7 LOC)
src/document/types.rs
    /// Creates a document called `name` with no items and an empty
    /// reference-to-index map.
    pub fn new(name: String) -> Self {
        let items = Vec::new();
        let items_by_ref = HashMap::new();

        Self {
            name,
            items,
            items_by_ref,
        }
    }
add_item function · rust · L25-L31 (7 LOC)
src/document/types.rs
    /// Appends `item` to the document and returns its reference string.
    ///
    /// The reference is `item_<index>`, where `<index>` is the position the
    /// item receives in `items`; the same index is recorded in `items_by_ref`.
    pub fn add_item(&mut self, item: DocItem) -> String {
        // The new item's index is the length before the push.
        let index = self.items.len();
        let reference = format!("item_{}", index);

        self.items_by_ref.insert(reference.clone(), index);
        self.items.push(item);

        reference
    }
test_add_item function · rust · L146-L156 (11 LOC)
src/document/types.rs
    /// Adding the first item stores it and yields the reference `item_0`.
    fn test_add_item() {
        let mut document = DoclingDocument::new("test".to_string());
        let paragraph = DocItem::Paragraph(TextItem {
            text: "Test".to_string(),
            formatting: None,
            label: DocItemLabel::Paragraph,
        });

        let reference = document.add_item(paragraph);

        assert_eq!(document.items.len(), 1);
        assert_eq!(reference, "item_0");
    }
test_formatting_default function · rust · L159-L164 (6 LOC)
src/document/types.rs
    /// Default formatting has bold, italic and underline all switched off.
    fn test_formatting_default() {
        let defaults = Formatting::default();

        assert!(!defaults.bold);
        assert!(!defaults.italic);
        assert!(!defaults.underline);
    }
test_table_cell_creation function · rust · L167-L175 (9 LOC)
src/document/types.rs
    /// A `TableCell` built from a struct literal exposes the values it was given.
    fn test_table_cell_creation() {
        let table_cell = TableCell {
            text: "Cell".to_string(),
            row_span: 1,
            col_span: 1,
        };

        assert_eq!(table_cell.text, "Cell");
        assert_eq!(table_cell.row_span, 1);
    }
test_section_header_item function · rust · L178-L186 (9 LOC)
src/document/types.rs
    /// A `SectionHeaderItem` built from a struct literal keeps its text and level.
    fn test_section_header_item() {
        let section = SectionHeaderItem {
            text: "Section 1".to_string(),
            level: 1,
            formatting: None,
        };

        assert_eq!(section.text, "Section 1");
        assert_eq!(section.level, 1);
    }
Repobility (the analyzer behind this table) · https://repobility.com
test_list_item_data function · rust · L189-L198 (10 LOC)
src/document/types.rs
    /// A `ListItemData` built from a struct literal keeps its text and its
    /// `enumerated` flag.
    fn test_list_item_data() {
        let bullet = ListItemData {
            text: "Item 1".to_string(),
            marker: "-".to_string(),
            enumerated: false,
            level: 0,
        };

        assert_eq!(bullet.text, "Item 1");
        assert!(!bullet.enumerated);
    }
test_table_data_creation function · rust · L201-L209 (9 LOC)
src/document/types.rs
    /// A `TableData` built from a struct literal keeps its row and column counts.
    fn test_table_data_creation() {
        let data = TableData {
            num_rows: 2,
            num_cols: 3,
            grid: Vec::new(),
        };

        assert_eq!(data.num_rows, 2);
        assert_eq!(data.num_cols, 3);
    }
parse_docling_json_to_markdown function · rust · L9-L88 (80 LOC)
src/engines/docling_json_parser.rs
pub fn parse_docling_json_to_markdown(json_str: &str) -> Result<String> {
    eprintln!(
        "[JSON Parser] Starting parse, input size: {} bytes",
        json_str.len()
    );

    let json: Value = serde_json::from_str(json_str).map_err(|e| {
        eprintln!("[JSON Parser] ERROR parsing JSON: {e}");
        e
    })?;

    eprintln!("[JSON Parser] JSON parsed successfully");

    let mut markdown = String::new();

    // Extract table of contents for heading detection
    let toc = extract_table_of_contents(&json);
    let heading_titles: HashMap<String, usize> = toc
        .iter()
        .map(|(title, level)| (title.to_lowercase(), *level))
        .collect();

    // Process each page
    if let Some(pages) = json["pages"].as_array() {
        for page in pages {
            if let Some(page_num) = page["page_number"].as_u64() {
                eprintln!("[JSON Parser] Processing page {page_num}");
            }

            // Extract text from lines
            if let Som
extract_table_of_contents function · rust · L91-L101 (11 LOC)
src/engines/docling_json_parser.rs
fn extract_table_of_contents(json: &Value) -> Vec<(String, usize)> {
    let mut toc = Vec::new();

    if let Some(annotations) = json["annotations"].as_object() {
        if let Some(toc_array) = annotations["table_of_contents"].as_array() {
            extract_toc_recursive(toc_array, &mut toc);
        }
    }

    toc
}
extract_line_text function · rust · L118-L154 (37 LOC)
src/engines/docling_json_parser.rs
fn extract_line_text(line: &Value, original: &Value) -> Option<String> {
    // Lines contain indices that point to cells
    if let Some(indices) = line["i"].as_array() {
        if indices.is_empty() {
            return None;
        }

        // Get cell data
        if let Some(cells_obj) = original["cells"].as_object() {
            if let Some(cell_data) = cells_obj["data"].as_array() {
                let mut text = String::new();

                // Extract characters from the cell range
                if let (Some(start), Some(end)) =
                    (indices[0].as_u64(), indices.get(1).and_then(|v| v.as_u64()))
                {
                    for idx in start as usize..end as usize {
                        if let Some(cell) = cell_data.get(idx) {
                            if let Some(cell_array) = cell.as_array() {
                                // Character is at index 12 in the cell array
                                if let Some(ch) = cell_array.get(12).
should_join_line function · rust · L157-L181 (25 LOC)
src/engines/docling_json_parser.rs
/// Decides whether `current` should be appended to `prev` as a continuation
/// of the same sentence.
///
/// Rules, applied to the last character of `prev`:
/// - `prev` empty: never join;
/// - lowercase letter, `,`, `;` or `-`: join;
/// - `.`, `!` or `?`: do not join;
/// - anything else: join only if `current` starts with a lowercase letter.
fn should_join_line(prev: &str, current: &str) -> bool {
    let tail = match prev.chars().next_back() {
        Some(ch) => ch,
        // Nothing to continue from.
        None => return false,
    };

    // Mid-word, mid-sentence or hyphenated line ends.
    if tail.is_lowercase() || matches!(tail, ',' | ';' | '-') {
        return true;
    }

    // Sentence-ending punctuation.
    if matches!(tail, '.' | '!' | '?') {
        return false;
    }

    // Undecided ending: let the start of the next line decide.
    current.chars().next().map_or(false, char::is_lowercase)
}
test_should_join_line function · rust · L188-L196 (9 LOC)
src/engines/docling_json_parser.rs
    /// Lines ending mid-sentence are joined; lines ending a sentence are not.
    fn test_should_join_line() {
        let joined = vec![
            ("This is a sentence that continues", "on the next line."),
            ("First part,", "second part."),
        ];
        let separate = vec![
            ("First sentence.", "Second sentence."),
            ("Question?", "Next question?"),
        ];

        for (prev, current) in joined {
            assert!(should_join_line(prev, current));
        }
        for (prev, current) in separate {
            assert!(!should_join_line(prev, current));
        }
    }
test_extract_toc function · rust · L199-L219 (21 LOC)
src/engines/docling_json_parser.rs
    /// A nested table of contents is flattened in document order, children
    /// directly after their parent.
    fn test_extract_toc() {
        let raw = r#"{
            "annotations": {
                "table_of_contents": [
                    {"title": "Introduction", "level": 0},
                    {"title": "Methods", "level": 0, "children": [
                        {"title": "Experiment 1", "level": 1}
                    ]}
                ]
            }
        }"#;
        let document: Value = serde_json::from_str(raw).unwrap();

        let entries = extract_table_of_contents(&document);

        assert_eq!(entries.len(), 3);
        assert_eq!(entries[0], ("Introduction".to_string(), 0));
        assert_eq!(entries[1], ("Methods".to_string(), 0));
        assert_eq!(entries[2], ("Experiment 1".to_string(), 1));
    }
If a scraper extracted this row, it came from Repobility (https://repobility.com)
open function · rust · L87-L122 (36 LOC)
src/engines/docling_parse_ffi.rs
    pub fn open(path: &Path) -> Result<Self> {
        #[cfg(feature = "docling-ffi")]
        {
            let path_str = path
                .to_str()
                .ok_or_else(|| TransmutationError::conversion_failed("Invalid file path"))?;

            let c_path = CString::new(path_str).map_err(|e| {
                TransmutationError::engine_error("docling-parse", format!("Invalid path: {e}"))
            })?;

            let mut handle: DoclingDocumentHandle = ptr::null_mut();

            unsafe {
                let result = docling_open_pdf(c_path.as_ptr(), &mut handle);

                if result != DoclingError::Ok {
                    let err_msg = CStr::from_ptr(docling_get_last_error())
                        .to_string_lossy()
                        .to_string();
                    return Err(TransmutationError::engine_error("docling-parse", err_msg));
                }
            }

            Ok(Self { handle })
        }

        #[cfg(not(feature = "docli
page_count function · rust · L125-L150 (26 LOC)
src/engines/docling_parse_ffi.rs
    /// Returns the number of pages the docling-parse library reports for this
    /// document.
    ///
    /// # Errors
    ///
    /// Returns an engine error when
    /// - `docling_get_page_count` does not return `DoclingError::Ok` (the
    ///   message comes from `docling_get_last_error`),
    /// - the reported count is negative, or
    /// - the crate was built without the `docling-ffi` feature.
    pub fn page_count(&self) -> Result<usize> {
        #[cfg(feature = "docling-ffi")]
        {
            let mut count: c_int = 0;

            // SAFETY: `self.handle` is the handle stored by `open` after a
            // successful `docling_open_pdf`, and `count` is a live local.
            // NOTE(review): assumes `docling_get_last_error` never returns a
            // null pointer — confirm against the C API.
            unsafe {
                let result = docling_get_page_count(self.handle, &mut count);
                if result != DoclingError::Ok {
                    let err_msg = CStr::from_ptr(docling_get_last_error())
                        .to_string_lossy()
                        .to_string();
                    return Err(TransmutationError::engine_error("docling-parse", err_msg));
                }
            }

            // A plain `as usize` cast would turn a negative count into a huge
            // positive number; report it as an engine error instead.
            if count < 0 {
                return Err(TransmutationError::engine_error(
                    "docling-parse",
                    format!("Invalid page count reported: {count}"),
                ));
            }

            Ok(count as usize)
        }

        #[cfg(not(feature = "docling-ffi"))]
        {
            Err(TransmutationError::engine_error(
                "docling-parse",
                "Feature not enabled",
            ))
        }
    }
export_markdown function · rust · L153-L182 (30 LOC)
src/engines/docling_parse_ffi.rs
    /// Exports the document as Markdown through the docling-parse library.
    ///
    /// The C string produced by `docling_export_markdown` is copied into an
    /// owned `String` (invalid UTF-8 is replaced lossily) and then released
    /// with `docling_free_string`.
    ///
    /// # Errors
    ///
    /// Returns an engine error when
    /// - `docling_export_markdown` does not return `DoclingError::Ok` (the
    ///   message comes from `docling_get_last_error`),
    /// - the call reports success but leaves the output pointer null, or
    /// - the crate was built without the `docling-ffi` feature.
    pub fn export_markdown(&self) -> Result<String> {
        #[cfg(feature = "docling-ffi")]
        {
            let mut markdown_ptr: *mut c_char = ptr::null_mut();

            // SAFETY: `self.handle` is the handle stored by `open` after a
            // successful `docling_open_pdf`; `markdown_ptr` is checked for
            // null before it is read.
            // NOTE(review): assumes `docling_get_last_error` never returns a
            // null pointer and that a non-null `markdown_ptr` is a valid
            // NUL-terminated string — confirm against the C API.
            unsafe {
                let result = docling_export_markdown(self.handle, &mut markdown_ptr);
                if result != DoclingError::Ok {
                    let err_msg = CStr::from_ptr(docling_get_last_error())
                        .to_string_lossy()
                        .to_string();
                    return Err(TransmutationError::engine_error("docling-parse", err_msg));
                }

                // `CStr::from_ptr` requires a non-null pointer; the out-pointer
                // starts as null and stays null if the library never set it.
                if markdown_ptr.is_null() {
                    return Err(TransmutationError::engine_error(
                        "docling-parse",
                        "Markdown export returned a null pointer",
                    ));
                }

                // Copy before freeing: the `CStr` borrows the C-side buffer.
                let markdown = CStr::from_ptr(markdown_ptr).to_string_lossy().to_string();

                docling_free_string(markdown_ptr);

                Ok(markdown)
            }
        }

        #[cfg(not(feature = "docling-ffi"))]
        {
            Err(TransmutationError::engine_error(
                "docling-parse",
                "Feature not enabled",
            ))
        }
    }
drop function · rust · L186-L191 (6 LOC)
src/engines/docling_parse_ffi.rs
    /// Closes the underlying PDF handle via `docling_close_pdf` when the
    /// `docling-ffi` feature is enabled; does nothing otherwise.
    fn drop(&mut self) {
        #[cfg(feature = "docling-ffi")]
        {
            // SAFETY: `self.handle` is the handle stored by `open` after a
            // successful `docling_open_pdf`.
            // NOTE(review): assumes nothing else closes the handle first —
            // confirm.
            unsafe {
                docling_close_pdf(self.handle);
            }
        }
    }
get_layout_clusters function · rust · L14-L148 (135 LOC)
src/engines/docling_python_bridge.rs
pub fn get_layout_clusters(pdf_path: &Path) -> Result<Vec<Cluster>> {
    eprintln!("🐍 Calling Python docling for layout analysis...");
    
    // Create a temporary Python script  
    let script = r#"
import sys
import json
from pathlib import Path
from docling.document_converter import DocumentConverter

try:
    # Convert PDF
    converter = DocumentConverter()
    pdf_path = Path(sys.argv[1])
    result = converter.convert(pdf_path)
    
    # Extract layout information
    clusters = []
    cluster_id = 0
    
    # Iterate through document items
    for item in result.document.items:
        # Get bounding box
        if hasattr(item, 'prov') and hasattr(item.prov, 'bbox'):
            bbox = item.prov.bbox
            
            cluster = {{
                "id": cluster_id,
                "label": str(item.label) if hasattr(item, 'label') else "text",
                "bbox": {{
                    "l": bbox.l,
                    "t": bbox.t,
                    "r": bbox.
parse_label function · rust · L149-L164 (16 LOC)
src/engines/docling_python_bridge.rs
/// Maps a label string to a `DocItemLabel`, ignoring letter case.
///
/// Both the snake_case and the run-together spellings of multi-word labels
/// are accepted, and `figure` is treated as `picture`. Any label not listed
/// below becomes `DocItemLabel::Text`.
fn parse_label(label_str: &str) -> DocItemLabel {
    let normalized = label_str.to_lowercase();

    match normalized.as_str() {
        "title" => DocItemLabel::Title,
        "sectionheader" | "section_header" => DocItemLabel::SectionHeader,
        "paragraph" => DocItemLabel::Paragraph,
        "listitem" | "list_item" => DocItemLabel::ListItem,
        "table" => DocItemLabel::Table,
        "figure" | "picture" => DocItemLabel::Picture,
        "code" => DocItemLabel::Code,
        "formula" => DocItemLabel::Formula,
        "caption" => DocItemLabel::Caption,
        "footnote" => DocItemLabel::Footnote,
        // Unknown labels fall back to plain text.
        _ => DocItemLabel::Text,
    }
}
new function · rust · L23-L29 (7 LOC)
src/engines/layout_analyzer.rs
    /// Creates an analyzer with the built-in thresholds: base font size 10.0,
    /// heading font threshold 12.0 and paragraph Y gap 10.0.
    pub fn new() -> Self {
        let base_font_size = 10.0;
        let heading_font_threshold = 12.0;
        let paragraph_y_gap = 10.0;

        Self {
            heading_font_threshold,
            paragraph_y_gap,
            base_font_size,
        }
    }
analyze function · rust · L32-L65 (34 LOC)
src/engines/layout_analyzer.rs
    /// Classifies every non-blank text block and merges multi-line elements.
    ///
    /// Each block whose trimmed text is non-empty is typed with
    /// `detect_block_type` and given a heading level by `get_heading_level`;
    /// the resulting list is then passed through `merge_multiline_elements`.
    /// An empty input yields an empty vector.
    pub fn analyze(&self, blocks: &[TextBlock]) -> Vec<AnalyzedBlock> {
        if blocks.is_empty() {
            return Vec::new();
        }

        let mut classified = Vec::new();

        for (index, block) in blocks.iter().enumerate() {
            let text = block.text.trim();

            // Whitespace-only blocks carry no content.
            if text.is_empty() {
                continue;
            }

            let kind = self.detect_block_type(block, blocks, index);
            let level = self.get_heading_level(&kind, block.font_size);

            classified.push(AnalyzedBlock {
                block_type: kind,
                content: text.to_string(),
                level,
                font_size: block.font_size,
                y_position: block.y,
            });
        }

        // Post-process to merge multi-line elements.
        self.merge_multiline_elements(classified)
    }
Repobility · severity-and-effort ranking · https://repobility.com
detect_block_type function · rust · L68-L113 (46 LOC)
src/engines/layout_analyzer.rs
    fn detect_block_type(
        &self,
        block: &TextBlock,
        _all_blocks: &[TextBlock],
        _index: usize,
    ) -> BlockType {
        let content = block.text.trim();

        // Check for formulas (high math symbol density)
        if self.is_formula(content) {
            return BlockType::Formula;
        }

        // Check for image captions
        if self.is_image_caption(content) {
            return BlockType::Image;
        }

        // Check for table content
        if content.contains('|') || content.contains('\t') {
            return BlockType::Table;
        }

        // Check for list items
        if self.is_list_item(content) {
            return BlockType::ListItem;
        }

        // Check for headings based on font size
        if block.font_size > self.heading_font_threshold {
            return self.classify_heading(content, block.font_size);
        }

        // Check for numbered sections like "1 Introduction"
        if self.is_sect
classify_heading function · rust · L116-L129 (14 LOC)
src/engines/layout_analyzer.rs
    /// Picks the heading kind for a block already known to use a heading-sized
    /// font.
    ///
    /// - font size >= 18.0, or text containing "Attention Is All You Need":
    ///   `Title`;
    /// - font size >= 14.0: `Heading(1)`;
    /// - otherwise: `Heading(2)`.
    fn classify_heading(&self, content: &str, font_size: f32) -> BlockType {
        let is_title = font_size >= 18.0 || content.contains("Attention Is All You Need");

        if is_title {
            BlockType::Title
        } else if font_size >= 14.0 {
            // Major section.
            BlockType::Heading(1)
        } else {
            // Subsection.
            BlockType::Heading(2)
        }
    }
is_section_heading function · rust · L132-L173 (42 LOC)
src/engines/layout_analyzer.rs
    fn is_section_heading(&self, content: &str) -> bool {
        let trimmed = content.trim();

        // Keywords that indicate sections (exact match at start)
        let section_keywords = [
            "Abstract",
            "Introduction",
            "Background",
            "Conclusion",
            "Acknowledgements",
            "References",
            "Appendix",
            "Attention Visualizations",
        ];
        if section_keywords
            .iter()
            .any(|&kw| trimmed == kw || trimmed.starts_with(&format!("{} ", kw)))
        {
            return true;
        }

        // Pattern: single digit + space + capitalized word (like "1 Introduction")
        if let Some(first_char) = trimmed.chars().next() {
            if first_char.is_numeric() {
                let parts: Vec<&str> = trimmed.splitn(2, ' ').collect();
                if parts.len() == 2 {
                    let number_part = parts[0];
                    let text_part = parts[1];

 
is_subsection_heading function · rust · L176-L195 (20 LOC)
src/engines/layout_analyzer.rs
    /// Returns `true` when `content` looks like a numbered subsection heading
    /// such as "3.1 Title" or "3.2.1 Title".
    ///
    /// Requirements: the trimmed text is shorter than 150 bytes; its first
    /// whitespace-separated token is shorter than 10 bytes and contains at
    /// least one dot and at least two numeric characters; and more than one
    /// byte of text follows that token.
    fn is_subsection_heading(&self, content: &str) -> bool {
        let text = content.trim();

        // Long blocks are body text, not headings.
        if text.len() >= 150 {
            return false;
        }

        let number = text.split_whitespace().next().unwrap_or("");
        let dots = number.matches('.').count();
        let digits = number.chars().filter(|ch| ch.is_numeric()).count();

        let looks_numbered = dots >= 1 && digits >= 2 && number.len() < 10;
        // There must be a title after the number.
        let has_title = text.len() > number.len() + 1;

        looks_numbered && has_title
    }
is_list_item function · rust · L198-L227 (30 LOC)
src/engines/layout_analyzer.rs
    fn is_list_item(&self, content: &str) -> bool {
        let trimmed = content.trim();

        // Bullet points
        if trimmed.starts_with('•') || trimmed.starts_with('▪') || trimmed.starts_with('◦') {
            return true;
        }

        // Dash bullets
        if trimmed.starts_with("- ") || trimmed.starts_with("– ") || trimmed.starts_with("— ") {
            return true;
        }

        // Numbered lists (1., a., i., etc.)
        if let Some(first_word) = trimmed.split_whitespace().next() {
            if first_word.ends_with('.') || first_word.ends_with(')') {
                let without_punct = first_word.trim_end_matches(&['.', ')'][..]);
                // Roman numerals, letters, or numbers
                if without_punct
                    .chars()
                    .all(|c| c.is_numeric() || c.is_alphabetic())
                    && without_punct.len() <= 3
                {
                    return true;
                }
            }
        }

   
is_formula function · rust · L230-L264 (35 LOC)
src/engines/layout_analyzer.rs
    fn is_formula(&self, content: &str) -> bool {
        // Math Unicode characters
        let math_chars = [
            '∑', '∫', '√', '∈', '∉', '⊂', '⊃', '≤', '≥', '≠', '≈', '∞', '∂', '∇', '×', '÷', '±',
            'α', 'β', 'γ', 'δ', 'θ', 'λ', 'μ', 'π', 'σ', 'ω',
        ];

        let math_count = content.chars().filter(|c| math_chars.contains(c)).count();
        let total_chars = content.chars().count();

        // High density of math symbols (>10%)
        if total_chars > 0 && (math_count as f32 / total_chars as f32) > 0.1 {
            return true;
        }

        // Common LaTeX-like patterns that weren't converted
        if content.contains("\\frac") || content.contains("\\sum") || content.contains("\\int") {
            return true;
        }

        // Equations with equals and operators
        if content.contains('=')
            && (content.matches('+').count()
                + content.matches('*').count()
                + content.matches('/').count())
   
is_image_caption function · rust · L267-L273 (7 LOC)
src/engines/layout_analyzer.rs
    /// Returns `true` when `content`, compared case-insensitively, starts with
    /// "figure", "fig.", "image" or "diagram".
    fn is_image_caption(&self, content: &str) -> bool {
        const CAPTION_PREFIXES: [&str; 4] = ["figure", "fig.", "image", "diagram"];

        let lowered = content.to_lowercase();
        CAPTION_PREFIXES
            .iter()
            .any(|prefix| lowered.starts_with(*prefix))
    }
get_heading_level function · rust · L276-L293 (18 LOC)
src/engines/layout_analyzer.rs
    /// Determines the heading level to record for a block.
    ///
    /// `Title` is level 1 and `Heading(n)` is level `n`. Every other block
    /// type gets a level from its font size alone: >= 18.0 gives 2, >= 14.0
    /// gives 3, >= 12.0 gives 4, and anything smaller gives `None`.
    fn get_heading_level(&self, block_type: &BlockType, font_size: f32) -> Option<usize> {
        match block_type {
            BlockType::Title => Some(1),
            BlockType::Heading(level) => Some(*level),
            // Not a heading type: derive the level from the font size.
            _ if font_size >= 18.0 => Some(2),
            _ if font_size >= 14.0 => Some(3),
            _ if font_size >= 12.0 => Some(4),
            _ => None,
        }
    }
Repobility — same analyzer, your code, free for public repos · /scan/
merge_multiline_elements function · rust · L296-L338 (43 LOC)
src/engines/layout_analyzer.rs
    fn merge_multiline_elements(&self, blocks: Vec<AnalyzedBlock>) -> Vec<AnalyzedBlock> {
        if blocks.is_empty() {
            return blocks;
        }

        let mut merged = Vec::new();
        let mut current: Option<AnalyzedBlock> = None;

        for block in blocks {
            match (&current, &block.block_type) {
                // Merge consecutive paragraphs if Y gap is small
                (Some(curr), BlockType::Paragraph)
                    if matches!(curr.block_type, BlockType::Paragraph) =>
                {
                    if let Some(ref mut c) = current {
                        // Check if should merge (close Y positions)
                        let y_diff = (c.y_position - block.y_position).abs();
                        if y_diff < self.paragraph_y_gap * 2.0 {
                            c.content.push(' ');
                            c.content.push_str(&block.content);
                        } else {
                            merged.push(curre
test_section_heading_detection function · rust · L388-L394 (7 LOC)
src/engines/layout_analyzer.rs
    /// Numbered sections and known keywords are section headings; ordinary
    /// text is not.
    fn test_section_heading_detection() {
        let layout = LayoutAnalyzer::new();

        for heading in vec!["1 Introduction", "2 Background", "Abstract"] {
            assert!(layout.is_section_heading(heading));
        }
        assert!(!layout.is_section_heading("This is regular text"));
    }
test_subsection_detection function · rust · L397-L402 (6 LOC)
src/engines/layout_analyzer.rs
    /// Dotted section numbers followed by a title are subsection headings;
    /// ordinary text is not.
    fn test_subsection_detection() {
        let layout = LayoutAnalyzer::new();

        for heading in vec!["3.1 Encoder and Decoder Stacks", "3.2.1 Scaled Dot-Product"] {
            assert!(layout.is_subsection_heading(heading));
        }
        assert!(!layout.is_subsection_heading("This is not a subsection"));
    }
test_list_detection function · rust · L405-L411 (7 LOC)
src/engines/layout_analyzer.rs
    /// Bullet, dash and numbered prefixes mark list items; ordinary text does not.
    fn test_list_detection() {
        let layout = LayoutAnalyzer::new();

        for entry in vec!["• First item", "- Second item", "1. Numbered item"] {
            assert!(layout.is_list_item(entry));
        }
        assert!(!layout.is_list_item("Regular text"));
    }
test_formula_detection function · rust · L414-L419 (6 LOC)
src/engines/layout_analyzer.rs
    /// Text dense in math symbols is a formula; ordinary text is not.
    fn test_formula_detection() {
        let layout = LayoutAnalyzer::new();

        for expression in vec!["x = ∑ᵢ yᵢ + √z", "α + β = γ"] {
            assert!(layout.is_formula(expression));
        }
        assert!(!layout.is_formula("This is regular text"));
    }
test_image_caption_detection function · rust · L422-L427 (6 LOC)
src/engines/layout_analyzer.rs
    /// "Figure" and "Fig." prefixes mark image captions; ordinary text does not.
    fn test_image_caption_detection() {
        let layout = LayoutAnalyzer::new();

        for caption in vec!["Figure 1: The Transformer", "Fig. 2: Attention mechanism"] {
            assert!(layout.is_image_caption(caption));
        }
        assert!(!layout.is_image_caption("Regular text"));
    }
new function · rust · L27-L37 (11 LOC)
src/engines/layout_postprocessor.rs
    fn new(elements: &[usize]) -> Self {
        let mut parent = HashMap::new();
        let mut rank = HashMap::new();

        for &elem in elements {
            parent.insert(elem, elem);
            rank.insert(elem, 0);
        }

        Self { parent, rank }
    }
find function · rust · L38-L45 (8 LOC)
src/engines/layout_postprocessor.rs
    /// Returns the root of the set containing `x`, re-pointing every node on
    /// the path from `x` directly at that root (path compression).
    ///
    /// # Panics
    ///
    /// Panics if `x`, or any node reached while walking up, has no entry in
    /// `parent`.
    fn find(&mut self, x: usize) -> usize {
        // First pass: walk up until a node that is its own parent.
        let mut root = x;
        while self.parent[&root] != root {
            root = self.parent[&root];
        }

        // Second pass: point every node on the path straight at the root.
        let mut node = x;
        while node != root {
            let next = self.parent[&node];
            self.parent.insert(node, root);
            node = next;
        }

        root
    }
Repobility (the analyzer behind this table) · https://repobility.com
union function · rust · L46-L66 (21 LOC)
src/engines/layout_postprocessor.rs
    /// Merges the sets containing `x` and `y` (union by rank).
    ///
    /// The root with the lower rank is attached under the root with the
    /// higher rank. On a tie, `y`'s root goes under `x`'s root and the rank of
    /// `x`'s root grows by one. Does nothing if both are already in one set.
    fn union(&mut self, x: usize, y: usize) {
        let root_x = self.find(x);
        let root_y = self.find(y);
        if root_x == root_y {
            return;
        }

        let rank_x = self.rank[&root_x];
        let rank_y = self.rank[&root_y];

        if rank_x < rank_y {
            self.parent.insert(root_x, root_y);
        } else {
            self.parent.insert(root_y, root_x);
            // Equal ranks: the merged tree is one level taller.
            if rank_x == rank_y {
                self.rank.insert(root_x, rank_x + 1);
            }
        }
    }
get_groups function · rust · L67-L80 (14 LOC)
src/engines/layout_postprocessor.rs
    /// Groups all known elements by the root of their set.
    ///
    /// Returns a map from each root to the elements whose `find` result is
    /// that root (the root itself included).
    fn get_groups(&mut self) -> HashMap<usize, Vec<usize>> {
        // Snapshot the keys first: `find` needs `&mut self` and rewrites
        // `parent` while compressing paths.
        let members: Vec<usize> = self.parent.keys().copied().collect();

        members
            .into_iter()
            .fold(HashMap::new(), |mut by_root, member| {
                let root = self.find(member);
                by_root.entry(root).or_default().push(member);
                by_root
            })
    }
new function · rust · L103-L115 (13 LOC)
src/engines/layout_postprocessor.rs
    /// Builds the spatial index by bulk-loading one `ClusterRect` (id plus
    /// bounding box) per cluster into an `RTree`.
    fn new(clusters: &[Cluster]) -> Self {
        let to_rect = |cluster: &Cluster| ClusterRect {
            id: cluster.id,
            bbox: cluster.bbox,
        };

        let rects: Vec<ClusterRect> = clusters.iter().map(to_rect).collect();

        Self {
            rtree: RTree::bulk_load(rects),
        }
    }
‹ prevpage 5 / 9next ›