Function bodies 424 total

extract_text function · rust · L92-L142 (51 LOC)

src/document/page_assembler.rs

    fn extract_text(&self, cluster: &Cluster) -> String {
        // Sort cells by position (Y then X)
        let mut cells = cluster.cells.clone();
        cells.sort_by(|a, b| {
            let y_cmp = a.bbox.t.partial_cmp(&b.bbox.t).unwrap();
            if y_cmp == std::cmp::Ordering::Equal {
                a.bbox.l.partial_cmp(&b.bbox.l).unwrap()
            } else {
                y_cmp
            }
        });

        // Smart joining: docling-parse returns one character per cell
        // We need to detect word boundaries based on horizontal distance
        let mut text = String::new();
        let mut prev_x_end = 0.0;
        let mut prev_y = 0.0;

        for cell in &cells {
            let gap_x = cell.bbox.l - prev_x_end;
            let gap_y = (cell.bbox.t - prev_y).abs();
            let cell_width = cell.bbox.r - cell.bbox.l;

            // New line if vertical gap is significant
            if prev_y > 0.0 && gap_y > 5.0 {
                if !text.ends_with('

process_title function · rust · L145-L153 (9 LOC)

src/document/page_assembler.rs

    /// Turns a title cluster into a single `DocItem::Title`.
    ///
    /// The title text is whatever `extract_text` reconstructs from the
    /// cluster's cells; no formatting is attached.
    fn process_title(&self, cluster: &Cluster) -> Result<Vec<DocItem>> {
        let title = TextItem {
            text: self.extract_text(cluster),
            formatting: None,
            label: DocItemLabel::Title,
        };

        Ok(vec![DocItem::Title(title)])
    }

process_section_header function · rust · L156-L172 (17 LOC)

src/document/page_assembler.rs

    /// Turns a section-header cluster into a single `DocItem::SectionHeader`.
    ///
    /// The heading level is derived from a leading section number when one is
    /// found in the text; otherwise it defaults to 2.
    fn process_section_header(&self, cluster: &Cluster) -> Result<Vec<DocItem>> {
        let text = self.extract_text(cluster);

        let level = match extract_section_number(&text) {
            // A numbered heading carries its own depth.
            Some(section_num) => calculate_section_level(&section_num),
            // No number available: fall back to the default level.
            None => 2,
        };

        let header = SectionHeaderItem {
            text,
            level,
            formatting: None,
        };
        Ok(vec![DocItem::SectionHeader(header)])
    }

process_text function · rust · L175-L199 (25 LOC)

src/document/page_assembler.rs

    /// Turns a text cluster into a paragraph, or into a section header when
    /// heading detection is enabled and the text looks like a heading.
    fn process_text(&self, cluster: &Cluster) -> Result<Vec<DocItem>> {
        let text = self.extract_text(cluster);

        // The layout model may label a heading as plain text; only second-guess
        // it when the option is switched on.
        let promote = self.options.enable_heading_detection && is_likely_heading(&text);
        if !promote {
            return Ok(vec![DocItem::Paragraph(TextItem {
                text,
                formatting: None,
                label: DocItemLabel::Paragraph,
            })]);
        }

        // Promoted heading: use the section number for the level, else default to 2.
        let level = match extract_section_number(&text) {
            Some(section_num) => calculate_section_level(&section_num),
            None => 2,
        };

        Ok(vec![DocItem::SectionHeader(SectionHeaderItem {
            text,
            level,
            formatting: None,
        })])
    }

process_list_item function · rust · L202-L221 (20 LOC)

src/document/page_assembler.rs

    /// Turns a list-item cluster into a single `DocItem::ListItem`.
    ///
    /// The leading marker reported by `detect_list_marker` is removed from the
    /// item text exactly once. When no marker is detected the text is kept
    /// unchanged and `"-"` is used as the marker.
    fn process_list_item(&self, cluster: &Cluster) -> Result<Vec<DocItem>> {
        let text = self.extract_text(cluster);

        // Detect marker and type
        let (marker, enumerated) = self.detect_list_marker(&text);

        // Remove the marker from the text. The marker is detected on the
        // left-trimmed text, so strip it from the left-trimmed text as well, and
        // strip it only once: `trim_start_matches` would also eat repeated
        // occurrences (e.g. "1.1. foo" would lose "1.1." instead of "1.").
        let text_without_marker = match marker.as_deref() {
            Some(m) => {
                let body = text.trim_start();
                body.strip_prefix(m)
                    .unwrap_or(body)
                    .trim_start()
                    .to_string()
            }
            None => text,
        };

        Ok(vec![DocItem::ListItem(ListItemData {
            text: text_without_marker,
            marker: marker.unwrap_or_else(|| "-".to_string()),
            enumerated,
            level: 0, // TODO: Detect nesting level from indentation
        })])
    }

detect_list_marker function · rust · L224-L241 (18 LOC)

src/document/page_assembler.rs

    /// Detects a list marker at the start of `text` (leading whitespace ignored).
    ///
    /// Returns `(Some(marker), enumerated)` when a marker is found:
    /// * bullet markers (`-`, `•`, `·` followed by a space) yield the bullet
    ///   character and `false`;
    /// * numbered markers (`1.`, `2)`, ...) yield the digits plus delimiter and
    ///   `true`.
    ///
    /// Returns `(None, false)` when the text does not start with a marker.
    fn detect_list_marker(&self, text: &str) -> (Option<String>, bool) {
        let trimmed = text.trim_start();

        // Bullet markers
        if trimmed.starts_with("- ") || trimmed.starts_with("• ") || trimmed.starts_with("· ") {
            return (trimmed.chars().next().map(|bullet| bullet.to_string()), false);
        }

        // Numbered markers (1., 2., 1), 2), etc.)
        if let Some(pos) = trimmed.find(|c| c == '.' || c == ')') {
            let digits = &trimmed[..pos];
            // The delimiter is a single-byte ASCII char, so `pos + 1` is a valid boundary.
            let rest = &trimmed[pos + 1..];
            // Like the bullet markers, a numbered marker must be followed by
            // whitespace (or end the text). Otherwise decimals and dotted
            // identifiers such as "3.14 is pi" would be taken for list items.
            let terminated = rest.chars().next().map_or(true, char::is_whitespace);

            if !digits.is_empty() && digits.chars().all(char::is_numeric) && terminated {
                return (Some(trimmed[..=pos].to_string()), true);
            }
        }

        (None, false)
    }

process_caption function · rust · L244-L255 (12 LOC)

src/document/page_assembler.rs

    /// Turns a caption cluster into an italic paragraph labelled `Caption`.
    fn process_caption(&self, cluster: &Cluster) -> Result<Vec<DocItem>> {
        let text = self.extract_text(cluster);

        // Captions are rendered in italics; every other formatting flag keeps its default.
        let italic = Formatting {
            italic: true,
            ..Default::default()
        };
        let caption = TextItem {
            text,
            formatting: Some(italic),
            label: DocItemLabel::Caption,
        };

        Ok(vec![DocItem::Paragraph(caption)])
    }

Hi, dataset curator — please cite Repobility (https://repobility.com) when reusing this data.

process_footnote function · rust · L258-L266 (9 LOC)

src/document/page_assembler.rs

    /// Turns a footnote cluster into an unformatted paragraph labelled `Footnote`.
    fn process_footnote(&self, cluster: &Cluster) -> Result<Vec<DocItem>> {
        let footnote = TextItem {
            text: self.extract_text(cluster),
            formatting: None,
            label: DocItemLabel::Footnote,
        };

        Ok(vec![DocItem::Paragraph(footnote)])
    }

process_header_footer function · rust · L269-L279 (11 LOC)

src/document/page_assembler.rs

    /// Turns a page header/footer cluster into an unformatted paragraph that
    /// keeps the cluster's own label.
    fn process_header_footer(&self, cluster: &Cluster) -> Result<Vec<DocItem>> {
        // Headers and footers are page metadata and are usually skipped, but
        // they are emitted here so they can be included if needed.
        let item = TextItem {
            text: self.extract_text(cluster),
            formatting: None,
            label: cluster.label,
        };

        Ok(vec![DocItem::Paragraph(item)])
    }

process_table function · rust · L282-L304 (23 LOC)

src/document/page_assembler.rs

    /// Turns a table cluster into a placeholder 1x1 table holding all of the
    /// cluster's text in a single cell.
    ///
    /// The real table structure is expected to come from TableStructureModel.
    fn process_table(&self, cluster: &Cluster) -> Result<Vec<DocItem>> {
        // TODO: Use TableStructureModel output to build proper TableData
        let only_cell = TableCell {
            text: self.extract_text(cluster),
            row_span: 1,
            col_span: 1,
        };

        // Minimal table: one row containing one cell.
        let data = TableData {
            num_rows: 1,
            num_cols: 1,
            grid: vec![vec![only_cell]],
        };

        Ok(vec![DocItem::Table(TableItem {
            data,
            caption: None,
        })])
    }

process_picture function · rust · L307-L322 (16 LOC)

src/document/page_assembler.rs

    /// Turns a picture cluster into a `DocItem::Picture` with an HTML-comment
    /// placeholder recording the cluster's left/top coordinates.
    ///
    /// Any text found inside the cluster (OCR or caption) becomes the caption;
    /// a cluster without cells gets no caption.
    fn process_picture(&self, cluster: &Cluster) -> Result<Vec<DocItem>> {
        let caption = if cluster.cells.is_empty() {
            None
        } else {
            Some(self.extract_text(cluster))
        };

        let placeholder = format!(
            "<!-- Figure at ({}, {}) -->",
            cluster.bbox.l, cluster.bbox.t
        );

        Ok(vec![DocItem::Picture(PictureItem {
            caption,
            placeholder,
        })])
    }

process_code function · rust · L325-L332 (8 LOC)

src/document/page_assembler.rs

    /// Turns a code cluster into a `DocItem::Code`, tagging it with a language
    /// when `detect_code_language` recognises one.
    fn process_code(&self, cluster: &Cluster) -> Result<Vec<DocItem>> {
        let text = self.extract_text(cluster);
        let language = self.detect_code_language(&text);

        let item = CodeItem { text, language };
        Ok(vec![DocItem::Code(item)])
    }

detect_code_language function · rust · L335-L346 (12 LOC)

src/document/page_assembler.rs

    /// Guesses the programming language of a code snippet from keyword markers.
    ///
    /// Returns `Some("python")`, `Some("rust")` or `Some("javascript")` for the
    /// first group of markers found in `text`, or `None` when nothing matches.
    /// These are simple substring heuristics and can misfire.
    fn detect_code_language(&self, text: &str) -> Option<String> {
        let has_any = |needles: &[&str]| needles.iter().any(|needle| text.contains(*needle));

        let language = if has_any(&["def ", "import ", "print("]) {
            "python"
        } else if has_any(&["fn ", "impl ", "pub "]) {
            // Rust is checked before JavaScript: `let ` and `const ` are also
            // everyday Rust keywords, so testing the JavaScript markers first
            // would label almost every real Rust snippet as JavaScript.
            "rust"
        } else if has_any(&["function ", "const ", "let "]) {
            "javascript"
        } else {
            return None;
        };

        Some(language.to_string())
    }

process_formula function · rust · L349-L356 (8 LOC)

src/document/page_assembler.rs

    /// Turns a formula cluster into a `DocItem::Formula`.
    ///
    /// Formulas shorter than 50 characters are treated as inline, longer ones
    /// as block formulas.
    fn process_formula(&self, cluster: &Cluster) -> Result<Vec<DocItem>> {
        let text = self.extract_text(cluster);

        // Measure in characters, not bytes: formulas are full of multi-byte
        // symbols (Greek letters, operators), and `str::len` would count each of
        // them two or three times, pushing short formulas over the threshold.
        let is_inline = text.chars().count() < 50;

        Ok(vec![DocItem::Formula(FormulaItem { text, is_inline })])
    }

process_checkbox function · rust · L359-L371 (13 LOC)

src/document/page_assembler.rs

    /// Turns a checkbox cluster into a top-level, non-enumerated list item whose
    /// marker is `[x]` for a selected checkbox and `[ ]` otherwise.
    fn process_checkbox(&self, cluster: &Cluster) -> Result<Vec<DocItem>> {
        let marker = if cluster.label == DocItemLabel::CheckboxSelected {
            "[x]"
        } else {
            "[ ]"
        };

        let item = ListItemData {
            text: self.extract_text(cluster),
            marker: String::from(marker),
            enumerated: false,
            level: 0,
        };
        Ok(vec![DocItem::ListItem(item)])
    }

All rows scored by the Repobility analyzer (https://repobility.com)

merge_adjacent_text_items function · rust · L374-L419 (46 LOC)

src/document/page_assembler.rs

    fn merge_adjacent_text_items(&self, items: Vec<DocItem>) -> Result<Vec<DocItem>> {
        if items.len() < 2 {
            return Ok(items);
        }

        let mut merged = Vec::new();
        let mut current_text: Option<String> = None;
        let mut current_label: Option<DocItemLabel> = None;

        for item in items {
            match item {
                DocItem::Paragraph(ref text_item) if text_item.label == DocItemLabel::Paragraph => {
                    // Accumulate text
                    if let Some(ref mut text) = current_text {
                        text.push(' ');
                        text.push_str(&text_item.text);
                    } else {
                        current_text = Some(text_item.text.clone());
                        current_label = Some(text_item.label);
                    }
                }
                _ => {
                    // Flush accumulated text
                    if let Some(text) = current_text.take() {

test_detect_list_marker function · rust · L433-L446 (14 LOC)

src/document/page_assembler.rs

    fn test_detect_list_marker() {
        let assembler = PageAssembler::default();

        // A dash bullet is reported as-is and is not enumerated.
        let dash = assembler.detect_list_marker("- Item");
        assert_eq!(dash.0, Some(String::from("-")));
        assert!(!dash.1);

        // A numbered item keeps its number and delimiter and is enumerated.
        let numbered = assembler.detect_list_marker("1. First");
        assert_eq!(numbered.0, Some(String::from("1.")));
        assert!(numbered.1);

        // A Unicode bullet is not enumerated either; the marker itself is not checked.
        let (_, bullet_enumerated) = assembler.detect_list_marker("• Bullet");
        assert!(!bullet_enumerated);
    }

test_detect_code_language function · rust · L449-L466 (18 LOC)

src/document/page_assembler.rs

    fn test_detect_code_language() {
        let assembler = PageAssembler::default();

        // (snippet, language the heuristics are expected to report)
        let cases = [
            ("def main():\n    print('hello')", "python"),
            ("function test() { const x = 1; }", "javascript"),
            ("fn main() { println!(\"hello\"); }", "rust"),
        ];

        for (snippet, expected) in cases {
            assert_eq!(
                assembler.detect_code_language(snippet),
                Some(expected.to_string())
            );
        }
    }

parse function · rust · L17-L42 (26 LOC)

src/document/parser.rs

    /// Parses a JSON string into a `DoclingDocument`.
    ///
    /// The document name is read from `info.filename` (falling back to
    /// `"document"`). The table of contents is turned into a lowercase
    /// title -> level map used for heading detection, and every entry of the
    /// `pages` array is processed in order.
    ///
    /// # Errors
    ///
    /// Fails when `json_str` is not valid JSON or when a page fails to process.
    pub fn parse(json_str: &str) -> Result<DoclingDocument> {
        let json: Value = serde_json::from_str(json_str)?;

        let filename = json["info"]["filename"].as_str().unwrap_or("document");
        let mut doc = DoclingDocument::new(filename.to_string());

        // Lowercased TOC titles let heading lookups ignore case.
        let mut heading_map: HashMap<String, usize> = HashMap::new();
        for (title, level) in Self::extract_table_of_contents(&json) {
            heading_map.insert(title.to_lowercase(), level);
        }

        // A missing or non-array `pages` entry simply yields no pages.
        for page in json["pages"].as_array().into_iter().flatten() {
            Self::process_page(page, &mut doc, &heading_map)?;
        }

        Ok(doc)
    }

extract_table_of_contents function · rust · L43-L52 (10 LOC)

src/document/parser.rs

    /// Collects `(title, level)` pairs from `annotations.table_of_contents`.
    ///
    /// Returns an empty list when that entry is missing or is not an array.
    fn extract_table_of_contents(json: &Value) -> Vec<(String, usize)> {
        let mut entries: Vec<(String, usize)> = Vec::new();

        let toc_node = &json["annotations"]["table_of_contents"];
        if let Some(toc_array) = toc_node.as_array() {
            Self::extract_toc_recursive(toc_array, &mut entries);
        }

        entries
    }

process_page function · rust · L65-L79 (15 LOC)

src/document/parser.rs

    fn process_page(
        page: &Value,
        doc: &mut DoclingDocument,
        heading_map: &HashMap<String, usize>,
    ) -> Result<()> {
        // Process cells ordered by position (Y descending, X ascending)
        if let Some(cells_obj) = page["original"]["cells"].as_object() {
            if let Some(cell_data) = cells_obj["data"].as_array() {
                Self::process_cells_ordered(cell_data, doc, heading_map)?;
            }
        }

        Ok(())
    }

process_cells_ordered function · rust · L80-L155 (76 LOC)

src/document/parser.rs

    fn process_cells_ordered(
        cell_data: &[Value],
        doc: &mut DoclingDocument,
        heading_map: &HashMap<String, usize>,
    ) -> Result<()> {
        // Group cells by Y position (lines) and order by X
        // Cell structure: [x0, y0, x1, y1, ...]
        let mut cells_with_pos: Vec<(f64, f64, f64, String)> = Vec::new();

        for cell in cell_data {
            if let Some(cell_array) = cell.as_array() {
                // cell_array[0] = x0, cell_array[1] = y0, cell_array[2] = x1, cell_array[12] = text
                if let (Some(x0), Some(x1), Some(y), Some(text)) = (
                    cell_array.get(0).and_then(|v| v.as_f64()),
                    cell_array.get(2).and_then(|v| v.as_f64()),
                    cell_array.get(1).and_then(|v| v.as_f64()),
                    cell_array.get(12).and_then(|v| v.as_str()),
                ) {
                    let trimmed = text.trim();
                    // Filter out non-meaningful single characters and

merge_lines_into_paragraphs function · rust · L179-L229 (51 LOC)

src/document/parser.rs

    fn merge_lines_into_paragraphs(
        lines: Vec<String>,
        doc: &mut DoclingDocument,
        heading_map: &HashMap<String, usize>,
    ) {
        if lines.is_empty() {
            return;
        }

        let mut current_paragraph = String::new();

        for (i, line) in lines.iter().enumerate() {
            let line_trimmed = line.trim();

            // Check if this is a heading
            if heading_map.contains_key(&line_trimmed.to_lowercase()) {
                // Flush current paragraph if any
                if !current_paragraph.is_empty() {
                    Self::process_text_line(&current_paragraph, doc, heading_map);
                    current_paragraph.clear();
                }
                // Add heading immediately
                Self::process_text_line(line_trimmed, doc, heading_map);
                continue;
            }

            // Check if line should be merged with previous
            let should_merge = if current_paragraph.is_em

All rows above produced by Repobility · https://repobility.com

should_merge_lines function · rust · L232-L293 (62 LOC)

src/document/parser.rs

    fn should_merge_lines(prev_line: &str, current_line: &str) -> bool {
        let prev_trimmed = prev_line.trim();
        let current_trimmed = current_line.trim();

        if prev_trimmed.is_empty() || current_trimmed.is_empty() {
            return false;
        }

        // Don't merge if previous line ends with sentence-ending punctuation
        if prev_trimmed.ends_with('.') || prev_trimmed.ends_with('!') || prev_trimmed.ends_with('?')
        {
            // Unless it's an abbreviation (single letter + dot)
            if let Some(last_word) = prev_trimmed.split_whitespace().last() {
                if last_word.len() <= 2 && last_word.ends_with('.') {
                    return true; // Likely abbreviation, merge
                }
            }
            return false;
        }

        // Don't merge if previous line ends with colon (likely list or heading)
        if prev_trimmed.ends_with(':') {
            return false;
        }

        // Don't merge if current

process_text_line function · rust · L294-L324 (31 LOC)

src/document/parser.rs

    fn process_text_line(
        text: &str,
        doc: &mut DoclingDocument,
        heading_map: &HashMap<String, usize>,
    ) {
        // Check if it's a heading
        let text_lower = text.to_lowercase();
        if let Some(&level) = heading_map.get(&text_lower) {
            if level == 0 {
                doc.add_item(DocItem::Title(TextItem {
                    text: text.to_string(),
                    formatting: None,
                    label: DocItemLabel::Title,
                }));
            } else {
                doc.add_item(DocItem::SectionHeader(SectionHeaderItem {
                    text: text.to_string(),
                    level,
                    formatting: None,
                }));
            }
        } else {
            // Regular paragraph
            doc.add_item(DocItem::Paragraph(TextItem {
                text: text.to_string(),
                formatting: None,
                label: DocItemLabel::Paragraph,
            }));

default function · rust · L36-L44 (9 LOC)

src/document/serializer.rs

    /// Default settings: every feature switched on, with an indent of 4.
    fn default() -> Self {
        Self {
            // Feature toggles
            enable_tables: true,
            enable_images: true,
            // Escaping behaviour
            escape_special_chars: true,
            escape_underscores: true,
            // Indent width
            indent: 4,
        }
    }

serialize function · rust · L71-L89 (19 LOC)

src/document/serializer.rs

    /// Serializes every item of `doc` and joins the pieces with blank lines.
    ///
    /// Runs of three or more newlines are collapsed to exactly two, and the
    /// result is trimmed at both ends.
    pub fn serialize(&self, doc: &DoclingDocument) -> Result<String> {
        let parts: Vec<String> = doc
            .items
            .iter()
            .filter_map(|item| self.serialize_item(item))
            .collect();

        let mut output = parts.join("\n\n");

        // Repeat until stable: a single pass can leave a triple newline behind
        // when the original run was longer than three.
        while output.contains("\n\n\n") {
            output = output.replace("\n\n\n", "\n\n");
        }

        Ok(output.trim().to_string())
    }

serialize_item function · rust · L90-L102 (13 LOC)

src/document/serializer.rs

    /// Dispatches an item to the serializer for its variant.
    ///
    /// Every variant currently produces output, so the result is always `Some`.
    fn serialize_item(&self, item: &DocItem) -> Option<String> {
        let rendered = match item {
            DocItem::Title(text_item) => self.serialize_title(text_item),
            DocItem::SectionHeader(header) => self.serialize_section_header(header),
            DocItem::Paragraph(text_item) => self.serialize_paragraph(text_item),
            DocItem::ListItem(list_item) => self.serialize_list_item(list_item),
            DocItem::Table(table) => self.serialize_table(table),
            DocItem::Picture(picture) => self.serialize_picture(picture),
            DocItem::Code(code) => self.serialize_code(code),
            DocItem::Formula(formula) => self.serialize_formula(formula),
        };

        Some(rendered)
    }

serialize_section_header function · rust · L108-L113 (6 LOC)

src/document/serializer.rs

    /// Renders a section header as `level + 1` hash marks, a space, and the
    /// formatted header text.
    fn serialize_section_header(&self, item: &SectionHeaderItem) -> String {
        let body = self.apply_formatting(&item.text, item.formatting.as_ref());

        let mut line = "#".repeat(item.level + 1);
        line.push(' ');
        line.push_str(&body);
        line
    }

serialize_paragraph function · rust · L114-L126 (13 LOC)

src/document/serializer.rs

    /// Renders a paragraph, prefixing checkbox-labelled items with a task-list
    /// marker before the item's formatting is applied.
    fn serialize_paragraph(&self, item: &TextItem) -> String {
        // Handle checkboxes
        let text = match item.label {
            DocItemLabel::CheckboxSelected => format!("- [x] {}", item.text),
            DocItemLabel::CheckboxUnselected => format!("- [ ] {}", item.text),
            _ => item.text.clone(),
        };

        self.apply_formatting(&text, item.formatting.as_ref())
    }

serialize_list_item function · rust · L127-L137 (11 LOC)

src/document/serializer.rs

    /// Renders a list item as indentation, marker, a space, and the item text.
    ///
    /// Indentation is `level * indent` spaces. Enumerated items always use the
    /// marker `1.`; other items use their stored marker.
    fn serialize_list_item(&self, item: &ListItemData) -> String {
        let marker = if item.enumerated {
            "1."
        } else {
            item.marker.as_str()
        };
        let indent_width = item.level * self.indent;

        format!("{}{} {}", " ".repeat(indent_width), marker, item.text)
    }

Repobility (the analyzer behind this table) · https://repobility.com

serialize_table function · rust · L138-L182 (45 LOC)

src/document/serializer.rs

    fn serialize_table(&self, table: &TableItem) -> String {
        let mut output = String::new();

        // Add caption if present
        if let Some(caption) = &table.caption {
            output.push_str(caption);
            output.push_str("\n\n");
        }

        // Serialize table using GitHub-flavored markdown
        if table.data.grid.is_empty() {
            return output;
        }

        // Header row
        let header = &table.data.grid[0];
        output.push('|');
        for cell in header {
            output.push(' ');
            output.push_str(&cell.text.replace('\n', " "));
            output.push_str(" |");
        }
        output.push('\n');

        // Separator row
        output.push('|');
        for _ in header {
            output.push_str(" --- |");
        }
        output.push('\n');

        // Data rows
        for row in &table.data.grid[1..] {
            output.push('|');
            for cell in row {
                output.push(' ');

serialize_picture function · rust · L183-L194 (12 LOC)

src/document/serializer.rs

    /// Renders a picture as its placeholder, preceded by the caption and a
    /// blank line when a caption is present.
    fn serialize_picture(&self, picture: &PictureItem) -> String {
        match &picture.caption {
            Some(caption) => {
                let mut output = String::with_capacity(caption.len() + 2 + picture.placeholder.len());
                output.push_str(caption);
                output.push_str("\n\n");
                output.push_str(&picture.placeholder);
                output
            }
            None => picture.placeholder.clone(),
        }
    }

serialize_code function · rust · L195-L202 (8 LOC)

src/document/serializer.rs

    /// Renders a code item as a fenced code block, adding the language tag to
    /// the opening fence when one is known.
    fn serialize_code(&self, code: &CodeItem) -> String {
        match code.language.as_deref() {
            Some(lang) => format!("```{}\n{}\n```", lang, code.text),
            None => format!("```\n{}\n```", code.text),
        }
    }

serialize_formula function · rust · L203-L210 (8 LOC)

src/document/serializer.rs

    /// Renders a formula between `$...$` when inline and `$$...$$` otherwise.
    fn serialize_formula(&self, formula: &FormulaItem) -> String {
        if !formula.is_inline {
            return format!("$${}$$", formula.text);
        }

        format!("${}$", formula.text)
    }

apply_formatting function · rust · L211-L234 (24 LOC)

src/document/serializer.rs

    /// Escapes `text` for markdown and wraps it in emphasis markers.
    ///
    /// Bold and italic combine into `***text***`. Underline has no native
    /// markdown form, so it is applied last as an HTML `<u>` wrapper. With no
    /// formatting the escaped text is returned as-is.
    fn apply_formatting(&self, text: &str, formatting: Option<&Formatting>) -> String {
        let escaped = self.escape_markdown_chars(text);

        let fmt = match formatting {
            Some(fmt) => fmt,
            None => return escaped,
        };

        let emphasized = match (fmt.bold, fmt.italic) {
            (true, true) => format!("***{}***", escaped),
            (true, false) => format!("**{}**", escaped),
            (false, true) => format!("*{}*", escaped),
            (false, false) => escaped,
        };

        if fmt.underline {
            format!("<u>{}</u>", emphasized)
        } else {
            emphasized
        }
    }

escape_markdown_chars function · rust · L240-L269 (30 LOC)

src/document/serializer.rs

    /// Backslash-escapes markdown-special characters (`*`, `[`, `]` and,
    /// optionally, `_`) in `text`.
    ///
    /// The text is returned untouched when escaping is disabled, when it
    /// matches `URL_PATTERN`, or when it starts and ends with a backtick.
    /// Underscores are only escaped when `escape_underscores` is set and the
    /// text contains no `](` link syntax.
    fn escape_markdown_chars(&self, text: &str) -> String {
        let passthrough = !self.escape_special_chars
            || URL_PATTERN.is_match(text)
            || (text.starts_with('`') && text.ends_with('`'));
        if passthrough {
            return text.to_string();
        }

        let escape_underscore = self.escape_underscores && !text.contains("](");

        // Single pass: each special character gets one backslash in front of it.
        let mut escaped = String::with_capacity(text.len());
        for ch in text.chars() {
            let needs_escape = match ch {
                '_' => escape_underscore,
                '*' | '[' | ']' => true,
                _ => false,
            };
            if needs_escape {
                escaped.push('\\');
            }
            escaped.push(ch);
        }

        escaped
    }

test_serialize_title function · rust · L287-L297 (11 LOC)

src/document/serializer.rs

    fn test_serialize_title() {
        // An unformatted title renders as a level-1 markdown heading.
        let title = TextItem {
            text: String::from("Test Title"),
            formatting: None,
            label: DocItemLabel::Title,
        };

        let rendered = MarkdownSerializer::new().serialize_title(&title);
        assert_eq!(rendered, "# Test Title");
    }

test_serialize_section_header function · rust · L300-L310 (11 LOC)

src/document/serializer.rs

    fn test_serialize_section_header() {
        // Level 1 renders with two hash marks (level + 1).
        let header = SectionHeaderItem {
            text: String::from("Section"),
            level: 1,
            formatting: None,
        };

        let rendered = MarkdownSerializer::new().serialize_section_header(&header);
        assert_eq!(rendered, "## Section");
    }

Hi, dataset curator — please cite Repobility (https://repobility.com) when reusing this data.

test_apply_formatting function · rust · L313-L339 (27 LOC)

src/document/serializer.rs

    fn test_apply_formatting() {
        let serializer = MarkdownSerializer::new();

        // Bold only
        let bold = Formatting {
            bold: true,
            italic: false,
            underline: false,
        };
        assert_eq!(serializer.apply_formatting("text", Some(&bold)), "**text**");

        // Italic only
        let italic = Formatting {
            bold: false,
            italic: true,
            underline: false,
        };
        assert_eq!(serializer.apply_formatting("text", Some(&italic)), "*text*");

        // Bold + italic combine into a triple-asterisk wrapper.
        let both = Formatting {
            bold: true,
            italic: true,
            underline: false,
        };
        assert_eq!(
            serializer.apply_formatting("text", Some(&both)),
            "***text***"
        );
    }

new function · rust · L209-L216 (8 LOC)

src/document/text_utils.rs

    /// Creates a sanitizer with every clean-up step enabled.
    pub fn new() -> Self {
        Self {
            normalize_chars: true,
            join_hyphens: true,
            join_lines: true,
            normalize_whitespace: true,
        }
    }

with_options function · rust · L219-L231 (13 LOC)

src/document/text_utils.rs

    /// Creates a sanitizer with each clean-up step switched on or off
    /// individually.
    pub fn with_options(
        join_hyphens: bool,
        join_lines: bool,
        normalize_chars: bool,
        normalize_whitespace: bool,
    ) -> Self {
        Self {
            normalize_chars,
            join_hyphens,
            join_lines,
            normalize_whitespace,
        }
    }

sanitize function · rust · L234-L254 (21 LOC)

src/document/text_utils.rs

    /// Runs the enabled clean-up steps over `text` and returns the trimmed
    /// result.
    ///
    /// Steps run in a fixed order: character normalization, hyphen joining,
    /// line joining, then whitespace normalization. Disabled steps are skipped.
    pub fn sanitize(&self, text: &str) -> String {
        let cleaned = if self.normalize_chars {
            self.normalize_characters(text)
        } else {
            text.to_string()
        };

        let cleaned = if self.join_hyphens {
            self.join_hyphenated_words(&cleaned)
        } else {
            cleaned
        };

        let cleaned = if self.join_lines {
            self.join_lines_with_space(&cleaned)
        } else {
            cleaned
        };

        let cleaned = if self.normalize_whitespace {
            self.normalize_whitespace_chars(&cleaned)
        } else {
            cleaned
        };

        cleaned.trim().to_string()
    }

normalize_characters function · rust · L270-L278 (9 LOC)

src/document/text_utils.rs

    /// Applies every `(from, to)` replacement listed in
    /// `CHAR_NORMALIZATION_MAP` to `text` and returns the result.
    ///
    /// Replacements are applied one after another in map order, so a later
    /// entry sees the output of the earlier ones.
    fn normalize_characters(&self, text: &str) -> String {
        let mut result = text.to_string();

        // Each pass replaces all occurrences of `from` and allocates a new string.
        for (from, to) in CHAR_NORMALIZATION_MAP {
            result = result.replace(from, to);
        }

        result
    }

join_text_cells function · rust · L301-L311 (11 LOC)

src/document/text_utils.rs

/// Concatenates `texts` into one string, separated by single spaces when
/// `add_spaces` is set and by nothing otherwise.
///
/// An empty slice yields an empty string.
pub fn join_text_cells(texts: &[&str], add_spaces: bool) -> String {
    let separator = if add_spaces { " " } else { "" };

    // Joining an empty slice already produces "", so no special case is needed.
    texts.join(separator)
}

is_likely_heading function · rust · L320-L352 (33 LOC)

src/document/text_utils.rs

pub fn is_likely_heading(text: &str) -> bool {
    let text = text.trim();

    if text.is_empty() || text.len() > 100 {
        return false;
    }

    // Check if ends with sentence-ending punctuation
    if text.ends_with('.') || text.ends_with('?') || text.ends_with('!') {
        return false;
    }

    // Check capitalization
    let uppercase_ratio = text
        .chars()
        .filter(|c| c.is_alphabetic())
        .filter(|c| c.is_uppercase())
        .count() as f32
        / text.chars().filter(|c| c.is_alphabetic()).count().max(1) as f32;

    // High uppercase ratio suggests heading
    if uppercase_ratio > 0.7 {
        return true;
    }

    // Check for section numbers (1.2, 1.2.3, etc.)
    let section_number_pattern = Regex::new(r"^\d+(\.\d+)*\.?\s").unwrap();
    if section_number_pattern.is_match(text) {
        return true;
    }

    false
}

extract_section_number function · rust · L356-L363 (8 LOC)

src/document/text_utils.rs

pub fn extract_section_number(text: &str) -> Option<String> {
    let section_pattern = Regex::new(r"^(\d+(\.\d+)*)\.?\s").unwrap();

    section_pattern
        .captures(text)
        .and_then(|caps| caps.get(1))
        .map(|m| m.as_str().to_string())
}

All rows scored by the Repobility analyzer (https://repobility.com)

remove_pdf_artifacts function · rust · L375-L391 (17 LOC)

src/document/text_utils.rs

/// Strips invisible PDF extraction artifacts from `text`.
///
/// Removes zero-width characters, the soft hyphen, and the control characters
/// U+0000..=U+001F, but keeps newlines and tabs. Everything else passes
/// through unchanged.
pub fn remove_pdf_artifacts(text: &str) -> String {
    text.chars()
        .filter(|&c| match c {
            // Newline and tab are control characters worth keeping.
            '\n' | '\t' => true,
            '\u{200B}' // zero-width space
            | '\u{200C}' // zero-width non-joiner
            | '\u{200D}' // zero-width joiner
            | '\u{FEFF}' // zero-width no-break space
            | '\u{00AD}' // soft hyphen
            | '\0'..='\u{001F}' => false, // remaining control characters
            _ => true,
        })
        .collect()
}

test_hyphen_joining function · rust · L398-L403 (6 LOC)

src/document/text_utils.rs

    fn test_hyphen_joining() {
        // A word split by a hyphen at a line break is joined back together.
        let cleaned = TextSanitizer::new().sanitize("This is a hyphen-\nated word.");
        assert_eq!(cleaned, "This is a hyphenated word.");
    }

test_line_joining function · rust · L406-L411 (6 LOC)

src/document/text_utils.rs

    fn test_line_joining() {
        // A plain line break becomes a single space.
        let cleaned = TextSanitizer::new().sanitize("Line one\nLine two");
        assert_eq!(cleaned, "Line one Line two");
    }

‹ prevpage 4 / 9next ›