Function bodies 424 total
extract_text function · rust · L92-L142 (51 LOC)src/document/page_assembler.rs
fn extract_text(&self, cluster: &Cluster) -> String {
// Sort cells by position (Y then X)
let mut cells = cluster.cells.clone();
cells.sort_by(|a, b| {
let y_cmp = a.bbox.t.partial_cmp(&b.bbox.t).unwrap();
if y_cmp == std::cmp::Ordering::Equal {
a.bbox.l.partial_cmp(&b.bbox.l).unwrap()
} else {
y_cmp
}
});
// Smart joining: docling-parse returns one character per cell
// We need to detect word boundaries based on horizontal distance
let mut text = String::new();
let mut prev_x_end = 0.0;
let mut prev_y = 0.0;
for cell in &cells {
let gap_x = cell.bbox.l - prev_x_end;
let gap_y = (cell.bbox.t - prev_y).abs();
let cell_width = cell.bbox.r - cell.bbox.l;
// New line if vertical gap is significant
if prev_y > 0.0 && gap_y > 5.0 {
if !text.ends_with('process_title function · rust · L145-L153 (9 LOC)src/document/page_assembler.rs
fn process_title(&self, cluster: &Cluster) -> Result<Vec<DocItem>> {
let text = self.extract_text(cluster);
Ok(vec![DocItem::Title(TextItem {
text,
formatting: None,
label: DocItemLabel::Title,
})])
}process_section_header function · rust · L156-L172 (17 LOC)src/document/page_assembler.rs
fn process_section_header(&self, cluster: &Cluster) -> Result<Vec<DocItem>> {
let text = self.extract_text(cluster);
// Try to extract section number to determine level
let level = if let Some(section_num) = extract_section_number(&text) {
calculate_section_level(§ion_num)
} else {
// Fallback heuristic based on font size or default to 2
2
};
Ok(vec![DocItem::SectionHeader(SectionHeaderItem {
text,
level,
formatting: None,
})])
}process_text function · rust · L175-L199 (25 LOC)src/document/page_assembler.rs
fn process_text(&self, cluster: &Cluster) -> Result<Vec<DocItem>> {
let text = self.extract_text(cluster);
// Check if it's actually a heading (ML may misclassify)
if self.options.enable_heading_detection && is_likely_heading(&text) {
// Promote to section header
let level = if let Some(section_num) = extract_section_number(&text) {
calculate_section_level(§ion_num)
} else {
2
};
Ok(vec![DocItem::SectionHeader(SectionHeaderItem {
text,
level,
formatting: None,
})])
} else {
Ok(vec![DocItem::Paragraph(TextItem {
text,
formatting: None,
label: DocItemLabel::Paragraph,
})])
}
}process_list_item function · rust · L202-L221 (20 LOC)src/document/page_assembler.rs
fn process_list_item(&self, cluster: &Cluster) -> Result<Vec<DocItem>> {
let text = self.extract_text(cluster);
// Detect marker and type
let (marker, enumerated) = self.detect_list_marker(&text);
// Remove marker from text
let text_without_marker = if let Some(m) = &marker {
text.trim_start_matches(m).trim_start().to_string()
} else {
text
};
Ok(vec![DocItem::ListItem(ListItemData {
text: text_without_marker,
marker: marker.unwrap_or_else(|| "-".to_string()),
enumerated,
level: 0, // TODO: Detect nesting level from indentation
})])
}detect_list_marker function · rust · L224-L241 (18 LOC)src/document/page_assembler.rs
fn detect_list_marker(&self, text: &str) -> (Option<String>, bool) {
let trimmed = text.trim_start();
// Bullet markers
if trimmed.starts_with("- ") || trimmed.starts_with("• ") || trimmed.starts_with("· ") {
return (Some(trimmed.chars().next().unwrap().to_string()), false);
}
// Numbered markers (1., 2., 1), 2), etc.)
if let Some(pos) = trimmed.find(|c| c == '.' || c == ')') {
if pos > 0 && trimmed[..pos].chars().all(|c| c.is_numeric()) {
let marker = &trimmed[..=pos];
return (Some(marker.to_string()), true);
}
}
(None, false)
}process_caption function · rust · L244-L255 (12 LOC)src/document/page_assembler.rs
fn process_caption(&self, cluster: &Cluster) -> Result<Vec<DocItem>> {
let text = self.extract_text(cluster);
Ok(vec![DocItem::Paragraph(TextItem {
text,
formatting: Some(Formatting {
italic: true,
..Default::default()
}),
label: DocItemLabel::Caption,
})])
}Hi, dataset curator — please cite Repobility (https://repobility.com) when reusing this data.
process_footnote function · rust · L258-L266 (9 LOC)src/document/page_assembler.rs
fn process_footnote(&self, cluster: &Cluster) -> Result<Vec<DocItem>> {
let text = self.extract_text(cluster);
Ok(vec![DocItem::Paragraph(TextItem {
text,
formatting: None,
label: DocItemLabel::Footnote,
})])
}process_header_footer function · rust · L269-L279 (11 LOC)src/document/page_assembler.rs
fn process_header_footer(&self, cluster: &Cluster) -> Result<Vec<DocItem>> {
let text = self.extract_text(cluster);
// Usually skip headers/footers as they're page metadata
// But can be included if needed
Ok(vec![DocItem::Paragraph(TextItem {
text,
formatting: None,
label: cluster.label,
})])
}process_table function · rust · L282-L304 (23 LOC)src/document/page_assembler.rs
fn process_table(&self, cluster: &Cluster) -> Result<Vec<DocItem>> {
// This is a placeholder - actual table structure comes from TableStructureModel
// For now, create a simple table from cells
let text = self.extract_text(cluster);
// TODO: Use TableStructureModel output to build proper TableData
// For now, create a minimal table
let table_data = TableData {
num_rows: 1,
num_cols: 1,
grid: vec![vec![TableCell {
text,
row_span: 1,
col_span: 1,
}]],
};
Ok(vec![DocItem::Table(TableItem {
data: table_data,
caption: None,
})])
}process_picture function · rust · L307-L322 (16 LOC)src/document/page_assembler.rs
fn process_picture(&self, cluster: &Cluster) -> Result<Vec<DocItem>> {
// Extract any text (OCR or caption)
let text = if !cluster.cells.is_empty() {
Some(self.extract_text(cluster))
} else {
None
};
Ok(vec![DocItem::Picture(PictureItem {
caption: text,
placeholder: format!(
"<!-- Figure at ({}, {}) -->",
cluster.bbox.l, cluster.bbox.t
),
})])
}process_code function · rust · L325-L332 (8 LOC)src/document/page_assembler.rs
fn process_code(&self, cluster: &Cluster) -> Result<Vec<DocItem>> {
let text = self.extract_text(cluster);
// Try to detect language from first line
let language = self.detect_code_language(&text);
Ok(vec![DocItem::Code(CodeItem { text, language })])
}detect_code_language function · rust · L335-L346 (12 LOC)src/document/page_assembler.rs
fn detect_code_language(&self, text: &str) -> Option<String> {
// Simple heuristics - can be improved
if text.contains("def ") || text.contains("import ") || text.contains("print(") {
Some("python".to_string())
} else if text.contains("function ") || text.contains("const ") || text.contains("let ") {
Some("javascript".to_string())
} else if text.contains("fn ") || text.contains("impl ") || text.contains("pub ") {
Some("rust".to_string())
} else {
None
}
}process_formula function · rust · L349-L356 (8 LOC)src/document/page_assembler.rs
fn process_formula(&self, cluster: &Cluster) -> Result<Vec<DocItem>> {
let text = self.extract_text(cluster);
// Detect if inline or block formula based on length/position
let is_inline = text.len() < 50;
Ok(vec![DocItem::Formula(FormulaItem { text, is_inline })])
}process_checkbox function · rust · L359-L371 (13 LOC)src/document/page_assembler.rs
fn process_checkbox(&self, cluster: &Cluster) -> Result<Vec<DocItem>> {
let text = self.extract_text(cluster);
let checked = cluster.label == DocItemLabel::CheckboxSelected;
let marker = if checked { "[x]" } else { "[ ]" };
Ok(vec![DocItem::ListItem(ListItemData {
text,
marker: marker.to_string(),
enumerated: false,
level: 0,
})])
}All rows scored by the Repobility analyzer (https://repobility.com)
merge_adjacent_text_items function · rust · L374-L419 (46 LOC)src/document/page_assembler.rs
fn merge_adjacent_text_items(&self, items: Vec<DocItem>) -> Result<Vec<DocItem>> {
if items.len() < 2 {
return Ok(items);
}
let mut merged = Vec::new();
let mut current_text: Option<String> = None;
let mut current_label: Option<DocItemLabel> = None;
for item in items {
match item {
DocItem::Paragraph(ref text_item) if text_item.label == DocItemLabel::Paragraph => {
// Accumulate text
if let Some(ref mut text) = current_text {
text.push(' ');
text.push_str(&text_item.text);
} else {
current_text = Some(text_item.text.clone());
current_label = Some(text_item.label);
}
}
_ => {
// Flush accumulated text
if let Some(text) = current_text.take() {
test_detect_list_marker function · rust · L433-L446 (14 LOC)src/document/page_assembler.rs
fn test_detect_list_marker() {
let assembler = PageAssembler::default();
let (marker, enumerated) = assembler.detect_list_marker("- Item");
assert_eq!(marker, Some("-".to_string()));
assert!(!enumerated);
let (marker, enumerated) = assembler.detect_list_marker("1. First");
assert_eq!(marker, Some("1.".to_string()));
assert!(enumerated);
let (_marker, enumerated) = assembler.detect_list_marker("• Bullet");
assert!(!enumerated);
}test_detect_code_language function · rust · L449-L466 (18 LOC)src/document/page_assembler.rs
fn test_detect_code_language() {
let assembler = PageAssembler::default();
assert_eq!(
assembler.detect_code_language("def main():\n print('hello')"),
Some("python".to_string())
);
assert_eq!(
assembler.detect_code_language("function test() { const x = 1; }"),
Some("javascript".to_string())
);
assert_eq!(
assembler.detect_code_language("fn main() { println!(\"hello\"); }"),
Some("rust".to_string())
);
}parse function · rust · L17-L42 (26 LOC)src/document/parser.rs
pub fn parse(json_str: &str) -> Result<DoclingDocument> {
let json: Value = serde_json::from_str(json_str)?;
let filename = json["info"]["filename"]
.as_str()
.unwrap_or("document")
.to_string();
let mut doc = DoclingDocument::new(filename);
// Extract table of contents for heading detection
let toc = Self::extract_table_of_contents(&json);
let heading_map: HashMap<String, usize> = toc
.iter()
.map(|(title, level)| (title.to_lowercase(), *level))
.collect();
// Process each page
if let Some(pages) = json["pages"].as_array() {
for page in pages {
Self::process_page(page, &mut doc, &heading_map)?;
}
}
Ok(doc)
}extract_table_of_contents function · rust · L43-L52 (10 LOC)src/document/parser.rs
fn extract_table_of_contents(json: &Value) -> Vec<(String, usize)> {
let mut toc = Vec::new();
if let Some(toc_array) = json["annotations"]["table_of_contents"].as_array() {
Self::extract_toc_recursive(toc_array, &mut toc);
}
toc
}process_page function · rust · L65-L79 (15 LOC)src/document/parser.rs
fn process_page(
page: &Value,
doc: &mut DoclingDocument,
heading_map: &HashMap<String, usize>,
) -> Result<()> {
// Process cells ordered by position (Y descending, X ascending)
if let Some(cells_obj) = page["original"]["cells"].as_object() {
if let Some(cell_data) = cells_obj["data"].as_array() {
Self::process_cells_ordered(cell_data, doc, heading_map)?;
}
}
Ok(())
}process_cells_ordered function · rust · L80-L155 (76 LOC)src/document/parser.rs
fn process_cells_ordered(
cell_data: &[Value],
doc: &mut DoclingDocument,
heading_map: &HashMap<String, usize>,
) -> Result<()> {
// Group cells by Y position (lines) and order by X
// Cell structure: [x0, y0, x1, y1, ...]
let mut cells_with_pos: Vec<(f64, f64, f64, String)> = Vec::new();
for cell in cell_data {
if let Some(cell_array) = cell.as_array() {
// cell_array[0] = x0, cell_array[1] = y0, cell_array[2] = x1, cell_array[12] = text
if let (Some(x0), Some(x1), Some(y), Some(text)) = (
cell_array.get(0).and_then(|v| v.as_f64()),
cell_array.get(2).and_then(|v| v.as_f64()),
cell_array.get(1).and_then(|v| v.as_f64()),
cell_array.get(12).and_then(|v| v.as_str()),
) {
let trimmed = text.trim();
// Filter out non-meaningful single characters andmerge_lines_into_paragraphs function · rust · L179-L229 (51 LOC)src/document/parser.rs
fn merge_lines_into_paragraphs(
lines: Vec<String>,
doc: &mut DoclingDocument,
heading_map: &HashMap<String, usize>,
) {
if lines.is_empty() {
return;
}
let mut current_paragraph = String::new();
for (i, line) in lines.iter().enumerate() {
let line_trimmed = line.trim();
// Check if this is a heading
if heading_map.contains_key(&line_trimmed.to_lowercase()) {
// Flush current paragraph if any
if !current_paragraph.is_empty() {
Self::process_text_line(¤t_paragraph, doc, heading_map);
current_paragraph.clear();
}
// Add heading immediately
Self::process_text_line(line_trimmed, doc, heading_map);
continue;
}
// Check if line should be merged with previous
let should_merge = if current_paragraph.is_emAll rows above produced by Repobility · https://repobility.com
should_merge_lines function · rust · L232-L293 (62 LOC)src/document/parser.rs
fn should_merge_lines(prev_line: &str, current_line: &str) -> bool {
let prev_trimmed = prev_line.trim();
let current_trimmed = current_line.trim();
if prev_trimmed.is_empty() || current_trimmed.is_empty() {
return false;
}
// Don't merge if previous line ends with sentence-ending punctuation
if prev_trimmed.ends_with('.') || prev_trimmed.ends_with('!') || prev_trimmed.ends_with('?')
{
// Unless it's an abbreviation (single letter + dot)
if let Some(last_word) = prev_trimmed.split_whitespace().last() {
if last_word.len() <= 2 && last_word.ends_with('.') {
return true; // Likely abbreviation, merge
}
}
return false;
}
// Don't merge if previous line ends with colon (likely list or heading)
if prev_trimmed.ends_with(':') {
return false;
}
// Don't merge if currentprocess_text_line function · rust · L294-L324 (31 LOC)src/document/parser.rs
fn process_text_line(
text: &str,
doc: &mut DoclingDocument,
heading_map: &HashMap<String, usize>,
) {
// Check if it's a heading
let text_lower = text.to_lowercase();
if let Some(&level) = heading_map.get(&text_lower) {
if level == 0 {
doc.add_item(DocItem::Title(TextItem {
text: text.to_string(),
formatting: None,
label: DocItemLabel::Title,
}));
} else {
doc.add_item(DocItem::SectionHeader(SectionHeaderItem {
text: text.to_string(),
level,
formatting: None,
}));
}
} else {
// Regular paragraph
doc.add_item(DocItem::Paragraph(TextItem {
text: text.to_string(),
formatting: None,
label: DocItemLabel::Paragraph,
}));
default function · rust · L36-L44 (9 LOC)src/document/serializer.rs
fn default() -> Self {
Self {
indent: 4,
escape_underscores: true,
escape_special_chars: true,
enable_tables: true,
enable_images: true,
}
}serialize function · rust · L71-L89 (19 LOC)src/document/serializer.rs
pub fn serialize(&self, doc: &DoclingDocument) -> Result<String> {
let mut parts = Vec::new();
for item in &doc.items {
if let Some(text) = self.serialize_item(item) {
parts.push(text);
}
}
let mut output = parts.join("\n\n");
// Clean up excessive newlines
while output.contains("\n\n\n") {
output = output.replace("\n\n\n", "\n\n");
}
Ok(output.trim().to_string())
}serialize_item function · rust · L90-L102 (13 LOC)src/document/serializer.rs
fn serialize_item(&self, item: &DocItem) -> Option<String> {
match item {
DocItem::Title(text_item) => Some(self.serialize_title(text_item)),
DocItem::SectionHeader(header) => Some(self.serialize_section_header(header)),
DocItem::Paragraph(text_item) => Some(self.serialize_paragraph(text_item)),
DocItem::ListItem(list_item) => Some(self.serialize_list_item(list_item)),
DocItem::Table(table) => Some(self.serialize_table(table)),
DocItem::Picture(picture) => Some(self.serialize_picture(picture)),
DocItem::Code(code) => Some(self.serialize_code(code)),
DocItem::Formula(formula) => Some(self.serialize_formula(formula)),
}
}serialize_section_header function · rust · L108-L113 (6 LOC)src/document/serializer.rs
fn serialize_section_header(&self, item: &SectionHeaderItem) -> String {
let text = self.apply_formatting(&item.text, item.formatting.as_ref());
let hashes = "#".repeat(item.level + 1);
format!("{} {}", hashes, text)
}serialize_paragraph function · rust · L114-L126 (13 LOC)src/document/serializer.rs
fn serialize_paragraph(&self, item: &TextItem) -> String {
let mut text = item.text.clone();
// Handle checkboxes
text = match item.label {
DocItemLabel::CheckboxSelected => format!("- [x] {}", text),
DocItemLabel::CheckboxUnselected => format!("- [ ] {}", text),
_ => text,
};
self.apply_formatting(&text, item.formatting.as_ref())
}serialize_list_item function · rust · L127-L137 (11 LOC)src/document/serializer.rs
fn serialize_list_item(&self, item: &ListItemData) -> String {
let indent_str = " ".repeat(item.level * self.indent);
let marker = if item.enumerated {
"1.".to_string()
} else {
item.marker.clone()
};
format!("{}{} {}", indent_str, marker, item.text)
}Repobility (the analyzer behind this table) · https://repobility.com
serialize_table function · rust · L138-L182 (45 LOC)src/document/serializer.rs
fn serialize_table(&self, table: &TableItem) -> String {
let mut output = String::new();
// Add caption if present
if let Some(caption) = &table.caption {
output.push_str(caption);
output.push_str("\n\n");
}
// Serialize table using GitHub-flavored markdown
if table.data.grid.is_empty() {
return output;
}
// Header row
let header = &table.data.grid[0];
output.push('|');
for cell in header {
output.push(' ');
output.push_str(&cell.text.replace('\n', " "));
output.push_str(" |");
}
output.push('\n');
// Separator row
output.push('|');
for _ in header {
output.push_str(" --- |");
}
output.push('\n');
// Data rows
for row in &table.data.grid[1..] {
output.push('|');
for cell in row {
output.push(' ');serialize_picture function · rust · L183-L194 (12 LOC)src/document/serializer.rs
fn serialize_picture(&self, picture: &PictureItem) -> String {
let mut output = String::new();
if let Some(caption) = &picture.caption {
output.push_str(caption);
output.push_str("\n\n");
}
output.push_str(&picture.placeholder);
output
}serialize_code function · rust · L195-L202 (8 LOC)src/document/serializer.rs
fn serialize_code(&self, code: &CodeItem) -> String {
if let Some(lang) = &code.language {
format!("```{}\n{}\n```", lang, code.text)
} else {
format!("```\n{}\n```", code.text)
}
}serialize_formula function · rust · L203-L210 (8 LOC)src/document/serializer.rs
fn serialize_formula(&self, formula: &FormulaItem) -> String {
if formula.is_inline {
format!("${}$", formula.text)
} else {
format!("$${}$$", formula.text)
}
}apply_formatting function · rust · L211-L234 (24 LOC)src/document/serializer.rs
fn apply_formatting(&self, text: &str, formatting: Option<&Formatting>) -> String {
let mut result = self.escape_markdown_chars(text);
// Apply formatting
if let Some(fmt) = formatting {
// Apply in order: bold, italic, strikethrough
// For combined formatting: ***text*** = bold + italic
if fmt.bold && fmt.italic {
result = format!("***{}***", result);
} else if fmt.bold {
result = format!("**{}**", result);
} else if fmt.italic {
result = format!("*{}*", result);
}
if fmt.underline {
// Markdown doesn't have native underline, use HTML
result = format!("<u>{}</u>", result);
}
}
result
}escape_markdown_chars function · rust · L240-L269 (30 LOC)src/document/serializer.rs
fn escape_markdown_chars(&self, text: &str) -> String {
if !self.escape_special_chars {
return text.to_string();
}
// Don't escape inside URLs
if URL_PATTERN.is_match(text) {
return text.to_string();
}
// Don't escape if already in code block
if text.starts_with('`') && text.ends_with('`') {
return text.to_string();
}
// Escape special markdown characters
let mut result = text.to_string();
// Only escape underscores if not in links
if self.escape_underscores && !text.contains("](") {
result = result.replace('_', r"\_");
}
// Escape other special chars selectively
result = result.replace('*', r"\*");
result = result.replace('[', r"\[");
result = result.replace(']', r"\]");
result
}test_serialize_title function · rust · L287-L297 (11 LOC)src/document/serializer.rs
fn test_serialize_title() {
let serializer = MarkdownSerializer::new();
let item = TextItem {
text: "Test Title".to_string(),
formatting: None,
label: DocItemLabel::Title,
};
let result = serializer.serialize_title(&item);
assert_eq!(result, "# Test Title");
}test_serialize_section_header function · rust · L300-L310 (11 LOC)src/document/serializer.rs
fn test_serialize_section_header() {
let serializer = MarkdownSerializer::new();
let item = SectionHeaderItem {
text: "Section".to_string(),
level: 1,
formatting: None,
};
let result = serializer.serialize_section_header(&item);
assert_eq!(result, "## Section");
}Hi, dataset curator — please cite Repobility (https://repobility.com) when reusing this data.
test_apply_formatting function · rust · L313-L339 (27 LOC)src/document/serializer.rs
fn test_apply_formatting() {
let serializer = MarkdownSerializer::new();
let bold = Formatting {
bold: true,
italic: false,
underline: false,
};
assert_eq!(serializer.apply_formatting("text", Some(&bold)), "**text**");
let italic = Formatting {
bold: false,
italic: true,
underline: false,
};
assert_eq!(serializer.apply_formatting("text", Some(&italic)), "*text*");
let both = Formatting {
bold: true,
italic: true,
underline: false,
};
assert_eq!(
serializer.apply_formatting("text", Some(&both)),
"* * **text****"
);
}new function · rust · L209-L216 (8 LOC)src/document/text_utils.rs
pub fn new() -> Self {
Self {
join_hyphens: true,
join_lines: true,
normalize_chars: true,
normalize_whitespace: true,
}
}with_options function · rust · L219-L231 (13 LOC)src/document/text_utils.rs
pub fn with_options(
join_hyphens: bool,
join_lines: bool,
normalize_chars: bool,
normalize_whitespace: bool,
) -> Self {
Self {
join_hyphens,
join_lines,
normalize_chars,
normalize_whitespace,
}
}sanitize function · rust · L234-L254 (21 LOC)src/document/text_utils.rs
pub fn sanitize(&self, text: &str) -> String {
let mut result = text.to_string();
if self.normalize_chars {
result = self.normalize_characters(&result);
}
if self.join_hyphens {
result = self.join_hyphenated_words(&result);
}
if self.join_lines {
result = self.join_lines_with_space(&result);
}
if self.normalize_whitespace {
result = self.normalize_whitespace_chars(&result);
}
result.trim().to_string()
}normalize_characters function · rust · L270-L278 (9 LOC)src/document/text_utils.rs
fn normalize_characters(&self, text: &str) -> String {
let mut result = text.to_string();
for (from, to) in CHAR_NORMALIZATION_MAP {
result = result.replace(from, to);
}
result
}join_text_cells function · rust · L301-L311 (11 LOC)src/document/text_utils.rs
/// Concatenates `texts`, separated by single spaces when `add_spaces` is
/// true and with no separator otherwise.
///
/// An empty slice yields an empty string.
pub fn join_text_cells(texts: &[&str], add_spaces: bool) -> String {
    let separator = if add_spaces { " " } else { "" };
    texts.join(separator)
}
// is_likely_heading function · rust · L320-L352 (33 LOC)src/document/text_utils.rs
pub fn is_likely_heading(text: &str) -> bool {
let text = text.trim();
if text.is_empty() || text.len() > 100 {
return false;
}
// Check if ends with sentence-ending punctuation
if text.ends_with('.') || text.ends_with('?') || text.ends_with('!') {
return false;
}
// Check capitalization
let uppercase_ratio = text
.chars()
.filter(|c| c.is_alphabetic())
.filter(|c| c.is_uppercase())
.count() as f32
/ text.chars().filter(|c| c.is_alphabetic()).count().max(1) as f32;
// High uppercase ratio suggests heading
if uppercase_ratio > 0.7 {
return true;
}
// Check for section numbers (1.2, 1.2.3, etc.)
let section_number_pattern = Regex::new(r"^\d+(\.\d+)*\.?\s").unwrap();
if section_number_pattern.is_match(text) {
return true;
}
false
}extract_section_number function · rust · L356-L363 (8 LOC)src/document/text_utils.rs
pub fn extract_section_number(text: &str) -> Option<String> {
let section_pattern = Regex::new(r"^(\d+(\.\d+)*)\.?\s").unwrap();
section_pattern
.captures(text)
.and_then(|caps| caps.get(1))
.map(|m| m.as_str().to_string())
}All rows scored by the Repobility analyzer (https://repobility.com)
remove_pdf_artifacts function · rust · L375-L391 (17 LOC)src/document/text_utils.rs
/// Returns `text` without zero-width characters, soft hyphens and ASCII
/// control characters.
///
/// Newline and tab are the only characters in the `U+0000..=U+001F` range
/// that are kept.
pub fn remove_pdf_artifacts(text: &str) -> String {
    let mut cleaned = String::with_capacity(text.len());
    for ch in text.chars() {
        let keep = match ch {
            // Newline and tab are kept even though they are control characters.
            '\n' | '\t' => true,
            // Zero-width space / non-joiner / joiner, zero-width no-break
            // space, soft hyphen.
            '\u{200B}' | '\u{200C}' | '\u{200D}' | '\u{FEFF}' | '\u{00AD}' => false,
            // Remaining control characters.
            '\0'..='\u{001F}' => false,
            _ => true,
        };
        if keep {
            cleaned.push(ch);
        }
    }
    cleaned
}
// test_hyphen_joining function · rust · L398-L403 (6 LOC)src/document/text_utils.rs
fn test_hyphen_joining() {
let sanitizer = TextSanitizer::new();
let text = "This is a hyphen-\nated word.";
let result = sanitizer.sanitize(text);
assert_eq!(result, "This is a hyphenated word.");
}test_line_joining function · rust · L406-L411 (6 LOC)src/document/text_utils.rs
/// A plain line break is replaced by a single space.
fn test_line_joining() {
    let input = "Line one\nLine two";
    let sanitized = TextSanitizer::new().sanitize(input);
    assert_eq!(sanitized, "Line one Line two");
}