Function bodies 424 total
metadata function · rust · L349-L359 (11 LOC)src/converters/pptx.rs
fn metadata(&self) -> ConverterMetadata {
ConverterMetadata {
name: "PPTX Converter".to_string(),
version: env!("CARGO_PKG_VERSION").to_string(),
description:
"PPTX converter: Direct XML parsing (Markdown) or LibreOffice pipeline (Images)"
.to_string(),
external_deps: vec!["LibreOffice (images only)".to_string()],
}
}test_extract_text_from_xml function · rust · L380-L385 (6 LOC)src/converters/pptx.rs
fn test_extract_text_from_xml() {
let converter = PptxConverter::new();
let xml = "<a:t>Test Text</a:t>";
let result = converter.extract_text_from_xml(xml);
assert!(result.contains("Test Text"));
}rtf_to_markdown function · rust · L30-L141 (112 LOC)src/converters/rtf.rs
fn rtf_to_markdown(&self, rtf: &str) -> String {
let mut markdown = String::new();
markdown.push_str("# Document\n\n");
let mut text_content = String::new();
let mut in_control_word = false;
let mut brace_depth = 0;
let mut skip_next = false;
let chars: Vec<char> = rtf.chars().collect();
let mut i = 0;
while i < chars.len() {
let ch = chars[i];
if skip_next {
skip_next = false;
i += 1;
continue;
}
match ch {
'{' => {
brace_depth += 1;
}
'}' => {
brace_depth -= 1;
in_control_word = false;
}
'\\' => {
// Check for control word
if i + 1 < chars.len() {
let next_ch = chars[i + 1];
// Escape output_formats function · rust · L155-L167 (13 LOC)src/converters/rtf.rs
fn output_formats(&self) -> Vec<OutputFormat> {
vec![
OutputFormat::Markdown {
split_pages: false,
optimize_for_llm: true,
},
OutputFormat::Json {
structured: true,
include_metadata: true,
},
]
}convert function · rust · L168-L245 (78 LOC)src/converters/rtf.rs
async fn convert(
&self,
input: &Path,
output_format: OutputFormat,
_options: ConversionOptions,
) -> Result<ConversionResult> {
eprintln!("🔄 RTF Conversion (Pure Rust)");
eprintln!(" RTF → Parsing → {:?}", output_format);
eprintln!();
// Read RTF file
let rtf_content = fs::read_to_string(input).await?;
// Convert to requested format
let output_data = match output_format {
OutputFormat::Markdown { .. } => {
eprintln!("📝 Converting to Markdown...");
let markdown = self.rtf_to_markdown(&rtf_content);
markdown.into_bytes()
}
OutputFormat::Json { .. } => {
eprintln!("📝 Converting to JSON...");
let text = self.rtf_to_markdown(&rtf_content);
let json = serde_json::json!({
"text": {
"content": text,
metadata function · rust · L246-L254 (9 LOC)src/converters/rtf.rs
fn metadata(&self) -> ConverterMetadata {
ConverterMetadata {
name: "RTF Converter".to_string(),
version: env!("CARGO_PKG_VERSION").to_string(),
description: "RTF to Markdown converter (pure Rust, simplified parser)".to_string(),
external_deps: vec![],
}
}test_rtf_to_markdown_basic function · rust · L268-L277 (10 LOC)src/converters/rtf.rs
fn test_rtf_to_markdown_basic() {
let converter = RtfConverter::new();
// RTF with actual text commands
let rtf = r"{\rtf1\ansi\deff0 {\fonttbl {\f0 Times New Roman;}}
\f0\fs24 Hello World\par
}";
let result = converter.rtf_to_markdown(rtf);
// Simplified parser may not extract perfectly, just check it doesn't crash
assert!(!result.is_empty());
}All rows above produced by Repobility · https://repobility.com
default function · rust · L50-L57 (8 LOC)src/converters/traits.rs
fn default() -> Self {
Self {
name: "Unknown".to_string(),
version: "0.0.0".to_string(),
description: String::new(),
external_deps: Vec::new(),
}
}txt_to_markdown function · rust · L29-L56 (28 LOC)src/converters/txt.rs
fn txt_to_markdown(&self, text: &str) -> String {
let mut markdown = String::new();
markdown.push_str("# Document\n\n");
// Simple paragraph detection based on blank lines
let paragraphs: Vec<&str> = text
.split("\n\n")
.filter(|p| !p.trim().is_empty())
.collect();
for para in paragraphs {
let trimmed = para.trim();
// Detect if it might be a heading (short line, all caps, or ends with colon)
if trimmed.len() < 80
&& (trimmed
.chars()
.all(|c| !c.is_lowercase() || !c.is_alphabetic())
|| trimmed.ends_with(':'))
{
markdown.push_str(&format!("## {}\n\n", trimmed));
} else {
markdown.push_str(&format!("{}\n\n", trimmed));
}
}
markdown
}output_formats function · rust · L70-L82 (13 LOC)src/converters/txt.rs
fn output_formats(&self) -> Vec<OutputFormat> {
vec![
OutputFormat::Markdown {
split_pages: false,
optimize_for_llm: true,
},
OutputFormat::Json {
structured: true,
include_metadata: true,
},
]
}convert function · rust · L83-L160 (78 LOC)src/converters/txt.rs
async fn convert(
&self,
input: &Path,
output_format: OutputFormat,
_options: ConversionOptions,
) -> Result<ConversionResult> {
eprintln!("🔄 TXT Conversion (Pure Rust)");
eprintln!(" TXT → Encoding Detection → {:?}", output_format);
eprintln!();
// Read text file with encoding detection
let text_content = fs::read_to_string(input).await?;
// Convert to requested format
let output_data = match output_format {
OutputFormat::Markdown { .. } => {
eprintln!("📝 Converting to Markdown...");
let markdown = self.txt_to_markdown(&text_content);
markdown.into_bytes()
}
OutputFormat::Json { .. } => {
eprintln!("📝 Converting to JSON...");
let json = serde_json::json!({
"text": {
"content": text_content,
"lines": text_metadata function · rust · L161-L169 (9 LOC)src/converters/txt.rs
fn metadata(&self) -> ConverterMetadata {
ConverterMetadata {
name: "TXT Converter".to_string(),
version: env!("CARGO_PKG_VERSION").to_string(),
description: "Plain text to Markdown converter (pure Rust)".to_string(),
external_deps: vec![],
}
}test_txt_to_markdown_basic function · rust · L183-L188 (6 LOC)src/converters/txt.rs
fn test_txt_to_markdown_basic() {
let converter = TxtConverter::new();
let text = "This is a test\nSecond line";
let result = converter.txt_to_markdown(text);
assert!(result.contains("This is a test"));
}check_whisper function · rust · L40-L63 (24 LOC)src/converters/video.rs
/// Reports whether a `whisper` executable appears to be available.
///
/// First tries to run `whisper --help` via the command search path; if the
/// process cannot be spawned, falls back to checking whether a file exists at
/// `$HOME/.local/bin/whisper`, `/usr/local/bin/whisper` or `/usr/bin/whisper`.
fn check_whisper() -> bool {
    // Only whether the process could be launched matters here; its exit
    // status is not inspected.
    let launched = Command::new("whisper").arg("--help").output().is_ok();
    if launched {
        return true;
    }

    // Common installation locations. An unset HOME yields an empty prefix.
    let home = std::env::var("HOME").unwrap_or_default();
    let candidates = [
        format!("{}/.local/bin/whisper", home),
        String::from("/usr/local/bin/whisper"),
        String::from("/usr/bin/whisper"),
    ];

    candidates
        .iter()
        .any(|candidate| std::path::Path::new(candidate.as_str()).exists())
}
// get_whisper_cmd function · rust · L66-L85 (20 LOC)src/converters/video.rs
/// Picks the command used to invoke Whisper.
///
/// Returns the first of `$HOME/.local/bin/whisper`, `/usr/local/bin/whisper`
/// and `/usr/bin/whisper` that exists on disk, or the bare name `whisper`
/// when none of them does.
fn get_whisper_cmd() -> String {
    // An unset HOME yields an empty prefix for the per-user location.
    let home = std::env::var("HOME").unwrap_or_default();
    let install_locations = [
        format!("{}/.local/bin/whisper", home),
        String::from("/usr/local/bin/whisper"),
        String::from("/usr/bin/whisper"),
    ];

    install_locations
        .iter()
        .find(|location| std::path::Path::new(location.as_str()).exists())
        .cloned()
        // Nothing found on disk: fall back to the bare command name.
        .unwrap_or_else(|| String::from("whisper"))
}
// Repobility · code-quality intelligence platform · https://repobility.com
extract_audio function · rust · L88-L137 (50 LOC)src/converters/video.rs
async fn extract_audio(&self, video_path: &Path) -> Result<PathBuf> {
if !Self::check_ffmpeg() {
return Err(crate::TransmutationError::conversion_failed(
"FFmpeg not found. Install: sudo apt-get install ffmpeg",
));
}
// Create temporary audio file
let temp_audio = NamedTempFile::new().map_err(|e| {
crate::TransmutationError::conversion_failed(&format!(
"Failed to create temp file: {}",
e
))
})?;
let audio_path = temp_audio.path().with_extension("wav");
eprintln!("🎬 Extracting audio with FFmpeg...");
// Extract audio to WAV format
let output = Command::new("ffmpeg")
.arg("-i")
.arg(video_path)
.arg("-vn") // No video
.arg("-acodec")
.arg("pcm_s16le") // WAV format
.arg("-ar")
.arg("16000") // 16kHz sample rate (Whisper default)
transcribe_audio function · rust · L140-L195 (56 LOC)src/converters/video.rs
async fn transcribe_audio(&self, audio_path: &Path, language: Option<&str>) -> Result<String> {
if !Self::check_whisper() {
return Err(crate::TransmutationError::conversion_failed(
"Whisper not found. Install: pip install openai-whisper",
));
}
eprintln!("🎤 Running Whisper transcription...");
let whisper_cmd = Self::get_whisper_cmd();
let mut cmd = Command::new(&whisper_cmd);
cmd.arg(audio_path);
cmd.arg("--model").arg("base"); // Use base model
cmd.arg("--output_format").arg("txt");
cmd.arg("--output_dir").arg("/tmp");
if let Some(lang) = language {
cmd.arg("--language").arg(lang);
}
let output = cmd.output().map_err(|e| {
crate::TransmutationError::conversion_failed(&format!(
"Whisper execution failed: {}",
e
))
})?;
if !output.status.success() {
video_to_markdown function · rust · L198-L220 (23 LOC)src/converters/video.rs
async fn video_to_markdown(&self, video_path: &Path, language: Option<&str>) -> Result<String> {
// Extract audio
let audio_path = self.extract_audio(video_path).await?;
// Transcribe
let transcript = self.transcribe_audio(&audio_path, language).await?;
// Clean up audio file
let _ = tokio::fs::remove_file(&audio_path).await;
let mut markdown = String::new();
markdown.push_str("# Video Transcription\n\n");
if let Some(lang) = language {
markdown.push_str(&format!("**Language**: {}\n\n", lang));
}
markdown.push_str("## Transcript\n\n");
markdown.push_str(&transcript);
markdown.push('\n');
Ok(markdown)
}supported_formats function · rust · L231-L239 (9 LOC)src/converters/video.rs
fn supported_formats(&self) -> Vec<FileFormat> {
vec![
FileFormat::Mp4,
FileFormat::Avi,
FileFormat::Mkv,
FileFormat::Mov,
FileFormat::Webm,
]
}output_formats function · rust · L240-L252 (13 LOC)src/converters/video.rs
fn output_formats(&self) -> Vec<OutputFormat> {
vec![
OutputFormat::Markdown {
split_pages: false,
optimize_for_llm: true,
},
OutputFormat::Json {
structured: true,
include_metadata: true,
},
]
}convert function · rust · L253-L329 (77 LOC)src/converters/video.rs
async fn convert(
&self,
input: &Path,
output_format: OutputFormat,
_options: ConversionOptions,
) -> Result<ConversionResult> {
eprintln!("🔄 Video Transcription (FFmpeg + Whisper)");
eprintln!(" Video → Audio → Whisper → {:?}", output_format);
eprintln!();
let language = None; // Auto-detect
// Convert video to text
let markdown = self.video_to_markdown(input, language).await?;
// Convert to requested format
let output_data = match output_format {
OutputFormat::Markdown { .. } => {
eprintln!("✅ Transcription complete!");
markdown.into_bytes()
}
OutputFormat::Json { .. } => {
eprintln!("📝 Converting to JSON...");
let json = serde_json::json!({
"transcription": {
"text": markdown,
"language": language.unwrap_or("ametadata function · rust · L330-L338 (9 LOC)src/converters/video.rs
fn metadata(&self) -> ConverterMetadata {
ConverterMetadata {
name: "Video Transcription Converter".to_string(),
version: env!("CARGO_PKG_VERSION").to_string(),
description: "Video to text converter using FFmpeg + Whisper ASR".to_string(),
external_deps: vec!["ffmpeg".to_string(), "whisper".to_string()],
}
}read_xlsx function · rust · L35-L47 (13 LOC)src/converters/xlsx.rs
fn read_xlsx(&self, path: &Path) -> Result<umya_spreadsheet::Spreadsheet> {
eprintln!("📊 Reading XLSX file (umya-spreadsheet)...");
let book = umya_spreadsheet::reader::xlsx::read(path).map_err(|e| {
crate::TransmutationError::engine_error(
"xlsx-parser",
format!("Failed to read XLSX: {}", e),
)
})?;
eprintln!(" ✓ Found {} sheets", book.get_sheet_count());
Ok(book)
}Repobility's GitHub App fixes findings like these · https://github.com/apps/repobility-bot
to_markdown function · rust · L50-L101 (52 LOC)src/converters/xlsx.rs
fn to_markdown(&self, book: &umya_spreadsheet::Spreadsheet) -> String {
let mut markdown = String::new();
markdown.push_str("# Spreadsheet\n\n");
for (idx, sheet) in book.get_sheet_collection().iter().enumerate() {
let sheet_name = sheet.get_name();
markdown.push_str(&format!("## Sheet {}: {}\n\n", idx + 1, sheet_name));
// Get sheet dimensions
let highest_row = sheet.get_highest_row();
let highest_col = sheet.get_highest_column();
if highest_row == 0 || highest_col == 0 {
markdown.push_str("*(Empty sheet)*\n\n");
continue;
}
// Build table
for row in 1..=highest_row {
if row == 1 {
// Header row
markdown.push('|');
for col in 1..=highest_col {
let cell = sheet.get_cell((col, row));
let value = to_csv function · rust · L104-L131 (28 LOC)src/converters/xlsx.rs
fn to_csv(&self, book: &umya_spreadsheet::Spreadsheet, delimiter: char) -> String {
let mut csv = String::new();
// Get first sheet
if let Some(sheet) = book.get_sheet_collection().first() {
let highest_row = sheet.get_highest_row();
let highest_col = sheet.get_highest_column();
for row in 1..=highest_row {
let mut values = Vec::new();
for col in 1..=highest_col {
let cell = sheet.get_cell((col, row));
let value = cell.map(|c| c.get_value().to_string()).unwrap_or_default();
// Quote values with commas
if value.contains(delimiter) || value.contains('"') {
values.push(format!("\"{}\"", value.replace('"', "\"\"")));
} else {
values.push(value);
}
}
csv.push_str(&values.join(&delimiter.to_string(to_json function · rust · L134-L172 (39 LOC)src/converters/xlsx.rs
fn to_json(&self, book: &umya_spreadsheet::Spreadsheet) -> Result<String> {
use serde_json::json;
let mut sheets_json = Vec::new();
for sheet in book.get_sheet_collection() {
let sheet_name = sheet.get_name();
let highest_row = sheet.get_highest_row();
let highest_col = sheet.get_highest_column();
let mut rows = Vec::new();
for row in 1..=highest_row {
let mut row_data = Vec::new();
for col in 1..=highest_col {
let cell = sheet.get_cell((col, row));
let value = cell.map(|c| c.get_value().to_string()).unwrap_or_default();
row_data.push(value);
}
rows.push(row_data);
}
sheets_json.push(json!({
"name": sheet_name,
"rows": rows,
"row_count": highest_row,
"col_count": highest_col,
output_formats function · rust · L186-L202 (17 LOC)src/converters/xlsx.rs
fn output_formats(&self) -> Vec<OutputFormat> {
vec![
OutputFormat::Markdown {
split_pages: false,
optimize_for_llm: true,
},
OutputFormat::Csv {
delimiter: ',',
include_headers: true,
},
OutputFormat::Json {
structured: true,
include_metadata: true,
},
]
}convert function · rust · L203-L273 (71 LOC)src/converters/xlsx.rs
async fn convert(
&self,
input: &Path,
output_format: OutputFormat,
_options: ConversionOptions,
) -> Result<ConversionResult> {
eprintln!("🔄 XLSX Conversion (Pure Rust)");
eprintln!(" XLSX (ZIP) → XML Parsing → {:?}", output_format);
eprintln!();
// Read XLSX file
let book = self.read_xlsx(input)?;
// Convert to requested format
let output_data = match output_format {
OutputFormat::Markdown { .. } => {
eprintln!("📝 Converting to Markdown tables...");
self.to_markdown(&book).into_bytes()
}
OutputFormat::Csv { delimiter, .. } => {
eprintln!("📝 Converting to CSV (delimiter: '{}')...", delimiter);
self.to_csv(&book, delimiter).into_bytes()
}
OutputFormat::Json { .. } => {
eprintln!("📝 Converting to JSON...");
self.to_json(&book)?.into_bmetadata function · rust · L274-L284 (11 LOC)src/converters/xlsx.rs
fn metadata(&self) -> ConverterMetadata {
ConverterMetadata {
name: "XLSX Converter".to_string(),
version: env!("CARGO_PKG_VERSION").to_string(),
description:
"XLSX to Markdown/CSV/JSON/XML converter (pure Rust, no LibreOffice needed)"
.to_string(),
external_deps: vec![],
}
}xml_to_json function · rust · L34-L48 (15 LOC)src/converters/xml.rs
fn xml_to_json(&self, xml: &str) -> Result<String> {
use quick_xml::de::from_str;
use serde_json::Value;
// Parse XML to generic Value
let value: Value = from_str(xml).map_err(|e| {
crate::TransmutationError::engine_error(
"xml-parser",
format!("Failed to parse XML: {}", e),
)
})?;
// Convert to pretty JSON
Ok(serde_json::to_string_pretty(&value)?)
}xml_to_markdown function · rust · L51-L94 (44 LOC)src/converters/xml.rs
fn xml_to_markdown(&self, xml: &str) -> Result<String> {
use quick_xml::Reader;
use quick_xml::events::Event;
let mut reader = Reader::from_str(xml);
reader.config_mut().trim_text(true);
let mut markdown = String::new();
markdown.push_str("# XML Document\n\n");
let mut current_element = String::new();
let mut text_parts = Vec::new();
let mut buf = Vec::new();
loop {
match reader.read_event_into(&mut buf) {
Ok(Event::Start(e)) => {
current_element = String::from_utf8_lossy(e.name().as_ref()).to_string();
}
Ok(Event::Text(e)) => {
if let Ok(text) = e.unescape() {
let content = text.trim();
if !content.is_empty() && !current_element.is_empty() {
text_parts.push(format!("**{}**: {}", current_element, content));
Source: Repobility analyzer · https://repobility.com
output_formats function · rust · L108-L120 (13 LOC)src/converters/xml.rs
fn output_formats(&self) -> Vec<OutputFormat> {
vec![
OutputFormat::Markdown {
split_pages: false,
optimize_for_llm: true,
},
OutputFormat::Json {
structured: true,
include_metadata: false,
},
]
}convert function · rust · L121-L192 (72 LOC)src/converters/xml.rs
async fn convert(
&self,
input: &Path,
output_format: OutputFormat,
_options: ConversionOptions,
) -> Result<ConversionResult> {
eprintln!("🔄 XML Conversion (Pure Rust)");
eprintln!(" XML → Parsing → {:?}", output_format);
eprintln!();
// Read XML file
let xml_content = fs::read_to_string(input).await?;
// Convert to requested format
let output_data = match output_format {
OutputFormat::Markdown { .. } => {
eprintln!("📝 Converting to Markdown...");
let markdown = self.xml_to_markdown(&xml_content)?;
markdown.into_bytes()
}
OutputFormat::Json { .. } => {
eprintln!("📝 Converting to JSON...");
let json = self.xml_to_json(&xml_content)?;
json.into_bytes()
}
_ => {
return Err(crate::TransmutationError::UnsupportedFormat(fometadata function · rust · L193-L201 (9 LOC)src/converters/xml.rs
fn metadata(&self) -> ConverterMetadata {
ConverterMetadata {
name: "XML Converter".to_string(),
version: env!("CARGO_PKG_VERSION").to_string(),
description: "XML to Markdown/JSON converter (pure Rust)".to_string(),
external_deps: vec![],
}
}test_xml_to_json_basic function · rust · L215-L220 (6 LOC)src/converters/xml.rs
fn test_xml_to_json_basic() {
let converter = XmlConverter::new();
let xml = "<root><item>test</item></root>";
let result = converter.xml_to_json(xml);
assert!(result.is_ok());
}new function · rust · L28-L34 (7 LOC)src/document/hierarchy_builder.rs
pub fn new() -> Self {
Self {
enable_section_tree: true,
enable_list_grouping: true,
enable_caption_pairing: true,
}
}build function · rust · L37-L60 (24 LOC)src/document/hierarchy_builder.rs
pub fn build(&self, filename: String, mut items: Vec<DocItem>) -> Result<DoclingDocument> {
// Build section tree if enabled
if self.enable_section_tree {
items = self.build_section_tree(items)?;
}
// Group consecutive list items if enabled
if self.enable_list_grouping {
items = self.group_list_items(items)?;
}
// Pair captions with figures/tables if enabled
if self.enable_caption_pairing {
items = self.pair_captions(items)?;
}
let mut doc = DoclingDocument::new(filename);
for item in items {
doc.add_item(item);
}
Ok(doc)
}build_section_tree function · rust · L65-L93 (29 LOC)src/document/hierarchy_builder.rs
fn build_section_tree(&self, items: Vec<DocItem>) -> Result<Vec<DocItem>> {
let mut result = Vec::new();
let mut current_level = 0;
for item in items {
match &item {
DocItem::SectionHeader(header) => {
// Ensure level progression is reasonable (no jumps > 1)
let adjusted_level = if header.level > current_level + 1 {
current_level + 1
} else {
header.level
};
current_level = adjusted_level;
result.push(DocItem::SectionHeader(SectionHeaderItem {
level: adjusted_level,
..header.clone()
}));
}
_ => {
result.push(item);
}
}
}
Ok(result)
}group_list_items function · rust · L98-L137 (40 LOC)src/document/hierarchy_builder.rs
fn group_list_items(&self, items: Vec<DocItem>) -> Result<Vec<DocItem>> {
let mut result = Vec::new();
let mut current_list: Vec<ListItemData> = Vec::new();
let mut prev_level = 0;
for item in items {
match item {
DocItem::ListItem(ref list_item) => {
// Adjust level based on previous items
let adjusted_level = if list_item.level > prev_level + 1 {
prev_level + 1
} else {
list_item.level
};
prev_level = adjusted_level;
current_list.push(ListItemData {
level: adjusted_level,
..list_item.clone()
});
}
_ => {
// Flush accumulated list items
for list_item in current_list.drain(..) {
result.push(DocAll rows above produced by Repobility · https://repobility.com
pair_captions function · rust · L142-L244 (103 LOC)src/document/hierarchy_builder.rs
fn pair_captions(&self, items: Vec<DocItem>) -> Result<Vec<DocItem>> {
if items.len() < 2 {
return Ok(items);
}
let mut result = Vec::new();
let mut i = 0;
while i < items.len() {
let item = &items[i];
match item {
&DocItem::Table(ref table) if table.caption.is_none() => {
// Check if next item is caption (caption after table)
if i + 1 < items.len() {
if let DocItem::Paragraph(ref text_item) = items[i + 1] {
if Self::is_likely_caption(&text_item.text) {
// Merge caption into table
let mut new_table = table.clone();
new_table.caption = Some(text_item.text.clone());
result.push(DocItem::Table(new_table));
i += 2; // Skip caption
is_likely_caption function · rust · L247-L258 (12 LOC)src/document/hierarchy_builder.rs
/// Heuristically decides whether `text` looks like a figure or table caption.
///
/// Matching is case-insensitive. The text is considered a caption when it
/// * starts with one of the common caption prefixes (`"figure "`, `"fig. "`,
///   `"table "`, `"image "`, `"chart "`, `"graph "`), or
/// * is shorter than 100 bytes (after lower-casing) and contains `figure` or
///   `table` (singular or plural) as a whole word.
///
/// Whole-word matching is deliberate: a plain substring test would classify
/// ordinary sentences containing "configured", "stable" or "comfortable" as
/// captions.
fn is_likely_caption(text: &str) -> bool {
    // Common caption patterns.
    const PREFIXES: [&str; 6] = ["figure ", "fig. ", "table ", "image ", "chart ", "graph "];
    const KEYWORDS: [&str; 4] = ["figure", "figures", "table", "tables"];
    const MAX_KEYWORD_CAPTION_LEN: usize = 100;

    let lower = text.to_lowercase();

    if PREFIXES.iter().any(|prefix| lower.starts_with(*prefix)) {
        return true;
    }

    // Short text that mentions a figure/table as a standalone word.
    lower.len() < MAX_KEYWORD_CAPTION_LEN
        && lower
            .split(|c: char| !c.is_alphanumeric())
            .any(|word| KEYWORDS.iter().any(|keyword| *keyword == word))
}
// add_relationship function · rust · L287-L293 (7 LOC)src/document/hierarchy_builder.rs
pub fn add_relationship(&mut self, from: String, to: String, rel_type: &str) {
let key = format!("{}:{}", from, rel_type);
self.relationships
.entry(key)
.or_insert_with(Vec::new)
.push(to);
}build_from_document function · rust · L302-L332 (31 LOC)src/document/hierarchy_builder.rs
pub fn build_from_document(&mut self, doc: &DoclingDocument) -> Result<()> {
let mut current_section: Option<String> = None;
let mut _current_figure: Option<String> = None;
for (idx, item) in doc.items.iter().enumerate() {
let item_ref = format!("item_{}", idx);
match item {
DocItem::SectionHeader(_) => {
current_section = Some(item_ref.clone());
}
DocItem::Paragraph(_) | DocItem::ListItem(_) => {
// Link to current section
if let Some(ref section) = current_section {
self.add_relationship(section.clone(), item_ref.clone(), "contains");
}
}
DocItem::Table(_) | DocItem::Picture(_) => {
_current_figure = Some(item_ref.clone());
// Link to current section
if let Some(ref section) = current_sectitest_is_likely_caption function · rust · L347-L356 (10 LOC)src/document/hierarchy_builder.rs
fn test_is_likely_caption() {
assert!(HierarchyBuilder::is_likely_caption(
"Figure 1: This is a test"
));
assert!(HierarchyBuilder::is_likely_caption("Table 2: Results"));
assert!(HierarchyBuilder::is_likely_caption("Fig. 3: Sample data"));
assert!(!HierarchyBuilder::is_likely_caption(
"This is a regular paragraph with no caption markers."
));
}test_section_tree_validation function · rust · L359-L387 (29 LOC)src/document/hierarchy_builder.rs
fn test_section_tree_validation() {
let builder = HierarchyBuilder::new();
let items = vec![
DocItem::Title(TextItem {
text: "Title".to_string(),
formatting: None,
label: DocItemLabel::Title,
}),
DocItem::SectionHeader(SectionHeaderItem {
text: "Section 1".to_string(),
level: 1,
formatting: None,
}),
DocItem::SectionHeader(SectionHeaderItem {
text: "Section 1.1".to_string(),
level: 5, // Invalid jump - should be corrected to 2
formatting: None,
}),
];
let result = builder.build_section_tree(items).unwrap();
if let DocItem::SectionHeader(header) = &result[2] {
assert_eq!(header.level, 2); // Should be corrected from 5 to 2
} else {
panic!("Expected SectionHeader");
}
}test_caption_pairing function · rust · L390-L418 (29 LOC)src/document/hierarchy_builder.rs
fn test_caption_pairing() {
let builder = HierarchyBuilder::new();
let items = vec![
DocItem::Paragraph(TextItem {
text: "Figure 1: A beautiful chart".to_string(),
formatting: None,
label: DocItemLabel::Caption,
}),
DocItem::Picture(crate::document::types::PictureItem {
caption: None,
placeholder: "image".to_string(),
}),
];
let result = builder.pair_captions(items).unwrap();
assert_eq!(result.len(), 1); // Caption should be merged
if let DocItem::Picture(picture) = &result[0] {
assert!(picture.caption.is_some());
assert_eq!(
picture.caption.as_ref().unwrap(),
"Figure 1: A beautiful chart"
);
} else {
panic!("Expected Picture");
}
}default function · rust · L27-L34 (8 LOC)src/document/page_assembler.rs
fn default() -> Self {
Self {
enable_text_sanitization: true,
enable_heading_detection: true,
enable_list_detection: true,
merge_adjacent_text: true,
}
}Repobility · code-quality intelligence platform · https://repobility.com
new function · rust · L45-L50 (6 LOC)src/document/page_assembler.rs
pub fn new(options: PageAssemblerOptions) -> Self {
Self {
options,
sanitizer: TextSanitizer::new(),
}
}assemble function · rust · L53-L67 (15 LOC)src/document/page_assembler.rs
pub fn assemble(&self, clusters: &[Cluster]) -> Result<Vec<DocItem>> {
let mut items = Vec::new();
for cluster in clusters {
let doc_items = self.process_cluster(cluster)?;
items.extend(doc_items);
}
// Post-processing: merge adjacent text blocks if enabled
if self.options.merge_adjacent_text {
items = self.merge_adjacent_text_items(items)?;
}
Ok(items)
}process_cluster function · rust · L70-L89 (20 LOC)src/document/page_assembler.rs
fn process_cluster(&self, cluster: &Cluster) -> Result<Vec<DocItem>> {
match cluster.label {
DocItemLabel::Title => self.process_title(cluster),
DocItemLabel::SectionHeader => self.process_section_header(cluster),
DocItemLabel::Paragraph | DocItemLabel::Text => self.process_text(cluster),
DocItemLabel::ListItem => self.process_list_item(cluster),
DocItemLabel::Caption => self.process_caption(cluster),
DocItemLabel::Footnote => self.process_footnote(cluster),
DocItemLabel::PageHeader | DocItemLabel::PageFooter => {
self.process_header_footer(cluster)
}
DocItemLabel::Table => self.process_table(cluster),
DocItemLabel::Picture | DocItemLabel::Figure => self.process_picture(cluster),
DocItemLabel::Code => self.process_code(cluster),
DocItemLabel::Formula => self.process_formula(cluster),
DocItemLabel::Checkbo