Function bodies 69 total
create_table_component function · python · L7-L83 (77 LOC)src/papersift/ui/components/table.py
def create_table_component(row_data: list) -> html.Div:
"""
Create AG Grid table with multi-select enabled.
Args:
row_data: List of row dictionaries
Returns:
Dash Div containing the AG Grid component
"""
column_defs = [
{
'headerName': '',
'field': 'cluster_color',
'width': 30,
'cellStyle': {
'function': "{'backgroundColor': params.value}"
},
'headerCheckboxSelection': True,
'checkboxSelection': True,
},
{
'headerName': 'Cluster',
'field': 'cluster',
'width': 80,
'filter': 'agNumberColumnFilter',
'sortable': True,
},
{
'headerName': 'Year',
'field': 'year',
'width': 70,
'filter': 'agNumberColumnFilter',
'sortable': True,
},
{
'headerName': 'Title',
'export_network_html function · python · L14-L51 (38 LOC)src/papersift/ui/exporter.py
def export_network_html(
papers_path: str,
output_path: str,
resolution: float = 1.0,
mode: str = "cluster",
) -> None:
"""
Export paper network as standalone interactive HTML.
Args:
papers_path: Path to papers JSON
output_path: Output HTML file path
resolution: Leiden clustering resolution
mode: Visualization mode ("cluster" or "paper")
"""
# Load and cluster
papers = load_papers(papers_path)
clusters, builder = cluster_papers(papers, resolution=resolution)
colors = generate_cluster_colors(len(set(clusters.values())))
if mode == "cluster":
summaries = builder.get_cluster_summary(clusters)
fig = _create_cluster_view_figure(summaries, builder, papers, colors)
else:
# Paper mode (original)
G = _build_networkx_graph(papers, clusters, builder, colors)
fig = _create_plotly_figure(G, papers)
# Export as self-contained HTML
fig.write_html(
output_p_build_networkx_graph function · python · L54-L78 (25 LOC)src/papersift/ui/exporter.py
def _build_networkx_graph(papers, clusters, builder, colors):
    """Mirror the clustered PaperSift entity graph as a NetworkX graph.

    Args:
        papers: Paper dicts; each must have a 'doi' and may have a 'title'.
        clusters: Mapping of DOI to cluster id.
        builder: Object whose ``graph`` attribute is an igraph graph with a
            'doi' attribute on every vertex.
        colors: Color collection indexed by ``cluster_id % len(colors)``.

    Returns:
        An ``nx.Graph`` with one node per clustered DOI (attributes
        ``title``, ``cluster``, ``color``) and one ``weight``-attributed edge
        for every builder edge whose two endpoints are both clustered.
    """
    network = nx.Graph()

    # Nodes: fall back to the DOI itself when a paper has no title.
    titles = {paper['doi']: paper.get('title', paper['doi']) for paper in papers}
    for doi, cluster_id in clusters.items():
        node_attrs = {
            'title': titles.get(doi, doi),
            'cluster': cluster_id,
            'color': colors[cluster_id % len(colors)],
        }
        network.add_node(doi, **node_attrs)

    # Edges come from builder.graph (igraph).
    # NOTE: EntityLayerBuilder uses 'doi' attribute, not 'name'
    # Reference: entity_layer.py:323, data_loader.py:135-136
    entity_graph = builder.graph
    for link in entity_graph.es:
        src_doi = entity_graph.vs[link.source]['doi']
        dst_doi = entity_graph.vs[link.target]['doi']
        if src_doi not in clusters or dst_doi not in clusters:
            continue
        # Edges without a 'weight' attribute count as weight 1.
        link_weight = link['weight'] if 'weight' in link.attributes() else 1
        network.add_edge(src_doi, dst_doi, weight=link_weight)
    return network
# _compute_cluster_edges function · python · L81-L99 (19 LOC)src/papersift/ui/exporter.py
def _compute_cluster_edges(summaries, builder):
"""Compute edges between clusters based on shared entities."""
cluster_entities = {}
for summary in summaries:
cid = summary['cluster_id']
entities = set()
for doi in summary['dois']:
entities |= builder._paper_entities.get(doi, set())
cluster_entities[cid] = entities
edges = []
cluster_ids = list(cluster_entities.keys())
for i, cid_a in enumerate(cluster_ids):
for cid_b in cluster_ids[i+1:]:
shared = cluster_entities[cid_a] & cluster_entities[cid_b]
if len(shared) >= 2:
edges.append((cid_a, cid_b, len(shared)))
return edges_create_cluster_view_figure function · python · L102-L188 (87 LOC)src/papersift/ui/exporter.py
def _create_cluster_view_figure(summaries, builder, papers, colors):
"""Create Plotly figure with cluster-level nodes."""
G = nx.Graph()
# Build DOI-to-title map
doi_to_title = {p['doi']: p.get('title', p['doi']) for p in papers}
# Add cluster nodes
for s in summaries:
cid = s['cluster_id']
# Sample papers (first 3)
sample_titles = []
for doi in s['dois'][:3]:
title = doi_to_title.get(doi, doi)
sample_titles.append(title[:60])
hover = (
f"<b>Cluster {cid + 1}</b> ({s['size']} papers)<br>"
f"<br>Top Entities: {', '.join(s['top_entities'][:5])}<br>"
f"<br>Sample Papers:<br>"
+ "<br>".join(f"- {t}" for t in sample_titles)
)
G.add_node(
cid,
size=s['size'],
color=colors[cid % len(colors)],
hover=hover,
)
# Add inter-cluster edges
edges = _compute_cluster_edges(summaries, b_create_plotly_figure function · python · L191-L247 (57 LOC)src/papersift/ui/exporter.py
def _create_plotly_figure(G: nx.Graph, papers: list) -> go.Figure:
"""Create Plotly figure from NetworkX graph."""
# Layout
pos = nx.spring_layout(G, k=0.5, iterations=50, seed=42)
# Edge traces
edge_x, edge_y = [], []
for u, v in G.edges():
x0, y0 = pos[u]
x1, y1 = pos[v]
edge_x.extend([x0, x1, None])
edge_y.extend([y0, y1, None])
edge_trace = go.Scatter(
x=edge_x, y=edge_y,
mode='lines',
line=dict(width=0.5, color='#ccc'),
hoverinfo='none'
)
# Node traces (grouped by cluster for coloring)
node_traces = []
cluster_nodes = {}
for node in G.nodes():
cluster = G.nodes[node]['cluster']
if cluster not in cluster_nodes:
cluster_nodes[cluster] = {'x': [], 'y': [], 'text': [], 'color': G.nodes[node]['color']}
x, y = pos[node]
cluster_nodes[cluster]['x'].append(x)
cluster_nodes[cluster]['y'].append(y)
title = G.nodes[node_truncate function · python · L9-L15 (7 LOC)src/papersift/ui/utils/data_loader.py
def _truncate(text: str, max_len: int) -> str:
"""Truncate text with ellipsis if needed."""
if not text:
return ''
if len(text) <= max_len:
return text
return text[:max_len] + '...'All rows scored by the Repobility analyzer (https://repobility.com)
slim_papers function · python · L18-L36 (19 LOC)src/papersift/ui/utils/data_loader.py
def slim_papers(papers: List[Dict[str, Any]], keep_topics: bool = False) -> List[Dict[str, Any]]:
    """
    Build a lighter copy of each paper for the Store (smaller payload).

    Only 'doi', 'title' and 'year' are carried over; 'title' and 'year'
    default to an empty string when absent.

    Args:
        papers: full paper list; every paper must have a 'doi'
        keep_topics: if True, also copy the 'topics' field (when present)
            so re-clustering with use_topics still works

    Returns:
        New list of slimmed paper dicts, in the same order as ``papers``.
    """
    def _slim(paper: Dict[str, Any]) -> Dict[str, Any]:
        entry = {
            'doi': paper['doi'],
            'title': paper.get('title', ''),
            'year': paper.get('year', ''),
        }
        if keep_topics and 'topics' in paper:
            entry['topics'] = paper['topics']
        return entry

    return [_slim(paper) for paper in papers]
# load_papers function · python · L39-L66 (28 LOC)src/papersift/ui/utils/data_loader.py
def load_papers(path: str) -> List[Dict[str, Any]]:
    """Load papers from a JSON file, keeping only entries that have a DOI.

    The file may contain either a top-level list of paper objects or an
    object whose 'papers' key holds that list.

    Args:
        path: Path to the papers JSON file.

    Returns:
        The paper dicts with a truthy 'doi', in file order.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If the file is not valid JSON, does not contain a list
            of papers, or contains no paper with a DOI.
    """
    try:
        # Read as UTF-8 explicitly rather than the platform default encoding.
        with open(path, encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError as e:
        raise FileNotFoundError(f"Papers file not found: {path}") from e
    except json.JSONDecodeError as e:
        raise ValueError(f"Invalid JSON in papers file: {e}") from e

    # A bare list has no .get(); only unwrap the 'papers' key for objects.
    raw_papers = data.get('papers', data) if isinstance(data, dict) else data
    if not isinstance(raw_papers, list):
        raise ValueError("Papers file must contain a list of papers")

    # Validate required fields; non-dict entries are skipped like DOI-less ones.
    papers = [p for p in raw_papers if isinstance(p, dict) and p.get('doi')]
    skipped = len(raw_papers) - len(papers)
    if skipped > 0:
        print(f"Warning: Skipped {skipped} papers without DOI")
    if not papers:
        raise ValueError("No valid papers found (all missing DOI)")
    return papers
# cluster_papers function · python · L69-L79 (11 LOC)src/papersift/ui/utils/data_loader.py
def cluster_papers(
    papers: List[Dict[str, Any]],
    resolution: float = 1.0,
    seed: int = 42,
    use_topics: bool = False,
) -> Tuple[Dict[str, int], EntityLayerBuilder]:
    """Cluster papers with Leiden over an EntityLayerBuilder graph.

    Args:
        papers: Paper dicts handed to ``EntityLayerBuilder.build_from_papers``.
        resolution: Resolution forwarded to ``run_leiden``.
        seed: Seed forwarded to ``run_leiden``.
        use_topics: Forwarded to the ``EntityLayerBuilder`` constructor to
            enable topic-enhanced entities.

    Returns:
        ``(clusters, builder)``: the clustering returned by ``run_leiden``
        and the builder that produced it.
    """
    layer_builder = EntityLayerBuilder(use_topics=use_topics)
    layer_builder.build_from_papers(papers)
    assignments = layer_builder.run_leiden(resolution=resolution, seed=seed)
    return assignments, layer_builder
# generate_cluster_colors function · python · L82-L106 (25 LOC)src/papersift/ui/utils/data_loader.py
def generate_cluster_colors(cluster_ids) -> Dict[Any, str]:
    """
    Assign a color from a fixed 20-entry categorical palette to each cluster.

    IDs are ordered by their string form and colored in that order; the
    palette repeats once more than 20 IDs are given.

    Args:
        cluster_ids: collection of unique hashable cluster IDs
            (e.g., {0, 1, 2} or {"3.1", "3.2"}). An int ``n`` is also
            accepted for backward compatibility and means ``range(n)``.

    Returns:
        Mapping of cluster ID to hex color string.
    """
    palette = (
        '#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
        '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf',
        '#aec7e8', '#ffbb78', '#98df8a', '#ff9896', '#c5b0d5',
        '#c49c94', '#f7b6d2', '#c7c7c7', '#dbdb8d', '#9edae5',
    )
    # Backward compatibility: a bare count stands for the IDs 0..n-1.
    ids = set(range(cluster_ids)) if isinstance(cluster_ids, int) else cluster_ids
    # key=str keeps mixed int/str IDs sortable.
    ordered_ids = sorted(ids, key=str)
    return {
        cid: palette[position % len(palette)]
        for position, cid in enumerate(ordered_ids)
    }
# papers_to_cytoscape_elements function · python · L109-L156 (48 LOC)src/papersift/ui/utils/data_loader.py
def papers_to_cytoscape_elements(
papers: List[Dict[str, Any]],
clusters: Dict[str, int],
builder: EntityLayerBuilder
) -> List[Dict[str, Any]]:
"""
Convert papers and clusters to Cytoscape elements.
Returns:
List of node and edge elements for dash-cytoscape
"""
colors = generate_cluster_colors(set(clusters.values()))
elements = []
# Create nodes
doi_to_paper = {p['doi']: p for p in papers}
for doi, cluster_id in clusters.items():
paper = doi_to_paper.get(doi, {})
elements.append({
'data': {
'id': doi,
'label': paper.get('title', doi)[:50] + '...',
'title': paper.get('title', ''),
'cluster': cluster_id,
'color': colors[cluster_id],
'year': paper.get('year', ''),
'abstract': _truncate(paper.get('abstract', ''), 200)
}
})
# Create edges from entity graph
graph = bpapers_to_table_data function · python · L159-L184 (26 LOC)src/papersift/ui/utils/data_loader.py
def papers_to_table_data(
    papers: List[Dict[str, Any]],
    clusters: Dict[str, int]
) -> List[Dict[str, Any]]:
    """
    Build one AG Grid row per paper.

    Args:
        papers: paper dicts; every paper must have a 'doi'
        clusters: mapping of DOI to cluster id

    Returns:
        List of row dictionaries for dash-ag-grid, in the order of
        ``papers``. Papers missing from ``clusters`` get cluster -1 and the
        fallback color '#cccccc'; abstracts are truncated to 100 characters.
    """
    palette = generate_cluster_colors(set(clusters.values()))

    def _row(paper: Dict[str, Any]) -> Dict[str, Any]:
        doi = paper['doi']
        cluster_id = clusters.get(doi, -1)
        return {
            'doi': doi,
            'title': paper.get('title', ''),
            'year': paper.get('year', ''),
            'cluster': cluster_id,
            'cluster_color': palette.get(cluster_id, '#cccccc'),
            'abstract': _truncate(paper.get('abstract', ''), 100),
        }

    return [_row(paper) for paper in papers]
# compute_paper_embedding function · python · L187-L197 (11 LOC)src/papersift/ui/utils/data_loader.py
def compute_paper_embedding(
    papers: list,
    method: str = "tsne",
    use_topics: bool = False,
) -> Dict[str, list]:
    """
    Compute a standalone paper embedding in JSON-serializable form.

    Args:
        papers: papers forwarded to ``papersift.embedding.embed_papers``
        method: embedding method name forwarded to ``embed_papers``
        use_topics: forwarded to ``embed_papers``

    Returns:
        Mapping of DOI to a plain list of coordinates ({doi: [x, y]}).
    """
    # Imported at call time, as in the original module.
    from papersift.embedding import embed_papers

    embedding = embed_papers(papers, method=method, use_topics=use_topics)
    serializable = {}
    for doi, coords in embedding.items():
        # Convert each coordinate container to a plain list.
        serializable[doi] = list(coords)
    return serializable
# ClusterValidator.__init__ method · python · L32-L57 (26 LOC)src/papersift/validator.py
def __init__(
    self,
    entity_clusters: Dict[str, int],
    papers: List[Dict[str, Any]]
):
    """
    Store the entity clustering and index citations within the collection.

    Args:
        entity_clusters: {doi: cluster_id} from EntityLayerBuilder
        papers: Original papers with 'doi' and optional 'referenced_works'
    """
    self.entity_clusters = entity_clusters
    self.papers = papers

    # Build citation lookup
    self.paper_dois = {p['doi'] for p in papers}
    self.citations = {}  # doi -> set(cited_dois in collection)
    for paper in papers:
        # `or ()` also covers an explicit null value: dict.get() only
        # applies its default when the key is missing.
        refs = paper.get('referenced_works') or ()
        # Only keep citations within our collection
        self.citations[paper['doi']] = {
            ref for ref in refs if ref in self.paper_dois
        }

    # Stays None until a citation clustering has been computed.
    self._citation_clusters: Optional[Dict[str, int]] = None
# Hi, dataset curator — please cite Repobility (https://repobility.com) when reusing this data.
ClusterValidator.compute_citation_clusters method · python · L64-L99 (36 LOC)src/papersift/validator.py
def compute_citation_clusters(self, resolution: float = 1.0) -> Dict[str, int]:
"""
Run Leiden on citation graph for comparison.
Returns trivial clustering if insufficient citation data.
"""
dois = list(self.paper_dois)
doi_to_idx = {doi: i for i, doi in enumerate(dois)}
# Build edges
edges = []
for doi, refs in self.citations.items():
i = doi_to_idx[doi]
for ref in refs:
j = doi_to_idx[ref]
edges.append((i, j))
if not edges:
# No edges: all papers in one cluster
return {doi: 0 for doi in dois}
g = ig.Graph(n=len(dois), edges=edges, directed=False)
g.vs['doi'] = dois
partition = leidenalg.find_partition(
g,
leidenalg.RBConfigurationVertexPartition,
resolution_parameter=resolution,
seed=0
)
self._citation_clusters = {
doiClusterValidator.compute_ari method · python · L101-L113 (13 LOC)src/papersift/validator.py
def compute_ari(self) -> float:
    """Adjusted Rand Index between entity and citation clusters.

    Uses the cached citation clustering when available and computes it
    otherwise.

    Returns:
        The ARI over the papers present in both clusterings, or 0.0 when
        fewer than two papers are shared.
    """
    citation_clusters = self._citation_clusters
    if citation_clusters is None:
        # compute_citation_clusters() has an early return (no citation
        # edges) that hands back a clustering without caching it on the
        # instance, so fall back to the returned value instead of assuming
        # the attribute has been set.
        computed = self.compute_citation_clusters()
        if self._citation_clusters is not None:
            citation_clusters = self._citation_clusters
        else:
            citation_clusters = computed

    common = self.entity_clusters.keys() & citation_clusters.keys()
    if len(common) < 2:
        return 0.0

    # Both label lists iterate the same set, so they stay aligned.
    labels_e = [self.entity_clusters[d] for d in common]
    labels_c = [citation_clusters[d] for d in common]
    return adjusted_rand_score(labels_e, labels_c)
# ClusterValidator.compute_confidence method · python · L115-L147 (33 LOC)src/papersift/validator.py
def compute_confidence(self) -> Dict[str, float]:
"""
Compute confidence for each paper's cluster assignment.
Confidence = fraction of same-cluster papers that are citation-connected.
High confidence: Paper's cluster members cite each other
Low confidence: Paper's cluster members have no citation links
"""
confidence = {}
# Group papers by entity cluster
cluster_members = defaultdict(set)
for doi, cid in self.entity_clusters.items():
cluster_members[cid].add(doi)
for doi, cid in self.entity_clusters.items():
same_cluster = cluster_members[cid] - {doi}
if not same_cluster:
confidence[doi] = 1.0 # Singleton
continue
# How many cluster members are citation-connected to this paper?
my_citations = self.citations.get(doi, set())
connected = 0
for other in same_cluster:
ClusterValidator.generate_report method · python · L149-L200 (52 LOC)src/papersift/validator.py
def generate_report(self) -> ValidationReport:
"""Generate full validation report."""
if not self.has_citation_data():
# Return minimal report
return ValidationReport(
ari=0.0,
nmi=0.0,
num_papers=len(self.entity_clusters),
num_entity_clusters=len(set(self.entity_clusters.values())),
num_citation_clusters=0,
confidence_scores={},
confidence_summary={'insufficient_data': len(self.entity_clusters)},
interpretation="Insufficient citation data for validation."
)
# Compute citation clusters
self.compute_citation_clusters()
# Compute metrics
ari = self.compute_ari()
common = set(self.entity_clusters.keys()) & set(self._citation_clusters.keys())
labels_e = [self.entity_clusters[d] for d in common]
labels_c = [self._citation_clusters[d] for d in common]
‹ prevpage 2 / 2