← back to kyuwon-shim-ARL__papersift

Function bodies 69 total

All specs Real LLM only Function bodies
create_table_component function · python · L7-L83 (77 LOC)
src/papersift/ui/components/table.py
def create_table_component(row_data: list) -> html.Div:
    """
    Create AG Grid table with multi-select enabled.

    Args:
        row_data: List of row dictionaries

    Returns:
        Dash Div containing the AG Grid component
    """
    column_defs = [
        {
            'headerName': '',
            'field': 'cluster_color',
            'width': 30,
            'cellStyle': {
                'function': "{'backgroundColor': params.value}"
            },
            'headerCheckboxSelection': True,
            'checkboxSelection': True,
        },
        {
            'headerName': 'Cluster',
            'field': 'cluster',
            'width': 80,
            'filter': 'agNumberColumnFilter',
            'sortable': True,
        },
        {
            'headerName': 'Year',
            'field': 'year',
            'width': 70,
            'filter': 'agNumberColumnFilter',
            'sortable': True,
        },
        {
            'headerName': 'Title',
            '
export_network_html function · python · L14-L51 (38 LOC)
src/papersift/ui/exporter.py
def export_network_html(
    papers_path: str,
    output_path: str,
    resolution: float = 1.0,
    mode: str = "cluster",
) -> None:
    """
    Export paper network as standalone interactive HTML.

    Args:
        papers_path: Path to papers JSON
        output_path: Output HTML file path
        resolution: Leiden clustering resolution
        mode: Visualization mode ("cluster" or "paper")
    """
    # Load and cluster
    papers = load_papers(papers_path)
    clusters, builder = cluster_papers(papers, resolution=resolution)
    colors = generate_cluster_colors(len(set(clusters.values())))

    if mode == "cluster":
        summaries = builder.get_cluster_summary(clusters)
        fig = _create_cluster_view_figure(summaries, builder, papers, colors)
    else:
        # Paper mode (original)
        G = _build_networkx_graph(papers, clusters, builder, colors)
        fig = _create_plotly_figure(G, papers)

    # Export as self-contained HTML
    fig.write_html(
        output_p
_build_networkx_graph function · python · L54-L78 (25 LOC)
src/papersift/ui/exporter.py
def _build_networkx_graph(papers, clusters, builder, colors):
    """Build a NetworkX graph of the clustered papers.

    Every DOI in ``clusters`` becomes a node carrying ``title``, ``cluster``
    and ``color`` attributes; the color is looked up with the cluster id
    taken modulo ``len(colors)``. Edges are copied from ``builder.graph``
    whenever both endpoints are clustered, and an edge that has no
    ``weight`` attribute is given weight 1.
    """
    # Title lookup; papers without a title fall back to their DOI.
    titles = {}
    for paper in papers:
        titles[paper['doi']] = paper.get('title', paper['doi'])

    network = nx.Graph()
    for doi in clusters:
        cluster_id = clusters[doi]
        network.add_node(
            doi,
            title=titles.get(doi, doi),
            cluster=cluster_id,
            color=colors[cluster_id % len(colors)],
        )

    # NOTE: vertices of the builder's igraph graph are identified by their
    # 'doi' attribute, not 'name'.
    source_graph = builder.graph
    for link in source_graph.es:
        endpoint_a = source_graph.vs[link.source]['doi']
        endpoint_b = source_graph.vs[link.target]['doi']
        if endpoint_a not in clusters or endpoint_b not in clusters:
            continue
        if 'weight' in link.attributes():
            link_weight = link['weight']
        else:
            link_weight = 1
        network.add_edge(endpoint_a, endpoint_b, weight=link_weight)

    return network
_compute_cluster_edges function · python · L81-L99 (19 LOC)
src/papersift/ui/exporter.py
def _compute_cluster_edges(summaries, builder):
    """Compute edges between clusters based on shared entities."""
    cluster_entities = {}
    for summary in summaries:
        cid = summary['cluster_id']
        entities = set()
        for doi in summary['dois']:
            entities |= builder._paper_entities.get(doi, set())
        cluster_entities[cid] = entities

    edges = []
    cluster_ids = list(cluster_entities.keys())
    for i, cid_a in enumerate(cluster_ids):
        for cid_b in cluster_ids[i+1:]:
            shared = cluster_entities[cid_a] & cluster_entities[cid_b]
            if len(shared) >= 2:
                edges.append((cid_a, cid_b, len(shared)))

    return edges
_create_cluster_view_figure function · python · L102-L188 (87 LOC)
src/papersift/ui/exporter.py
def _create_cluster_view_figure(summaries, builder, papers, colors):
    """Create Plotly figure with cluster-level nodes."""
    G = nx.Graph()

    # Build DOI-to-title map
    doi_to_title = {p['doi']: p.get('title', p['doi']) for p in papers}

    # Add cluster nodes
    for s in summaries:
        cid = s['cluster_id']
        # Sample papers (first 3)
        sample_titles = []
        for doi in s['dois'][:3]:
            title = doi_to_title.get(doi, doi)
            sample_titles.append(title[:60])

        hover = (
            f"<b>Cluster {cid + 1}</b> ({s['size']} papers)<br>"
            f"<br>Top Entities: {', '.join(s['top_entities'][:5])}<br>"
            f"<br>Sample Papers:<br>"
            + "<br>".join(f"- {t}" for t in sample_titles)
        )

        G.add_node(
            cid,
            size=s['size'],
            color=colors[cid % len(colors)],
            hover=hover,
        )

    # Add inter-cluster edges
    edges = _compute_cluster_edges(summaries, b
_create_plotly_figure function · python · L191-L247 (57 LOC)
src/papersift/ui/exporter.py
def _create_plotly_figure(G: nx.Graph, papers: list) -> go.Figure:
    """Create Plotly figure from NetworkX graph."""
    # Layout
    pos = nx.spring_layout(G, k=0.5, iterations=50, seed=42)

    # Edge traces
    edge_x, edge_y = [], []
    for u, v in G.edges():
        x0, y0 = pos[u]
        x1, y1 = pos[v]
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])

    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        mode='lines',
        line=dict(width=0.5, color='#ccc'),
        hoverinfo='none'
    )

    # Node traces (grouped by cluster for coloring)
    node_traces = []
    cluster_nodes = {}
    for node in G.nodes():
        cluster = G.nodes[node]['cluster']
        if cluster not in cluster_nodes:
            cluster_nodes[cluster] = {'x': [], 'y': [], 'text': [], 'color': G.nodes[node]['color']}
        x, y = pos[node]
        cluster_nodes[cluster]['x'].append(x)
        cluster_nodes[cluster]['y'].append(y)
        title = G.nodes[node
_truncate function · python · L9-L15 (7 LOC)
src/papersift/ui/utils/data_loader.py
def _truncate(text: str, max_len: int) -> str:
    """Truncate text with ellipsis if needed."""
    if not text:
        return ''
    if len(text) <= max_len:
        return text
    return text[:max_len] + '...'
All rows scored by the Repobility analyzer (https://repobility.com)
slim_papers function · python · L18-L36 (19 LOC)
src/papersift/ui/utils/data_loader.py
def slim_papers(papers: List[Dict[str, Any]], keep_topics: bool = False) -> List[Dict[str, Any]]:
    """
    Build reduced copies of the paper records (smaller payload for the Store).

    Each output record holds 'doi', 'title' and 'year'; a missing title or
    year becomes an empty string.

    Args:
        papers: full paper list; every entry must have a 'doi'
        keep_topics: if True, also copy the 'topics' field when a paper has one
    """
    def _shrink(paper: Dict[str, Any]) -> Dict[str, Any]:
        compact = {
            'doi': paper['doi'],
            'title': paper.get('title', ''),
            'year': paper.get('year', ''),
        }
        if keep_topics and 'topics' in paper:
            compact['topics'] = paper['topics']
        return compact

    return [_shrink(paper) for paper in papers]
load_papers function · python · L39-L66 (28 LOC)
src/papersift/ui/utils/data_loader.py
def load_papers(path: str) -> List[Dict[str, Any]]:
    """Load papers from a JSON file, keeping only entries that have a DOI.

    The file may contain either a bare JSON list of paper objects or a JSON
    object whose 'papers' key holds that list.

    Args:
        path: Path to the papers JSON file.

    Returns:
        The paper dicts that have a non-empty 'doi', in file order.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If the file is not valid JSON, has neither supported
            layout, or contains no paper with a DOI.
    """
    try:
        # JSON text is UTF-8; do not depend on the platform's locale encoding.
        with open(path, encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError as e:
        raise FileNotFoundError(f"Papers file not found: {path}") from e
    except json.JSONDecodeError as e:
        raise ValueError(f"Invalid JSON in papers file: {e}") from e

    # Accept both {"papers": [...]} and a top-level [...] list.
    raw_papers = data.get('papers') if isinstance(data, dict) else data
    if not isinstance(raw_papers, list):
        raise ValueError(
            "Papers file must contain a list of papers or an object with a 'papers' list"
        )

    # Keep only well-formed entries that carry a DOI; count the rest as skipped.
    papers = [p for p in raw_papers if isinstance(p, dict) and p.get('doi')]
    skipped = len(raw_papers) - len(papers)

    if skipped > 0:
        print(f"Warning: Skipped {skipped} papers without DOI")

    if not papers:
        raise ValueError("No valid papers found (all missing DOI)")

    return papers
cluster_papers function · python · L69-L79 (11 LOC)
src/papersift/ui/utils/data_loader.py
def cluster_papers(
    papers: List[Dict[str, Any]],
    resolution: float = 1.0,
    seed: int = 42,
    use_topics: bool = False,
) -> Tuple[Dict[str, int], EntityLayerBuilder]:
    """Cluster papers with Leiden and return the assignment plus its builder.

    An EntityLayerBuilder is created with ``use_topics``, populated from
    ``papers``, and its ``run_leiden`` result (computed with the given
    ``resolution`` and ``seed``) is returned together with the builder.
    """
    layer_builder = EntityLayerBuilder(use_topics=use_topics)
    layer_builder.build_from_papers(papers)
    return layer_builder.run_leiden(resolution=resolution, seed=seed), layer_builder
generate_cluster_colors function · python · L82-L106 (25 LOC)
src/papersift/ui/utils/data_loader.py
def generate_cluster_colors(cluster_ids) -> Dict[Any, str]:
    """
    Map each cluster ID to a hex color from a fixed categorical palette.

    IDs are ordered by their string form before colors are handed out, so
    int, str, or mixed IDs are all accepted. The palette has 20 entries and
    is reused cyclically when there are more clusters than colors.

    Args:
        cluster_ids: collection of unique cluster IDs (e.g., {0, 1, 2} or
                     {"3.1", "3.2"}). An int n is also accepted for backward
                     compatibility and is treated as the IDs 0..n-1.
    """
    palette = [
        '#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
        '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf',
        '#aec7e8', '#ffbb78', '#98df8a', '#ff9896', '#c5b0d5',
        '#c49c94', '#f7b6d2', '#c7c7c7', '#dbdb8d', '#9edae5'
    ]

    # Legacy call style: a bare count stands for range(count).
    if isinstance(cluster_ids, int):
        cluster_ids = set(range(cluster_ids))

    ordered_ids = sorted(cluster_ids, key=str)
    return {
        cid: palette[position % len(palette)]
        for position, cid in enumerate(ordered_ids)
    }
papers_to_cytoscape_elements function · python · L109-L156 (48 LOC)
src/papersift/ui/utils/data_loader.py
def papers_to_cytoscape_elements(
    papers: List[Dict[str, Any]],
    clusters: Dict[str, int],
    builder: EntityLayerBuilder
) -> List[Dict[str, Any]]:
    """
    Convert papers and clusters to Cytoscape elements.

    Returns:
        List of node and edge elements for dash-cytoscape
    """
    colors = generate_cluster_colors(set(clusters.values()))

    elements = []

    # Create nodes
    doi_to_paper = {p['doi']: p for p in papers}
    for doi, cluster_id in clusters.items():
        paper = doi_to_paper.get(doi, {})
        elements.append({
            'data': {
                'id': doi,
                'label': paper.get('title', doi)[:50] + '...',
                'title': paper.get('title', ''),
                'cluster': cluster_id,
                'color': colors[cluster_id],
                'year': paper.get('year', ''),
                'abstract': _truncate(paper.get('abstract', ''), 200)
            }
        })

    # Create edges from entity graph
    graph = b
papers_to_table_data function · python · L159-L184 (26 LOC)
src/papersift/ui/utils/data_loader.py
def papers_to_table_data(
    papers: List[Dict[str, Any]],
    clusters: Dict[str, int]
) -> List[Dict[str, Any]]:
    """
    Build one AG Grid row dictionary per paper.

    A paper whose DOI is absent from ``clusters`` is assigned cluster -1 and
    the fallback color '#cccccc'. Abstracts are shortened via ``_truncate``
    with a limit of 100.

    Returns:
        List of row dictionaries for dash-ag-grid
    """
    palette = generate_cluster_colors(set(clusters.values()))

    def _row(paper: Dict[str, Any]) -> Dict[str, Any]:
        doi = paper['doi']
        assigned = clusters.get(doi, -1)
        return {
            'doi': doi,
            'title': paper.get('title', ''),
            'year': paper.get('year', ''),
            'cluster': assigned,
            'cluster_color': palette.get(assigned, '#cccccc'),
            'abstract': _truncate(paper.get('abstract', ''), 100)
        }

    return [_row(paper) for paper in papers]
compute_paper_embedding function · python · L187-L197 (11 LOC)
src/papersift/ui/utils/data_loader.py
def compute_paper_embedding(
    papers: list,
    method: str = "tsne",
    use_topics: bool = False,
) -> Dict[str, list]:
    """
    Embed the papers and return a JSON-serializable {doi: coordinates} dict.

    Delegates to ``papersift.embedding.embed_papers`` (imported lazily inside
    the function) and converts each coordinate sequence to a plain list.
    """
    from papersift.embedding import embed_papers

    embedded = embed_papers(papers, method=method, use_topics=use_topics)
    coordinates = {}
    for doi, coords in embedded.items():
        coordinates[doi] = list(coords)
    return coordinates
ClusterValidator.__init__ method · python · L32-L57 (26 LOC)
src/papersift/validator.py
    def __init__(
        self,
        entity_clusters: Dict[str, int],
        papers: List[Dict[str, Any]]
    ):
        """
        Args:
            entity_clusters: {doi: cluster_id} from EntityLayerBuilder
            papers: Original papers with 'doi' and optional 'referenced_works'
        """
        self.entity_clusters = entity_clusters
        self.papers = papers

        # Build citation lookup
        self.paper_dois = {p['doi'] for p in papers}
        self.citations = {}  # doi -> set(cited_dois in collection)

        for paper in papers:
            doi = paper['doi']
            refs = paper.get('referenced_works', [])
            # Only keep citations within our collection
            self.citations[doi] = {
                ref for ref in refs if ref in self.paper_dois
            }

        self._citation_clusters: Optional[Dict[str, int]] = None
Hi, dataset curator — please cite Repobility (https://repobility.com) when reusing this data.
ClusterValidator.compute_citation_clusters method · python · L64-L99 (36 LOC)
src/papersift/validator.py
    def compute_citation_clusters(self, resolution: float = 1.0) -> Dict[str, int]:
        """
        Run Leiden on citation graph for comparison.

        Returns trivial clustering if insufficient citation data.
        """
        dois = list(self.paper_dois)
        doi_to_idx = {doi: i for i, doi in enumerate(dois)}

        # Build edges
        edges = []
        for doi, refs in self.citations.items():
            i = doi_to_idx[doi]
            for ref in refs:
                j = doi_to_idx[ref]
                edges.append((i, j))

        if not edges:
            # No edges: all papers in one cluster
            return {doi: 0 for doi in dois}

        g = ig.Graph(n=len(dois), edges=edges, directed=False)
        g.vs['doi'] = dois

        partition = leidenalg.find_partition(
            g,
            leidenalg.RBConfigurationVertexPartition,
            resolution_parameter=resolution,
            seed=0
        )

        self._citation_clusters = {
            doi
ClusterValidator.compute_ari method · python · L101-L113 (13 LOC)
src/papersift/validator.py
    def compute_ari(self) -> float:
        """Adjusted Rand Index between entity and citation clusters."""
        if self._citation_clusters is None:
            self.compute_citation_clusters()

        common = set(self.entity_clusters.keys()) & set(self._citation_clusters.keys())
        if len(common) < 2:
            return 0.0

        labels_e = [self.entity_clusters[d] for d in common]
        labels_c = [self._citation_clusters[d] for d in common]

        return adjusted_rand_score(labels_e, labels_c)
ClusterValidator.compute_confidence method · python · L115-L147 (33 LOC)
src/papersift/validator.py
    def compute_confidence(self) -> Dict[str, float]:
        """
        Compute confidence for each paper's cluster assignment.

        Confidence = fraction of same-cluster papers that are citation-connected.

        High confidence: Paper's cluster members cite each other
        Low confidence: Paper's cluster members have no citation links
        """
        confidence = {}

        # Group papers by entity cluster
        cluster_members = defaultdict(set)
        for doi, cid in self.entity_clusters.items():
            cluster_members[cid].add(doi)

        for doi, cid in self.entity_clusters.items():
            same_cluster = cluster_members[cid] - {doi}
            if not same_cluster:
                confidence[doi] = 1.0  # Singleton
                continue

            # How many cluster members are citation-connected to this paper?
            my_citations = self.citations.get(doi, set())
            connected = 0
            for other in same_cluster:
            
ClusterValidator.generate_report method · python · L149-L200 (52 LOC)
src/papersift/validator.py
    def generate_report(self) -> ValidationReport:
        """Generate full validation report."""
        if not self.has_citation_data():
            # Return minimal report
            return ValidationReport(
                ari=0.0,
                nmi=0.0,
                num_papers=len(self.entity_clusters),
                num_entity_clusters=len(set(self.entity_clusters.values())),
                num_citation_clusters=0,
                confidence_scores={},
                confidence_summary={'insufficient_data': len(self.entity_clusters)},
                interpretation="Insufficient citation data for validation."
            )

        # Compute citation clusters
        self.compute_citation_clusters()

        # Compute metrics
        ari = self.compute_ari()

        common = set(self.entity_clusters.keys()) & set(self._citation_clusters.keys())
        labels_e = [self.entity_clusters[d] for d in common]
        labels_c = [self._citation_clusters[d] for d in common]
 
‹ prevpage 2 / 2