From 8b41de56a36c8fb9f2c20fdcb3d6f07a8a2a662b Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Sun, 23 Jun 2024 00:17:58 +0200 Subject: [PATCH] feat: tool and scripts to interactively explore webgraph Refactor: move class Graph into own file. --- .../commoncrawl/webgraph/explore/Graph.java | 172 ++++++++++++++++++ .../webgraph/explore/GraphExplorer.java | 145 +-------------- 2 files changed, 173 insertions(+), 144 deletions(-) create mode 100644 src/main/java/org/commoncrawl/webgraph/explore/Graph.java diff --git a/src/main/java/org/commoncrawl/webgraph/explore/Graph.java b/src/main/java/org/commoncrawl/webgraph/explore/Graph.java new file mode 100644 index 0000000..a71bd71 --- /dev/null +++ b/src/main/java/org/commoncrawl/webgraph/explore/Graph.java @@ -0,0 +1,172 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * Copyright (C) 2024 Common Crawl and contributors + */ +package org.commoncrawl.webgraph.explore; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Paths; + +import org.commoncrawl.webgraph.CountingMergedIntIterator; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import it.unimi.dsi.fastutil.io.BinIO; +import it.unimi.dsi.fastutil.longs.LongArrayList; +import it.unimi.dsi.sux4j.mph.GOV4Function; +import it.unimi.dsi.util.FrontCodedStringList; +import it.unimi.dsi.util.ImmutableExternalPrefixMap; +import it.unimi.dsi.util.ShiftAddXorSignedStringMap; +import it.unimi.dsi.webgraph.ImmutableGraph; +import it.unimi.dsi.webgraph.LazyIntIterator; + +/** + * Holds webgraph-related data structures and access methods for graph + * exploration. + */ +public class Graph { + + private static Logger LOG = LoggerFactory.getLogger(Graph.class); + + /** The base name of the graph */ + public String name; + /** The graph */ + public ImmutableGraph graph; + /** The transpose of the graph */ + public ImmutableGraph graphT; + + /* Maps to translate between vertex label an ID */ + protected ImmutableExternalPrefixMap vertexMap; + protected FrontCodedStringList vertexMapFcl; + protected ShiftAddXorSignedStringMap vertexMapSmph; + protected GOV4Function vertexMapMph; + + public Graph(String name) throws Exception { + this.name = name; + try { + LOG.info("Loading graph {}.graph", name); + graph = ImmutableGraph.loadMapped(name); + LOG.info("Loading transpose of the graph {}-t.graph", name); + graphT = ImmutableGraph.loadMapped(name + "-t"); + if (Files.exists(Paths.get(name + ".iepm"))) { + LOG.info("Loading vertex map {}.iepm (ImmutableExternalPrefixMap)", name); + vertexMap = (ImmutableExternalPrefixMap) BinIO.loadObject(name + ".iepm"); + } else if (Files.exists(Paths.get(name + ".fcl"))) { + LOG.info("Loading vertex map {}.fcl (FrontCodedStringList, maps vertex IDs to labels)", name); + vertexMapFcl = (FrontCodedStringList) BinIO.loadObject(name + ".fcl"); + if (Files.exists(Paths.get(name + ".smph"))) { + LOG.info("Loading vertex map {}.smph (string map perfect hash, maps vertex labels to IDs)", name); + vertexMapSmph = (ShiftAddXorSignedStringMap) BinIO.loadObject(name + ".smph"); + } else if (Files.exists(Paths.get(name + ".mph"))) { + LOG.info("Loading vertex map {}.mph (minimal perfect hash, maps vertex labels to IDs)", name); + vertexMapMph = (GOV4Function) BinIO.loadObject(name + ".mph"); + LOG.warn( + "Using a minimal perfect hash as vertex map does not allow to verify that a vertex label exists. " + + "Non-existant labels are mapped to quasi-random IDs."); + } else { + LOG.error("No vertex mapping found, cannot translate from vertex names to IDs."); + } + } else { + LOG.error("No vertex mapping found, cannot translate from vertex names to IDs."); + } + } catch (IOException | ClassNotFoundException e) { + LOG.error("Failed to load graph {}:", name, e); + throw e; + } + LOG.info("Loaded graph {}.graph", name); + } + + public String vertexIdToLabel(long id) { + if (vertexMap != null) { + return vertexMap.list().get((int) id).toString(); + } else { + return vertexMapFcl.get((int) id).toString(); + } + } + + public long vertexLabelToId(String label) { + if (vertexMap != null) { + return vertexMap.getLong(label); + } else if (vertexMapSmph != null) { + return vertexMapSmph.getLong(label); + } else if (vertexMapMph != null) { + return vertexMapMph.getLong(label); + } else { + throw new RuntimeException("No vertex map loaded."); + } + } + + public int outdegree(long vertexId) { + return graph.outdegree((int) vertexId); + } + + public int outdegree(String vertexLabel) { + return graph.outdegree((int) vertexLabelToId(vertexLabel)); + } + + public int indegree(long vertexId) { + return graphT.outdegree((int) vertexId); + } + + public int indegree(String vertexLabel) { + return graphT.outdegree((int) vertexLabelToId(vertexLabel)); + } + + public long[] sharedPredecessors(long[] vertices) { + return sharedPredecessors(vertices, vertices.length, vertices.length); + } + + public long[] sharedPredecessors(long[] vertices, int minShared, int maxShared) { + return sharedSuccessors(graphT, vertices, minShared, maxShared); + } + + public long[] sharedSuccessors(long[] vertices) { + return sharedSuccessors(vertices, vertices.length, vertices.length); + } + + public long[] sharedSuccessors(long[] vertices, int minShared, int maxShared) { + return sharedSuccessors(graph, vertices, minShared, maxShared); + } + + /** + * Get shared successors (children) of all {@code vertices} in a {@code graph}. + * The parameters {@code minShared} and {@code maxShared} allow to select the + * intersection, the union or a subset with a specific overlap (shared + * successors). If vertex a has the successors d, e, vertex + * b has d, f and vertex c has d, e, g, then + *
    + *
  • {@code minShared} = {@code maxShared} = {@code vertices.length} returns + * the intersection (d)
  • + *
  • {@code minShared} = 1 and {@code maxShared} = {@code vertices.length} + * returns the union (d, e, f)
  • + *
  • {@code minShared} = {@code maxShared} = 2 returns all successors shared + * by exactly two of the {@code vertices} (e)
  • + *
+ * + * @param graph the graph used to access the successors of a vertex (the + * transpose of the graph will give the predecessors of the + * vertex) + * @param vertices list of vertex IDs + * @param minShared the minimum number of shared links to successors + * @param maxShared the minimum number of shared links to successors + * @return shared successors + */ + public long[] sharedSuccessors(ImmutableGraph graph, long[] vertices, int minShared, int maxShared) { + LazyIntIterator[] iters = new LazyIntIterator[vertices.length]; + for (int i = 0; i < vertices.length; i++) { + iters[i] = graph.successors((int) vertices[i]); + } + CountingMergedIntIterator iter = new CountingMergedIntIterator(iters); + LongArrayList res = new LongArrayList(); + int id; + while (iter.hasNext()) { + id = iter.nextInt(); + if (iter.getCount() >= minShared && iter.getCount() <= maxShared) { + res.add(id); + } + } + res.trim(); + return res.elements(); + } +} diff --git a/src/main/java/org/commoncrawl/webgraph/explore/GraphExplorer.java b/src/main/java/org/commoncrawl/webgraph/explore/GraphExplorer.java index 729eddd..9f96fe1 100644 --- a/src/main/java/org/commoncrawl/webgraph/explore/GraphExplorer.java +++ b/src/main/java/org/commoncrawl/webgraph/explore/GraphExplorer.java @@ -16,13 +16,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import it.unimi.dsi.fastutil.io.BinIO; -import it.unimi.dsi.fastutil.longs.LongArrayList; -import it.unimi.dsi.sux4j.mph.GOV4Function; -import it.unimi.dsi.util.FrontCodedStringList; -import it.unimi.dsi.util.ImmutableExternalPrefixMap; -import it.unimi.dsi.util.ShiftAddXorSignedStringMap; -import it.unimi.dsi.webgraph.ImmutableGraph; import it.unimi.dsi.webgraph.LazyIntIterator; /** @@ -33,70 +26,6 @@ public class GraphExplorer { private static Logger LOG = LoggerFactory.getLogger(GraphExplorer.class); - public class Graph { - public String name; - public ImmutableGraph graph; - public ImmutableGraph graphT; - public ImmutableExternalPrefixMap vertexMap; - public FrontCodedStringList vertexMapFcl; - public ShiftAddXorSignedStringMap vertexMapSmph; - public GOV4Function vertexMapMph; - - public Graph(String name) throws Exception { - this.name = name; - try { - LOG.info("Loading graph {}.graph", name); - graph = ImmutableGraph.loadMapped(name); - LOG.info("Loading transpose of the graph {}-t.graph", name); - graphT = ImmutableGraph.loadMapped(name + "-t"); - if (Files.exists(Paths.get(name + ".iepm"))) { - LOG.info("Loading vertex map {}.iepm (ImmutableExternalPrefixMap)", name); - vertexMap = (ImmutableExternalPrefixMap) BinIO.loadObject(name + ".iepm"); - } else if (Files.exists(Paths.get(name + ".fcl"))) { - LOG.info("Loading vertex map {}.fcl (FrontCodedStringList, maps vertex IDs to labels)", name); - vertexMapFcl = (FrontCodedStringList) BinIO.loadObject(name + ".fcl"); - if (Files.exists(Paths.get(name + ".smph"))) { - LOG.info("Loading vertex map {}.smph (string map perfect hash, maps vertex labels to IDs)", name); - vertexMapSmph = (ShiftAddXorSignedStringMap) BinIO.loadObject(name + ".smph"); - } else if (Files.exists(Paths.get(name + ".mph"))) { - LOG.info("Loading vertex map {}.mph (minimal perfect hash, maps vertex labels to IDs)", name); - vertexMapMph = (GOV4Function) BinIO.loadObject(name + ".mph"); - LOG.warn( - "Using a minimal perfect hash as vertex map does not allow to verify that a vertex label exists. " - + "Non-existant labels are mapped to quasi-random IDs."); - } else { - LOG.error("No vertex mapping found, cannot translate from vertex names to IDs."); - } - } else { - LOG.error("No vertex mapping found, cannot translate from vertex names to IDs."); - } - } catch (IOException | ClassNotFoundException e) { - LOG.error("Failed to load graph {}:", name, e); - throw e; - } - } - - public String vertexIdToLabel(long id) { - if (g.vertexMap != null) { - return g.vertexMap.list().get((int) id).toString(); - } else { - return g.vertexMapFcl.get((int) id).toString(); - } - } - - public long vertexLabelToId(String label) { - if (g.vertexMap != null) { - return g.vertexMap.getLong(label); - } else if (g.vertexMapSmph != null) { - return g.vertexMapSmph.getLong(label); - } else if (g.vertexMapMph != null) { - return g.vertexMapMph.getLong(label); - } else { - throw new RuntimeException("No vertex map loaded."); - } - } - } - public class Vertex { private long id; private String label; @@ -160,79 +89,6 @@ public void setVertex(long vertexId) { v = getVertex(vertexId); } - public int outdegree() { - return v.outdegree(); - } - public int outdegree(long vertexId) { - return g.graph.outdegree((int) vertexId); - } - public int outdegree(String vertexLabel) { - return g.graph.outdegree((int) g.vertexLabelToId(vertexLabel)); - } - public int indegree() { - return v.indegree(); - } - public int indegree(long vertexId) { - return g.graphT.outdegree((int) vertexId); - } - public int indegree(String vertexLabel) { - return g.graphT.outdegree((int) g.vertexLabelToId(vertexLabel)); - } - - public long[] sharedPredecessors(long[] vertices) { - return sharedPredecessors(vertices, vertices.length, vertices.length); - } - - public long[] sharedPredecessors(long[] vertices, int minShared, int maxShared) { - return sharedSuccessors(g.graphT, vertices, minShared, maxShared); - } - - public long[] sharedSuccessors(long[] vertices) { - return sharedSuccessors(vertices, vertices.length, vertices.length); - } - - public long[] sharedSuccessors(long[] vertices, int minShared, int maxShared) { - return sharedSuccessors(g.graph, vertices, minShared, maxShared); - } - - /** - * Get shared successors (children) of all {@code vertices} in a {@code graph}. - * The parameters {@code minShared} and {@code maxShared} allow to select the - * intersection, the union or a subset with a specific overlap (shared - * successors). If vertex a has the successors d, e, vertex - * b has d, f and vertex c has d, e, g, then - *
    - *
  • {@code minShared} = {@code maxShared} = {@code vertices.length} returns - * the intersection (d)
  • - *
  • {@code minShared} = 1 and {@code maxShared} = {@code vertices.length} - * returns the union (d, e, f)
  • - *
  • {@code minShared} = {@code maxShared} = 2 returns all successors shared - * by exactly two of the {@code vertices} (e)
  • - *
- * - * @param graph the webgraph used to achieve the successors - * @param vertices list of vertex IDs - * @param minShared the minimum number of shared links to successors - * @param maxShared the minimum number of shared links to successors - * @return shared successors - */ - public long[] sharedSuccessors(ImmutableGraph graph, long[] vertices, int minShared, int maxShared) { - LazyIntIterator[] iters = new LazyIntIterator[vertices.length]; - for (int i = 0; i < vertices.length; i++) { - iters[i] = graph.successors((int) vertices[i]); - } - CountingMergedIntIterator iter = new CountingMergedIntIterator(iters); - LongArrayList res = new LongArrayList(); - int id; - while (iter.hasNext()) { - id = iter.nextInt(); - if (iter.getCount() >= minShared && iter.getCount() <= maxShared) { - res.add(id); - } - } - res.trim(); - return res.elements(); - } /* Reimplementation of commands provided by pywebgraph (cn, pwn, ls, sl) */ @@ -322,6 +178,7 @@ public void sl(String vertexLabel) { sl(g.vertexLabelToId(vertexLabel)); } + /* Utilities */ public long[] loadVerticesFromFile(String fileName) {