From e684eb5da5d3bc469dce8420f4df978f97495839 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Sun, 23 Jun 2024 14:19:07 +0200 Subject: [PATCH] feat: tool and scripts to interactively explore webgraph - JShell script to load a graph - tutorial / quick start graph exploration --- README.md | 2 +- graph-exploration-README.md | 114 ++++++++++++++++++ .../graph_explore_load_graph.jsh | 32 +++++ 3 files changed, 147 insertions(+), 1 deletion(-) create mode 100644 graph-exploration-README.md create mode 100644 src/script/webgraph_ranking/graph_explore_load_graph.jsh diff --git a/README.md b/README.md index 7b42ede..beca2df 100644 --- a/README.md +++ b/README.md @@ -49,7 +49,7 @@ The shell script is easily adapted to your needs. Please refer to the [LAW datas The Common Crawl webgraph data sets are announced on the [Common Crawl web site](https://commoncrawl.org/tag/webgraph/). -Instructions how to explore the webgraphs are given in the [cc-notebooks project](//github.com/commoncrawl/cc-notebooks/tree/master/cc-webgraph-statistics). +For instructions on how to explore the webgraphs using the JShell, please see the tutorial [Interactive Graph Exploration](./graph-exploration-README.md). For an older approach using [Jython](https://www.jython.org/) and [pyWebGraph](https://github.com/mapio/py-web-graph), see the [cc-notebooks project](//github.com/commoncrawl/cc-notebooks/tree/master/cc-webgraph-statistics). ## Credits diff --git a/graph-exploration-README.md b/graph-exploration-README.md new file mode 100644 index 0000000..1a6a01c --- /dev/null +++ b/graph-exploration-README.md @@ -0,0 +1,114 @@ +# Interactive Graph Exploration + +A tutorial on how to interactively explore the Common Crawl webgraphs – or other graphs using the webgraph format – using the [JShell](https://docs.oracle.com/en/java/javase/21/jshell/index.html) and the [GraphExplorer](src/main/java/org/commoncrawl/webgraph/explore/GraphExplorer.java) class. + + +## Quick Start + +1. 
Change into the "cc-webgraph" project directory, [build the cc-webgraph jar](README.md#compiling-and-packaging-java-tools) and remember the project directory using an environment variable + + ``` + $> cd .../cc-webgraph + + $> mvn clean package + + $> CC_WEBGRAPH=$PWD + ``` + +2. Select a web graph you want to explore, choose a download directory and download the web graph + + ``` + $> GRAPH=cc-main-2024-feb-apr-may-domain + + $> mkdir .../my-webgraphs/$GRAPH + $> cd .../my-webgraphs/$GRAPH + ``` + + About 15 GiB of disk space is needed to hold all files of a domain-level webgraph. + + ``` + $> $CC_WEBGRAPH/src/script/webgraph_ranking/graph_explore_download_webgraph.sh $GRAPH + ``` + +3. Build the map from vertex label to vertex ID and vice versa. This allows you to look up a reverse domain name (e.g. "org.commoncrawl") and get the corresponding vertex ID. + + ``` + $> $CC_WEBGRAPH/src/script/webgraph_ranking/graph_explore_build_vertex_map.sh $GRAPH $GRAPH-vertices.txt.gz + ``` + +4. Launch the [JShell](https://docs.oracle.com/en/java/javase/21/jshell/index.html) + + ``` + $> jshell --class-path $CC_WEBGRAPH/target/cc-webgraph-*-jar-with-dependencies.jar + | Welcome to JShell -- Version 21.0.3 + | For an introduction type: /help intro + + jshell> + ``` + + Now you may play around with the JShell or load the GraphExplorer class and your graph: + + ``` + jshell> import org.commoncrawl.webgraph.explore.GraphExplorer + + jshell> GraphExplorer e = new GraphExplorer("cc-main-2024-feb-apr-may-domain") + 2024-06-23 13:38:51:084 +0200 [main] INFO Graph - Loading graph cc-main-2024-feb-apr-may-domain.graph + 2024-06-23 13:38:51:193 +0200 [main] INFO Graph - Loading transpose of the graph cc-main-2024-feb-apr-may-domain-t.graph + 2024-06-23 13:38:51:279 +0200 [main] INFO Graph - Loading vertex map cc-main-2024-feb-apr-may-domain.iepm (ImmutableExternalPrefixMap) + 2024-06-23 13:38:52:356 +0200 [main] INFO Graph - Loaded graph cc-main-2024-feb-apr-may-domain.graph + e ==> 
org.commoncrawl.webgraph.explore.GraphExplorer@4cc0edeb + ``` + + But for now, exit the JShell: + ``` + jshell> /exit + | Goodbye + ``` + + To make the loading easier, you may use the load script [graph_explore_load_graph.jsh](src/script/webgraph_ranking/graph_explore_load_graph.jsh) and pass the graph name as a Java property to the JShell via the command-line option `-R-Dgraph=$GRAPH` + + ``` + $> jshell --class-path $CC_WEBGRAPH/target/cc-webgraph-*-jar-with-dependencies.jar \ + -R-Dgraph=$GRAPH \ + $CC_WEBGRAPH/src/script/webgraph_ranking/graph_explore_load_graph.jsh + Loading graph cc-main-2024-feb-apr-may-domain + 2024-06-23 13:30:14:134 +0200 [main] INFO Graph - Loading graph cc-main-2024-feb-apr-may-domain.graph + 2024-06-23 13:30:14:340 +0200 [main] INFO Graph - Loading transpose of the graph cc-main-2024-feb-apr-may-domain-t.graph + 2024-06-23 13:30:14:439 +0200 [main] INFO Graph - Loading vertex map cc-main-2024-feb-apr-may-domain.iepm (ImmutableExternalPrefixMap) + 2024-06-23 13:30:15:595 +0200 [main] INFO Graph - Loaded graph cc-main-2024-feb-apr-may-domain.graph + + Graph cc-main-2024-feb-apr-may-domain loaded into GraphExplorer *e* + Type "e." and press <TAB> to list the public methods of the class GraphExplorer + ... or "g." for the graph loaded for exploration + + ... or use one of the predefined methods: + void cn(String) + void cn(long) + void pwn() + void ls() + void ls(long) + void ls(String) + void sl() + void sl(long) + void sl(String) + + | Welcome to JShell -- Version 21.0.3 + | For an introduction type: /help intro + + jshell> + ``` + + The predefined methods are those provided by [pyWebGraph](https://github.com/mapio/py-web-graph). 
+ + ``` + jshell> cn("org.commoncrawl") + #111997321 org.commoncrawl + + jshell> pwn() + #111997321 org.commoncrawl + + jshell> ls() // list successors (vertices linked from the domain commoncrawl.org or one of its subdomains) + + jshell> sl() // list predecessors (vertices connected via incoming links) + ``` + diff --git a/src/script/webgraph_ranking/graph_explore_load_graph.jsh b/src/script/webgraph_ranking/graph_explore_load_graph.jsh new file mode 100644 index 0000000..e884f76 --- /dev/null +++ b/src/script/webgraph_ranking/graph_explore_load_graph.jsh @@ -0,0 +1,32 @@ +/open PRINTING + +String graph = System.getProperty("graph") +println("Loading graph " + graph) + +import org.commoncrawl.webgraph.explore.Graph +import org.commoncrawl.webgraph.explore.GraphExplorer +import it.unimi.dsi.webgraph.ImmutableGraph + +GraphExplorer e = new GraphExplorer(graph) +Graph g = e.getGraph() + +println() +println("Graph " + graph + " loaded into GraphExplorer *e*") +println("Type \"e.\" and press <TAB> to list the public methods of the class GraphExplorer") +println("... or \"g.\" for the graph loaded for exploration") + +/* Define the commands provided by pyWebGraph (cn, pwn, ls, sl) as top-level JShell methods; each one only delegates to the GraphExplorer instance e, with the graph name read from the Java property "graph" above */ +void cn(String vertexLabel) { e.cn(vertexLabel); } +void cn(long vertexId) { e.cn(vertexId); } +void pwn() { e.pwn(); } +void ls() { e.ls(); } +void ls(long vertexId) { e.ls(vertexId); } +void ls(String vertexLabel) { e.ls(vertexLabel); } +void sl() { e.sl(); } +void sl(long vertexId) { e.sl(vertexId); } +void sl(String vertexLabel) { e.sl(vertexLabel); } + +println() +println("... or use one of the predefined methods:") +/methods cn pwn ls sl +println() \ No newline at end of file