Skip to content

Commit

Permalink
feat: tool and scripts to interactively explore webgraph
Browse files Browse the repository at this point in the history
Add methods to access and view successors/predecessors and
count top-level domains in lists of vertices.
  • Loading branch information
sebastian-nagel committed Jun 27, 2024
1 parent e684eb5 commit 153db36
Show file tree
Hide file tree
Showing 3 changed files with 214 additions and 9 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ public int compareTo(QueuedIterator o) {
}
}

public static int EMPTY_INPUT_ITERATOR_VALUE = LazyIntIterators.EMPTY_ITERATOR.nextInt();
public static int LAZY_INT_ITERATOR_EMPTY_VALUE = LazyIntIterators.EMPTY_ITERATOR.nextInt();

private final PriorityQueue<QueuedIterator> iters = new PriorityQueue<>();
private int currentCount = 0;
Expand All @@ -54,7 +54,7 @@ public int compareTo(QueuedIterator o) {
public CountingMergedIntIterator(LazyIntIterator... iterators) {
for (final LazyIntIterator iter : iterators) {
final QueuedIterator qiter = new QueuedIterator(iter);
if (qiter.value != EMPTY_INPUT_ITERATOR_VALUE) {
if (qiter.value != LAZY_INT_ITERATOR_EMPTY_VALUE) {
iters.add(qiter);
}
}
Expand Down Expand Up @@ -93,7 +93,7 @@ public int nextInt() {
while ((val = qiter.iter.nextInt()) == value) {
count++;
}
if (val != EMPTY_INPUT_ITERATOR_VALUE) {
if (val != LAZY_INT_ITERATOR_EMPTY_VALUE) {
qiter.value = val;
iters.add(qiter);
}
Expand Down
174 changes: 174 additions & 0 deletions src/main/java/org/commoncrawl/webgraph/explore/Graph.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,32 @@
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.AbstractMap.SimpleEntry;
import java.util.Arrays;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.PrimitiveIterator;
import java.util.stream.IntStream;
import java.util.stream.Stream;

import org.commoncrawl.webgraph.CountingMergedIntIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.longs.LongArrayList;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.sux4j.mph.GOV4Function;
import it.unimi.dsi.util.FrontCodedStringList;
import it.unimi.dsi.util.ImmutableExternalPrefixMap;
import it.unimi.dsi.util.Interval;
import it.unimi.dsi.util.ShiftAddXorSignedStringMap;
import it.unimi.dsi.webgraph.ImmutableGraph;
import it.unimi.dsi.webgraph.LazyIntIterator;
import it.unimi.dsi.webgraph.LazyIntIterators;

/**
* Holds webgraph-related data structures and access methods for graph
Expand All @@ -42,6 +55,8 @@ public class Graph {
protected ShiftAddXorSignedStringMap vertexMapSmph;
protected GOV4Function<String> vertexMapMph;

/**
 * Value returned by {@code LazyIntIterators.EMPTY_ITERATOR.nextInt()}; compared
 * against the result of {@link LazyIntIterator#nextInt()} to detect the end of
 * an iteration. Declared {@code final} because it is a constant and must never
 * be reassigned.
 */
private static final int LAZY_INT_ITERATOR_EMPTY_VALUE = LazyIntIterators.EMPTY_ITERATOR.nextInt();

public Graph(String name) throws Exception {
this.name = name;
try {
Expand Down Expand Up @@ -113,6 +128,157 @@ public int indegree(String vertexLabel) {
return graphT.outdegree((int) vertexLabelToId(vertexLabel));
}

/**
 * Looks up the successors of a vertex in {@code graph}.
 *
 * @param vertexId vertex ID; narrowed to {@code int} before the lookup
 * @return the array handed back by {@code graph.successorArray}
 */
// NOTE(review): WebGraph's successorArray contract appears to allow arrays
// longer than the outdegree - confirm callers only read valid entries.
public int[] successors(long vertexId) {
    final int id = (int) vertexId;
    return graph.successorArray(id);
}

/**
 * Looks up the successors of the vertex with the given label in {@code graph}.
 *
 * @param vertexLabel label resolved to an ID via {@code vertexLabelToId}
 * @return the array handed back by {@code graph.successorArray}
 */
public int[] successors(String vertexLabel) {
    final int id = (int) vertexLabelToId(vertexLabel);
    return graph.successorArray(id);
}

/**
 * Streams the labels of the successors of the labelled vertex in {@code graph}.
 *
 * @param vertexLabel label resolved to an ID via {@code vertexLabelToId}
 * @return stream of successor labels
 */
public Stream<String> successorStream(String vertexLabel) {
    final long vertexId = vertexLabelToId(vertexLabel);
    return successorStream(graph, vertexId);
}

/**
 * Streams the IDs of the successors of the labelled vertex in {@code graph}.
 *
 * @param vertexLabel label resolved to an ID via {@code vertexLabelToId}
 * @return stream of successor IDs
 */
public IntStream successorIntStream(String vertexLabel) {
    final long vertexId = vertexLabelToId(vertexLabel);
    return successorIntStream(graph, vertexId);
}

/**
 * Streams the labels of those successors of the labelled vertex that fall into
 * the interval {@code vertexMap} reports for {@code prefix}.
 * {@code vertexMap} is dereferenced without a null check.
 *
 * @param vertexLabel label resolved to an ID via {@code vertexLabelToId}
 * @param prefix      label prefix passed to {@code vertexMap.getInterval}
 * @return stream of matching successor labels
 */
public Stream<String> successorStream(String vertexLabel, String prefix) {
    final long vertexId = vertexLabelToId(vertexLabel);
    final Interval prefixInterval = vertexMap.getInterval(prefix);
    return successorStream(graph, vertexId, prefixInterval);
}

/**
 * Streams the IDs of those successors of the labelled vertex that fall into
 * the interval {@code vertexMap} reports for {@code prefix}.
 * {@code vertexMap} is dereferenced without a null check.
 *
 * @param vertexLabel label resolved to an ID via {@code vertexLabelToId}
 * @param prefix      label prefix passed to {@code vertexMap.getInterval}
 * @return stream of matching successor IDs
 */
public IntStream successorIntStream(String vertexLabel, String prefix) {
    final long vertexId = vertexLabelToId(vertexLabel);
    final Interval prefixInterval = vertexMap.getInterval(prefix);
    return successorIntStream(graph, vertexId, prefixInterval);
}

/**
 * Counts the top-level domains among the successors of the labelled vertex in
 * {@code graph}.
 *
 * @param vertexLabel label resolved to an ID via {@code vertexLabelToId}
 * @return stream of pairs {@code <top-level domain, count>}
 */
public Stream<Entry<String, Long>> successorTopLevelDomainCounts(String vertexLabel) {
    final long vertexId = vertexLabelToId(vertexLabel);
    return successorTopLevelDomainCounts(graph, vertexId);
}

/**
 * Streams the labels of the successors of a vertex in the given graph.
 *
 * @param graph    graph to query (shadows the field of the same name)
 * @param vertexId ID of the vertex
 * @return stream of successor labels, mapped via {@code vertexIdToLabel}
 */
public Stream<String> successorStream(ImmutableGraph graph, long vertexId) {
    final IntStream successorIds = successorIntStream(graph, vertexId);
    return successorIds.mapToObj(id -> vertexIdToLabel(id));
}

/**
 * Streams the IDs of the successors of a vertex in the given graph.
 * <p>
 * WebGraph documents that the array returned by {@code successorArray} may be
 * longer than the outdegree and that only the first {@code outdegree} entries
 * are valid, so the stream is limited to that range.
 *
 * @param graph    graph to query (shadows the field of the same name)
 * @param vertexId ID of the vertex; narrowed to {@code int} before the lookup
 * @return stream of successor IDs
 */
public IntStream successorIntStream(ImmutableGraph graph, long vertexId) {
    final int id = (int) vertexId;
    final int[] successors = graph.successorArray(id);
    return Arrays.stream(successors, 0, graph.outdegree(id));
}

/**
 * Streams the labels of those successors of a vertex whose IDs fall into the
 * given interval.
 *
 * @param graph    graph to query (shadows the field of the same name)
 * @param vertexId ID of the vertex
 * @param interval interval of vertex IDs to keep
 * @return stream of matching successor labels, mapped via {@code vertexIdToLabel}
 */
private Stream<String> successorStream(ImmutableGraph graph, long vertexId, Interval interval) {
    final IntStream matchingIds = successorIntStream(graph, vertexId, interval);
    return matchingIds.mapToObj(id -> vertexIdToLabel(id));
}

/**
 * Streams the IDs of those successors of a vertex for which
 * {@code interval.compareTo(id) == 0}.
 * <p>
 * WebGraph documents that the array returned by {@code successorArray} may be
 * longer than the outdegree and that only the first {@code outdegree} entries
 * are valid, so only that range is inspected.
 *
 * @param graph    graph to query (shadows the field of the same name)
 * @param vertexId ID of the vertex; narrowed to {@code int} before the lookup
 * @param interval interval of vertex IDs to keep
 * @return stream of matching successor IDs
 */
public IntStream successorIntStream(ImmutableGraph graph, long vertexId, Interval interval) {
    final int id = (int) vertexId;
    final int[] successors = graph.successorArray(id);
    return Arrays.stream(successors, 0, graph.outdegree(id)).filter(x -> (interval.compareTo(x) == 0));
}

/**
 * Streams the top-level domain of every successor of a vertex, one element per
 * successor (duplicates are kept).
 * <p>
 * WebGraph documents that the array returned by {@code successorArray} may be
 * longer than the outdegree and that only the first {@code outdegree} entries
 * are valid, so the stream is limited to that range.
 *
 * @param graph    graph to query (shadows the field of the same name)
 * @param vertexId ID of the vertex; narrowed to {@code int} before the lookup
 * @return stream of top-level domains, obtained by applying
 *         {@code getTopLevelDomain} to each successor label
 */
public Stream<String> successorTopLevelDomainStream(ImmutableGraph graph, long vertexId) {
    final int id = (int) vertexId;
    final int[] successors = graph.successorArray(id);
    return Arrays.stream(successors, 0, graph.outdegree(id))
            .mapToObj(i -> getTopLevelDomain(vertexIdToLabel(i)));
}

/**
 * Counts the top-level domains among the successors of a vertex and returns
 * the pairs sorted by decreasing count.
 * <p>
 * Without a prefix map ({@code vertexMap == null}) the counting is delegated to
 * {@code GraphExplorer.frequencies}. With a prefix map, the successors are
 * scanned once: for the first successor of each run the interval of its
 * top-level-domain prefix is looked up, and all following successors up to the
 * interval's right bound are counted into the same top-level domain.
 *
 * @param graph    graph to query (shadows the field of the same name)
 * @param vertexId ID of the vertex; narrowed to {@code int} before the lookup
 * @return stream of pairs {@code <top-level domain, count>}
 */
public Stream<Entry<String, Long>> successorTopLevelDomainCounts(ImmutableGraph graph, long vertexId) {
    if (vertexMap == null) {
        return GraphExplorer.frequencies(successorTopLevelDomainStream(graph, vertexId));
    }
    /*
     * Fast path: vertex labels are lexicographically sorted by reversed domain
     * name, so all vertices of one top-level domain share one ID interval.
     */
    final List<Entry<String, Long>> counts = new LinkedList<>();
    final LazyIntIterator successorIds = graph.successors((int) vertexId);
    int first = successorIds.nextInt();
    while (first != LAZY_INT_ITERATOR_EMPTY_VALUE) {
        final MutableString label = vertexMap.list().get(first);
        final int dot = label.indexOf('.');
        // a usable TLD needs a dot that is followed by at least one character
        final boolean hasTld = dot > -1 && (dot + 1) < label.length();
        // NOTE(review): for a label without such a dot the whole label is used
        // as prefix; presumably its interval may also cover longer labels that
        // merely start with the same characters - confirm this cannot occur.
        final MutableString tldPrefix = hasTld ? label.substring(0, dot + 1) : label;
        final String tld = hasTld ? tldPrefix.substring(0, dot).toString() : label.toString();
        final Interval tldInterval = vertexMap.getInterval(tldPrefix);
        long occurrences = 1;
        int following = successorIds.nextInt();
        while (following != LAZY_INT_ITERATOR_EMPTY_VALUE && following <= tldInterval.right) {
            occurrences++;
            following = successorIds.nextInt();
        }
        counts.add(new SimpleEntry<>(tld, occurrences));
        // either the first successor of the next TLD or the end marker
        first = following;
    }
    return counts.stream().sorted(Collections.reverseOrder(Map.Entry.comparingByValue()));
}

/**
 * Counts the top-level domains of the given vertices and returns the pairs
 * sorted by decreasing count.
 * <p>
 * Without a prefix map ({@code vertexMap == null}) the counting is delegated to
 * {@code GraphExplorer.frequencies}. With a prefix map, the IDs are scanned
 * once: for the first ID of each run the interval of its top-level-domain
 * prefix is looked up, and all following IDs up to the interval's right bound
 * are counted into the same top-level domain.
 * <p>
 * The end of the input is tracked explicitly. Previously, when the last run
 * held more than one ID, the last ID was processed a second time and produced
 * a spurious extra entry with count 1.
 *
 * @param vertexIds stream of vertex IDs; the fast path only compares each ID
 *                  against the right bound of the current interval, so it
 *                  presumably expects IDs in ascending order - TODO confirm
 *                  with callers
 * @return stream of pairs {@code <top-level domain, count>}
 */
public Stream<Entry<String, Long>> topLevelDomainCounts(IntStream vertexIds) {
    if (vertexMap == null) {
        return GraphExplorer.frequencies(vertexIds.mapToObj(i -> Graph.getTopLevelDomain(vertexIdToLabel(i))));
    }
    final List<Entry<String, Long>> res = new LinkedList<>();
    final PrimitiveIterator.OfInt iter = vertexIds.iterator();
    // true while "curr" holds an ID that has not been counted yet
    boolean pending = iter.hasNext();
    int curr = pending ? iter.nextInt() : -1;
    while (pending) {
        final MutableString currLabel = vertexMap.list().get(curr);
        final int pos = currLabel.indexOf('.');
        final MutableString tldPrefix;
        final String tld;
        if (pos > -1 && (pos + 1) < currLabel.length()) {
            tldPrefix = currLabel.substring(0, pos + 1);
            tld = tldPrefix.substring(0, pos).toString();
        } else {
            tldPrefix = currLabel;
            tld = currLabel.toString();
        }
        long count = 1;
        final Interval interval = vertexMap.getInterval(tldPrefix);
        pending = false;
        while (iter.hasNext()) {
            final int next = iter.nextInt();
            if (next > interval.right) {
                // first ID beyond the current TLD: it starts the next run
                curr = next;
                pending = true;
                break;
            }
            count++;
        }
        res.add(new SimpleEntry<>(tld, count));
    }
    return res.stream().sorted(Collections.reverseOrder(Map.Entry.comparingByValue()));
}

/**
 * Looks up the predecessors of a vertex, i.e. its successors in {@code graphT}.
 *
 * @param vertexId vertex ID; narrowed to {@code int} before the lookup
 * @return the array handed back by {@code graphT.successorArray}
 */
// NOTE(review): WebGraph's successorArray contract appears to allow arrays
// longer than the outdegree - confirm callers only read valid entries.
public int[] predecessors(long vertexId) {
    final int id = (int) vertexId;
    return graphT.successorArray(id);
}

/**
 * Looks up the predecessors of the labelled vertex, i.e. its successors in
 * {@code graphT}.
 *
 * @param vertexLabel label resolved to an ID via {@code vertexLabelToId}
 * @return the array handed back by {@code graphT.successorArray}
 */
public int[] predecessors(String vertexLabel) {
    final int id = (int) vertexLabelToId(vertexLabel);
    return graphT.successorArray(id);
}

/**
 * Streams the labels of the predecessors of the labelled vertex, i.e. of its
 * successors in {@code graphT}.
 *
 * @param vertexLabel label resolved to an ID via {@code vertexLabelToId}
 * @return stream of predecessor labels
 */
public Stream<String> predecessorStream(String vertexLabel) {
    final long vertexId = vertexLabelToId(vertexLabel);
    return successorStream(graphT, vertexId);
}

/**
 * Streams the IDs of the predecessors of the labelled vertex, i.e. of its
 * successors in {@code graphT}.
 *
 * @param vertexLabel label resolved to an ID via {@code vertexLabelToId}
 * @return stream of predecessor IDs
 */
public IntStream predecessorIntStream(String vertexLabel) {
    final long vertexId = vertexLabelToId(vertexLabel);
    return successorIntStream(graphT, vertexId);
}

/**
 * Streams the labels of those predecessors of the labelled vertex that fall
 * into the interval {@code vertexMap} reports for {@code prefix}.
 * {@code vertexMap} is dereferenced without a null check.
 *
 * @param vertexLabel label resolved to an ID via {@code vertexLabelToId}
 * @param prefix      label prefix passed to {@code vertexMap.getInterval}
 * @return stream of matching predecessor labels
 */
public Stream<String> predecessorStream(String vertexLabel, String prefix) {
    final long vertexId = vertexLabelToId(vertexLabel);
    final Interval prefixInterval = vertexMap.getInterval(prefix);
    return successorStream(graphT, vertexId, prefixInterval);
}

/**
 * Streams the IDs of those predecessors of the labelled vertex that fall into
 * the interval {@code vertexMap} reports for {@code prefix}.
 * {@code vertexMap} is dereferenced without a null check.
 *
 * @param vertexLabel label resolved to an ID via {@code vertexLabelToId}
 * @param prefix      label prefix passed to {@code vertexMap.getInterval}
 * @return stream of matching predecessor IDs
 */
public IntStream predecessorIntStream(String vertexLabel, String prefix) {
    final long vertexId = vertexLabelToId(vertexLabel);
    final Interval prefixInterval = vertexMap.getInterval(prefix);
    return successorIntStream(graphT, vertexId, prefixInterval);
}

/**
 * Counts the top-level domains among the predecessors of the labelled vertex,
 * i.e. among its successors in {@code graphT}.
 *
 * @param vertexLabel label resolved to an ID via {@code vertexLabelToId}
 * @return stream of pairs {@code <top-level domain, count>}
 */
public Stream<Entry<String, Long>> predecessorTopLevelDomainCounts(String vertexLabel) {
    final long vertexId = vertexLabelToId(vertexLabel);
    return successorTopLevelDomainCounts(graphT, vertexId);
}

public long[] sharedPredecessors(long[] vertices) {
return sharedPredecessors(vertices, vertices.length, vertices.length);
}
Expand Down Expand Up @@ -169,4 +335,12 @@ public long[] sharedSuccessors(ImmutableGraph graph, long[] vertices, int minSha
res.trim();
return res.elements();
}

/**
 * Extracts the top-level domain from a reversed domain name, i.e. the part
 * before the first dot ({@code "com.example.www"} yields {@code "com"}).
 * <p>
 * A name without any dot is returned unchanged. The previous check
 * ({@code dot < length}) was always true, so such names ended in a
 * {@link StringIndexOutOfBoundsException} from {@code substring(0, -1)}.
 *
 * @param reversedDomainName domain name in reversed notation, must not be
 *                           {@code null}
 * @return the substring before the first dot, or the whole input if it
 *         contains no dot
 */
public static String getTopLevelDomain(String reversedDomainName) {
    final int dot = reversedDomainName.indexOf('.');
    if (dot < 0) {
        // no dot at all: the name consists of a single component
        return reversedDomainName;
    }
    return reversedDomainName.substring(0, dot);
}
}
43 changes: 37 additions & 6 deletions src/main/java/org/commoncrawl/webgraph/explore/GraphExplorer.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Map.Entry;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import org.commoncrawl.webgraph.CountingMergedIntIterator;
Expand Down Expand Up @@ -46,11 +50,11 @@ public String toString() {
}

public int outdegree() {
return g.graph.outdegree((int) id);
return g.outdegree((int) id);
}

public int indegree() {
return g.graphT.outdegree((int) id);
return g.indegree((int) id);
}

public int[] successors() {
Expand Down Expand Up @@ -89,7 +93,6 @@ public void setVertex(long vertexId) {
v = getVertex(vertexId);
}


/* Reimplementation of commands provided by pywebgraph (cn, pwn, ls, sl) */

/**
Expand Down Expand Up @@ -178,7 +181,6 @@ public void sl(String vertexLabel) {
sl(g.vertexLabelToId(vertexLabel));
}


/* Utilities */

public long[] loadVerticesFromFile(String fileName) {
Expand All @@ -195,7 +197,21 @@ public void saveVerticesToFile(long[] vertexIDs, String fileName) {
StandardCharsets.UTF_8)) {
Arrays.stream(vertexIDs).forEach(id -> out.println(g.vertexIdToLabel(id)));
} catch (IOException e) {
LOG.error("Failed to load vertices from file {}", fileName, e);
LOG.error("Failed to write vertices to file {}", fileName, e);
}
}

/**
 * Writes string-count pairs to a file, one pair per line in the form
 * {@code <count> TAB <string> LF}, encoded as UTF-8.
 * <p>
 * Failures are logged, not thrown. A {@link PrintStream} does not throw
 * {@link IOException} on write errors, so its error flag is checked after
 * writing; otherwise a partially written file would go unnoticed.
 *
 * @param counts   stream of pairs {@code <string, count>}; consumed by this call
 * @param fileName path of the file to write
 */
public void saveCountsToFile(Stream<Entry<String, Long>> counts, String fileName) {
    try (PrintStream out = new PrintStream(Files.newOutputStream(Paths.get(fileName)), false,
            StandardCharsets.UTF_8)) {
        counts.forEach(c -> {
            out.print(c.getValue());
            out.print('\t');
            out.print(c.getKey());
            out.print('\n');
        });
        // checkError() flushes the stream and reports errors swallowed by print()
        if (out.checkError()) {
            LOG.error("Failed to write counts to file {}", fileName);
        }
    } catch (IOException e) {
        LOG.error("Failed to write counts to file {}", fileName, e);
    }
}

Expand All @@ -206,7 +222,7 @@ private void print(String s) {
public void printVertices(LazyIntIterator it) {
int next = it.nextInt();
int i = 0;
while (next != CountingMergedIntIterator.EMPTY_INPUT_ITERATOR_VALUE) {
while (next != CountingMergedIntIterator.LAZY_INT_ITERATOR_EMPTY_VALUE) {
print(String.format("%d: %s", i, (new Vertex(next)).toString()));
next = it.nextInt();
i++;
Expand All @@ -228,4 +244,19 @@ public void printVertices(int[] vertexIDs) {
i++;
}
}

/**
 * Tallies how often each string occurs in a stream. The resulting pairs are
 * ordered by count, highest first; pairs with equal counts are ordered by
 * their string in natural (lexicographic) order.
 *
 * @param strings stream of strings to count; consumed by this call
 * @return stream of pairs {@code <string, count>}
 */
public static Stream<Entry<String, Long>> frequencies(Stream<String> strings) {
    final Comparator<Entry<String, Long>> byCountDescending = Entry.<String, Long>comparingByValue().reversed();
    final Comparator<Entry<String, Long>> byCountThenString = byCountDescending
            .thenComparing(Entry.<String, Long>comparingByKey());
    return strings.collect(Collectors.groupingBy(Function.identity(), Collectors.counting()))
            .entrySet()
            .stream()
            .sorted(byCountThenString);
}
}

0 comments on commit 153db36

Please sign in to comment.