Add successor/predecessor streams and top-level-domain counts to Graph.

* CountingMergedIntIterator: rename EMPTY_INPUT_ITERATOR_VALUE to
  LAZY_INT_ITERATOR_EMPTY_VALUE and declare it final.
* Graph: add successors/predecessors accessors (arrays, streams, streams
  restricted to a label prefix) and counts of successors/predecessors per
  top-level domain. getTopLevelDomain returns the whole name when it
  contains no dot. topLevelDomainCounts emits exactly one entry per
  top-level domain, also when the input ends inside the last domain.
* GraphExplorer: Vertex delegates outdegree/indegree to Graph, add
  saveCountsToFile and frequencies, fix the error message of
  saveVerticesToFile.

diff --git a/src/main/java/org/commoncrawl/webgraph/CountingMergedIntIterator.java b/src/main/java/org/commoncrawl/webgraph/CountingMergedIntIterator.java
index 1dd5538..328e9cd 100644
--- a/src/main/java/org/commoncrawl/webgraph/CountingMergedIntIterator.java
+++ b/src/main/java/org/commoncrawl/webgraph/CountingMergedIntIterator.java
@@ -43,7 +43,7 @@ public int compareTo(QueuedIterator o) {
 		}
 	}
 
-	public static int EMPTY_INPUT_ITERATOR_VALUE = LazyIntIterators.EMPTY_ITERATOR.nextInt();
+	public static final int LAZY_INT_ITERATOR_EMPTY_VALUE = LazyIntIterators.EMPTY_ITERATOR.nextInt();
 
 	private final PriorityQueue<QueuedIterator> iters = new PriorityQueue<>();
 	private int currentCount = 0;
@@ -54,7 +54,7 @@ public int compareTo(QueuedIterator o) {
 	public CountingMergedIntIterator(LazyIntIterator... iterators) {
 		for (final LazyIntIterator iter : iterators) {
 			final QueuedIterator qiter = new QueuedIterator(iter);
-			if (qiter.value != EMPTY_INPUT_ITERATOR_VALUE) {
+			if (qiter.value != LAZY_INT_ITERATOR_EMPTY_VALUE) {
 				iters.add(qiter);
 			}
 		}
@@ -93,7 +93,7 @@ public int nextInt() {
 			while ((val = qiter.iter.nextInt()) == value) {
 				count++;
 			}
-			if (val != EMPTY_INPUT_ITERATOR_VALUE) {
+			if (val != LAZY_INT_ITERATOR_EMPTY_VALUE) {
 				qiter.value = val;
 				iters.add(qiter);
 			}
diff --git a/src/main/java/org/commoncrawl/webgraph/explore/Graph.java b/src/main/java/org/commoncrawl/webgraph/explore/Graph.java
index a71bd71..3cdc15d 100644
--- a/src/main/java/org/commoncrawl/webgraph/explore/Graph.java
+++ b/src/main/java/org/commoncrawl/webgraph/explore/Graph.java
@@ -7,6 +7,16 @@
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Paths;
+import java.util.AbstractMap.SimpleEntry;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.PrimitiveIterator;
+import java.util.stream.IntStream;
+import java.util.stream.Stream;
 
 import org.commoncrawl.webgraph.CountingMergedIntIterator;
 import org.slf4j.Logger;
@@ -14,12 +24,15 @@
 
 import it.unimi.dsi.fastutil.io.BinIO;
 import it.unimi.dsi.fastutil.longs.LongArrayList;
+import it.unimi.dsi.lang.MutableString;
 import it.unimi.dsi.sux4j.mph.GOV4Function;
 import it.unimi.dsi.util.FrontCodedStringList;
 import it.unimi.dsi.util.ImmutableExternalPrefixMap;
+import it.unimi.dsi.util.Interval;
 import it.unimi.dsi.util.ShiftAddXorSignedStringMap;
 import it.unimi.dsi.webgraph.ImmutableGraph;
 import it.unimi.dsi.webgraph.LazyIntIterator;
+import it.unimi.dsi.webgraph.LazyIntIterators;
 
 /**
  * Holds webgraph-related data structures and access methods for graph
@@ -42,6 +55,8 @@ public class Graph {
 	protected ShiftAddXorSignedStringMap vertexMapSmph;
 	protected GOV4Function<String> vertexMapMph;
 
+	private static final int LAZY_INT_ITERATOR_EMPTY_VALUE = LazyIntIterators.EMPTY_ITERATOR.nextInt();
+
 	public Graph(String name) throws Exception {
 		this.name = name;
 		try {
@@ -113,6 +128,157 @@ public int indegree(String vertexLabel) {
 		return graphT.outdegree((int) vertexLabelToId(vertexLabel));
 	}
 
+	public int[] successors(long vertexId) {
+		return graph.successorArray((int) vertexId);
+	}
+
+	public int[] successors(String vertexLabel) {
+		return graph.successorArray((int) vertexLabelToId(vertexLabel));
+	}
+
+	public Stream<String> successorStream(String vertexLabel) {
+		return successorStream(graph, vertexLabelToId(vertexLabel));
+	}
+
+	public IntStream successorIntStream(String vertexLabel) {
+		return successorIntStream(graph, vertexLabelToId(vertexLabel));
+	}
+
+	public Stream<String> successorStream(String vertexLabel, String prefix) {
+		return successorStream(graph, vertexLabelToId(vertexLabel), vertexMap.getInterval(prefix));
+	}
+
+	public IntStream successorIntStream(String vertexLabel, String prefix) {
+		return successorIntStream(graph, vertexLabelToId(vertexLabel), vertexMap.getInterval(prefix));
+	}
+
+	public Stream<Entry<String, Long>> successorTopLevelDomainCounts(String vertexLabel) {
+		return successorTopLevelDomainCounts(graph, vertexLabelToId(vertexLabel));
+	}
+
+	public Stream<String> successorStream(ImmutableGraph graph, long vertexId) {
+		return successorIntStream(graph, vertexId).mapToObj(i -> vertexIdToLabel(i));
+	}
+
+	public IntStream successorIntStream(ImmutableGraph graph, long vertexId) {
+		return Arrays.stream(graph.successorArray((int) vertexId));
+	}
+
+	private Stream<String> successorStream(ImmutableGraph graph, long vertexId, Interval interval) {
+		return successorIntStream(graph, vertexId, interval).mapToObj(i -> vertexIdToLabel(i));
+	}
+
+	public IntStream successorIntStream(ImmutableGraph graph, long vertexId, Interval interval) {
+		return Arrays.stream(graph.successorArray((int) vertexId)).filter(x -> (interval.compareTo(x) == 0));
+	}
+
+	public Stream<String> successorTopLevelDomainStream(ImmutableGraph graph, long vertexId) {
+		return Arrays.stream(graph.successorArray((int) vertexId)).mapToObj(i -> getTopLevelDomain(vertexIdToLabel(i)));
+	}
+
+	public Stream<Entry<String, Long>> successorTopLevelDomainCounts(ImmutableGraph graph, long vertexId) {
+		if (vertexMap != null) {
+			/*
+			 * speed up if we have a prefix map, utilizing the fact that vertex labels are
+			 * lexicographically sorted by reversed domain name
+			 */
+			List<Entry<String, Long>> res = new LinkedList<>();
+			LazyIntIterator iter = graph.successors((int) vertexId);
+			int curr = iter.nextInt();
+			while (curr != LAZY_INT_ITERATOR_EMPTY_VALUE) {
+				final MutableString currLabel = vertexMap.list().get(curr);
+				final int pos = currLabel.indexOf('.');
+				final MutableString tldPrefix;
+				final String tld;
+				if (pos > -1 && (pos + 1) < currLabel.length()) {
+					tldPrefix = currLabel.substring(0, pos + 1);
+					tld = tldPrefix.substring(0, pos).toString();
+				} else {
+					tldPrefix = currLabel;
+					tld = currLabel.toString();
+				}
+				long count = 1;
+				final Interval interval = vertexMap.getInterval(tldPrefix);
+				int next;
+				while ((next = iter.nextInt()) != LAZY_INT_ITERATOR_EMPTY_VALUE) {
+					if (next > interval.right) {
+						break;
+					}
+					count++;
+				}
+				curr = next;
+				res.add(new SimpleEntry<>(tld, count));
+			}
+			return res.stream().sorted(Collections.reverseOrder(Map.Entry.comparingByValue()));
+		}
+		return GraphExplorer.frequencies(successorTopLevelDomainStream(graph, vertexId));
+	}
+
+	public Stream<Entry<String, Long>> topLevelDomainCounts(IntStream vertexIds) {
+		if (vertexMap != null) {
+			List<Entry<String, Long>> res = new LinkedList<>();
+			PrimitiveIterator.OfInt iter = vertexIds.iterator();
+			if (iter.hasNext()) {
+				int curr = iter.nextInt();
+				do {
+					final MutableString currLabel = vertexMap.list().get(curr);
+					final int pos = currLabel.indexOf('.');
+					final MutableString tldPrefix;
+					final String tld;
+					if (pos > -1 && (pos + 1) < currLabel.length()) {
+						tldPrefix = currLabel.substring(0, pos + 1);
+						tld = tldPrefix.substring(0, pos).toString();
+					} else {
+						tldPrefix = currLabel;
+						tld = currLabel.toString();
+					}
+					long count = 1;
+					final Interval interval = vertexMap.getInterval(tldPrefix);
+					int next = -1;
+					while (iter.hasNext()) {
+						next = iter.nextInt();
+						if (next > interval.right) {
+							break;
+						}
+						count++;
+					}
+					curr = (next > interval.right) ? next : -1; // -1: input exhausted, last id already counted
+					res.add(new SimpleEntry<>(tld, count));
+				} while (curr > -1);
+			}
+			return res.stream().sorted(Collections.reverseOrder(Map.Entry.comparingByValue()));
+		}
+		return GraphExplorer.frequencies(vertexIds.mapToObj(i -> Graph.getTopLevelDomain(vertexIdToLabel(i))));
+	}
+
+	public int[] predecessors(long vertexId) {
+		return graphT.successorArray((int) vertexId);
+	}
+
+	public int[] predecessors(String vertexLabel) {
+		return graphT.successorArray((int) vertexLabelToId(vertexLabel));
+	}
+
+	public Stream<String> predecessorStream(String vertexLabel) {
+		return successorStream(graphT, vertexLabelToId(vertexLabel));
+	}
+
+	public IntStream predecessorIntStream(String vertexLabel) {
+		return successorIntStream(graphT, vertexLabelToId(vertexLabel));
+	}
+
+	public Stream<String> predecessorStream(String vertexLabel, String prefix) {
+		return successorStream(graphT, vertexLabelToId(vertexLabel), vertexMap.getInterval(prefix));
+	}
+
+	public IntStream predecessorIntStream(String vertexLabel, String prefix) {
+		return successorIntStream(graphT, vertexLabelToId(vertexLabel), vertexMap.getInterval(prefix));
+	}
+
+	public Stream<Entry<String, Long>> predecessorTopLevelDomainCounts(String vertexLabel) {
+		return successorTopLevelDomainCounts(graphT, vertexLabelToId(vertexLabel));
+	}
+
 	public long[] sharedPredecessors(long[] vertices) {
 		return sharedPredecessors(vertices, vertices.length, vertices.length);
 	}
@@ -169,4 +335,12 @@ public long[] sharedSuccessors(ImmutableGraph graph, long[] vertices, int minSha
 		res.trim();
 		return res.elements();
 	}
+
+	public static String getTopLevelDomain(String reversedDomainName) {
+		int dot = reversedDomainName.indexOf('.');
+		if (dot > -1) { // without a dot the whole name is the top-level domain
+			return reversedDomainName.substring(0, dot);
+		}
+		return reversedDomainName;
+	}
 }
diff --git a/src/main/java/org/commoncrawl/webgraph/explore/GraphExplorer.java b/src/main/java/org/commoncrawl/webgraph/explore/GraphExplorer.java
index 9f96fe1..5e63390 100644
--- a/src/main/java/org/commoncrawl/webgraph/explore/GraphExplorer.java
+++ b/src/main/java/org/commoncrawl/webgraph/explore/GraphExplorer.java
@@ -10,6 +10,10 @@
 import java.nio.file.Files;
 import java.nio.file.Paths;
 import java.util.Arrays;
+import java.util.Comparator;
+import java.util.Map.Entry;
+import java.util.function.Function;
+import java.util.stream.Collectors;
 import java.util.stream.Stream;
 
 import org.commoncrawl.webgraph.CountingMergedIntIterator;
@@ -46,11 +50,11 @@ public String toString() {
 		}
 
 		public int outdegree() {
-			return g.graph.outdegree((int) id);
+			return g.outdegree((int) id);
 		}
 
 		public int indegree() {
-			return g.graphT.outdegree((int) id);
+			return g.indegree((int) id);
 		}
 
 		public int[] successors() {
@@ -89,7 +93,6 @@ public void setVertex(long vertexId) {
 		v = getVertex(vertexId);
 	}
 
-
 	/* Reimplementation of commands provided by pywebgraph (cn, pwn, ls, sl) */
 
 	/**
@@ -178,7 +181,6 @@ public void sl(String vertexLabel) {
 		sl(g.vertexLabelToId(vertexLabel));
 	}
 
-
 	/* Utilities */
 
 	public long[] loadVerticesFromFile(String fileName) {
@@ -195,7 +197,21 @@ public void saveVerticesToFile(long[] vertexIDs, String fileName) {
 				StandardCharsets.UTF_8)) {
 			Arrays.stream(vertexIDs).forEach(id -> out.println(g.vertexIdToLabel(id)));
 		} catch (IOException e) {
-			LOG.error("Failed to load vertices from file {}", fileName, e);
+			LOG.error("Failed to write vertices to file {}", fileName, e);
+		}
+	}
+
+	public void saveCountsToFile(Stream<Entry<String, Long>> counts, String fileName) {
+		try (PrintStream out = new PrintStream(Files.newOutputStream(Paths.get(fileName)), false,
+				StandardCharsets.UTF_8)) {
+			counts.forEach(c -> {
+				out.print(c.getValue());
+				out.print('\t');
+				out.print(c.getKey());
+				out.print('\n');
+			});
+		} catch (IOException e) {
+			LOG.error("Failed to write counts to file {}", fileName, e);
 		}
 	}
 
@@ -206,7 +222,7 @@ private void print(String s) {
 	public void printVertices(LazyIntIterator it) {
 		int next = it.nextInt();
 		int i = 0;
-		while (next != CountingMergedIntIterator.EMPTY_INPUT_ITERATOR_VALUE) {
+		while (next != CountingMergedIntIterator.LAZY_INT_ITERATOR_EMPTY_VALUE) {
 			print(String.format("%d: %s", i, (new Vertex(next)).toString()));
 			next = it.nextInt();
 			i++;
@@ -228,4 +244,19 @@ public void printVertices(int[] vertexIDs) {
 			i++;
 		}
 	}
+
+	/**
+	 * Count strings in a stream. Sort the resulting string-count pairs by
+	 * decreasing count (frequency) and secondarily by string in lexicographic
+	 * order.
+	 *
+	 * @param strings stream of strings
+	 * @return stream of pairs {@code <string, count>}
+	 */
+	public static Stream<Entry<String, Long>> frequencies(Stream<String> strings) {
+		final Comparator<Entry<String, Long>> comp = Comparator.comparingLong((Entry<String, Long> e) -> e.getValue())
+				.reversed().thenComparing(Comparator.comparing((Entry<String, Long> e) -> e.getKey()));
+		return strings.collect(Collectors.groupingBy(Function.identity(), Collectors.counting())).entrySet().stream()
+				.sorted(comp);
+	}
 }