internal/devtools/cmd/rmdoc: delete crawled pages from corpus

Gaby splits each crawled webpage into docs for embedding, computes their embeddings, and stores them in the vector db. This tool deletes all the docs and their embeddings. It is meant to be run after the webpage is excluded from crawling with Crawler.Deny. For #63 Change-Id: I095a65b9a834ccf48062facc3654f40b43562e15 Reviewed-on: https://go-review.googlesource.com/c/oscar/+/635176 LUCI-TryBot-Result: Go LUCI <[email protected]> Reviewed-by: Jonathan Amsterdam <[email protected]>
golang · Dec 15, 2024 · e238ae9 · e238ae9
1 parent 5e25bc0
commit e238ae9
Show file tree

Hide file tree

Showing 3 changed files with 151 additions and 1 deletion.
diff --git a/internal/devtools/cmd/rmdoc/main.go b/internal/devtools/cmd/rmdoc/main.go
@@ -0,0 +1,126 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// rmdoc deletes the documents from the corpus (including the vector db).
+//
+//	Usage:  go run . -project oscar-go-1 -firestoredb devel https://go.dev/x/y/z
+package main
+
+import (
+	"context"
+	"flag"
+	"fmt"
+	"log"
+	"log/slog"
+	"strings"
+
+	"cloud.google.com/go/compute/metadata"
+	"golang.org/x/oscar/internal/dbspec"
+	"golang.org/x/oscar/internal/docs"
+	"golang.org/x/oscar/internal/gcp/firestore"
+	"golang.org/x/oscar/internal/pebble"
+	"golang.org/x/oscar/internal/storage"
+)
+
+var flags = struct {
+	project     string
+	firestoredb string
+	overlay     string // NOTE(review): no -overlay flag is registered in init and nothing visible here reads it — confirm it is needed
+}{}
+
+func init() {
+	flag.StringVar(&flags.project, "project", "", "name of the Google Cloud Project") // optional: when empty, initGCP looks the project up via the metadata server
+	flag.StringVar(&flags.firestoredb, "firestoredb", "", "name of the firestore db")
+}
+
+var logger = slog.Default() // handed to docs.New and to the DB constructors in openDB
+
+func main() {
+	flag.Parse()
+
+	args := flag.Args() // each positional argument is a doc ID prefix (a URL) to delete under
+	if len(args) == 0 {
+		log.Fatal("no args")
+	}
+
+	gabyDB, gabyVectorDB := initGCP()
+	corpus := docs.New(logger, gabyDB)
+
+	for _, url := range args {
+		if !strings.HasPrefix(url, "https://go.dev/") { // only go.dev URLs are accepted; anything else is logged and skipped
+			log.Println("ignoring unrecognized url:", url)
+			continue
+		}
+
+		// TODO: do we need to delete crawl.Page entries too?
+
+		for doc := range corpus.Docs(url) {
+			hasVector := " " // becomes "*" when the doc also has an entry in the vector db
+			if _, ok := gabyVectorDB.Get(doc.ID); ok {
+				hasVector = "*"
+			}
+			fmt.Printf("%v %v", hasVector, doc.ID)
+
+			fmt.Printf(" delete (y/N)? ")
+			var a string
+			fmt.Scanln(&a) // error is not checked; a bare Enter leaves a empty, which counts as "no"
+			if answer := strings.ToLower(strings.TrimSpace(a)); answer == "y" || answer == "yes" {
+				gabyVectorDB.Delete(doc.ID)
+				corpus.Delete(doc.ID)
+				if _, ok := gabyVectorDB.Get(doc.ID); ok { // sanity check: the vector entry must be gone after Delete
+					log.Fatalf("error - %v not removed from vector db", doc.ID)
+				}
+				fmt.Print(" ↪ deleted")
+			} else {
+				fmt.Print(" ↪ skipped")
+			}
+			fmt.Println()
+		}
+	}
+}
+
+func initGCP() (storage.DB, storage.VectorDB) { // opens the firestore DB and its vector DB; any failure exits via log.Fatal
+	ctx := context.TODO()
+
+	if flags.project == "" { // no -project flag given: fall back to the project ID reported by the metadata server
+		projectID, err := metadata.ProjectIDWithContext(ctx)
+		if err != nil {
+			log.Fatalf("metadata project ID: %v", err)
+		}
+		if projectID == "" {
+			log.Fatal("project ID from metadata is empty")
+		}
+		flags.project = projectID
+	}
+
+	db, err := openDB(&dbspec.Spec{
+		Kind:     "firestore",
+		Location: flags.project,
+		Name:     flags.firestoredb,
+	})
+	if err != nil {
+		log.Fatal(err)
+	}
+
+	const vectorDBNamespace = "gaby"
+	vdb, err := firestore.NewVectorDB(ctx, slog.Default(), flags.project, flags.firestoredb, vectorDBNamespace)
+	if err != nil {
+		log.Fatal(err)
+	}
+	return db, vdb
+}
+
+// openDB opens the database described by spec; spec.Kind must be "mem", "pebble", or "firestore", otherwise an error is returned.
+func openDB(spec *dbspec.Spec) (storage.DB, error) {
+	switch spec.Kind {
+	case "mem":
+		return storage.MemDB(), nil
+	case "pebble":
+		return pebble.Open(logger, spec.Location)
+	case "firestore":
+		return firestore.NewDB(context.TODO(), logger, spec.Location, spec.Name)
+	default:
+		return nil, fmt.Errorf("unknown DB kind %q", spec.Kind)
+	}
+}
diff --git a/internal/docs/docs.go b/internal/docs/docs.go
@@ -77,7 +77,7 @@ func (c *Corpus) Get(id string) (doc *Doc, ok bool) {
 
 // Add adds a document with the given id, title, and text.
 // If the document already exists in the corpus with the same title and text,
-// Add is an no-op.
+// Add is a no-op.
 // Otherwise, if the document already exists in the corpus, it is replaced.
 func (c *Corpus) Add(id, title, text string) {
 	old, ok := c.Get(id)
@@ -89,6 +89,18 @@ func (c *Corpus) Add(id, title, text string) {
 	b.Apply()
 }
 
+// Delete deletes the document with the given id.
+// If the document does not exist in the corpus, Delete is a no-op.
+func (c *Corpus) Delete(id string) {
+	doc, ok := c.Get(id)
+	if !ok {
+		return
+	}
+	b := c.db.Batch()
+	timed.Delete(c.db, b, docsKind, ordered.Encode(doc.ID))
+	b.Apply()
+}
+
 // Docs returns an iterator over all documents in the corpus
 // with IDs starting with a given prefix.
 // The documents are ordered by ID.

diff --git a/internal/docs/docs_test.go b/internal/docs/docs_test.go
@@ -117,4 +117,16 @@ func TestCorpus(t *testing.T) {
 	if !slices.Equal(ids, want) {
 		t.Errorf("DocsAfter(0, id1) = %v, want %v", ids, want)
 	}
+
+	// After Delete id1.
+	corpus.Delete("id1")
+	corpus.Delete("id1111") // doesn't exist
+	ids = nil
+	for d := range corpus.Docs("id1") {
+		do(d)
+	}
+	want = []string{"id11"}
+	if !slices.Equal(ids, want) {
+		t.Errorf("DocsAfter(0, id1) = %v, want %v", ids, want)
+	}
 }