From e238ae97a73dcd8a58f8fbe2226300a3bed8a480 Mon Sep 17 00:00:00 2001 From: "Hana (Hyang-Ah) Kim" Date: Thu, 12 Dec 2024 21:05:40 -0500 Subject: [PATCH] internal/devtools/cmd/rmdoc: delete crawled pages from corpus Gaby splits each crawled webpage into docs for embedding, computes embeddings, and stores them in the vector db. Delete all the docs and their embeddings. This is meant to be run after the webpage is excluded from crawling with Crawler.Deny. For golang/oscar#63 Change-Id: I095a65b9a834ccf48062facc3654f40b43562e15 Reviewed-on: https://go-review.googlesource.com/c/oscar/+/635176 LUCI-TryBot-Result: Go LUCI Reviewed-by: Jonathan Amsterdam --- internal/devtools/cmd/rmdoc/main.go | 126 ++++++++++++++++++++++++++++ internal/docs/docs.go | 14 +++- internal/docs/docs_test.go | 12 +++ 3 files changed, 151 insertions(+), 1 deletion(-) create mode 100644 internal/devtools/cmd/rmdoc/main.go diff --git a/internal/devtools/cmd/rmdoc/main.go b/internal/devtools/cmd/rmdoc/main.go new file mode 100644 index 0000000..26ccbbf --- /dev/null +++ b/internal/devtools/cmd/rmdoc/main.go @@ -0,0 +1,126 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// rmdoc deletes the documents from the corpus (including the vector db). +// +// Usage: go run . 
-project oscar-go-1 -firestoredb devel https://go.dev/x/y/z +package main + +import ( + "context" + "flag" + "fmt" + "log" + "log/slog" + "strings" + + "cloud.google.com/go/compute/metadata" + "golang.org/x/oscar/internal/dbspec" + "golang.org/x/oscar/internal/docs" + "golang.org/x/oscar/internal/gcp/firestore" + "golang.org/x/oscar/internal/pebble" + "golang.org/x/oscar/internal/storage" +) + +var flags = struct { + project string + firestoredb string + overlay string +}{} + +func init() { + flag.StringVar(&flags.project, "project", "", "name of the Google Cloud Project") + flag.StringVar(&flags.firestoredb, "firestoredb", "", "name of the firestore db") +} + +var logger = slog.Default() + +func main() { + flag.Parse() + + args := flag.Args() + if len(args) == 0 { + log.Fatal("no args") + } + + gabyDB, gabyVectorDB := initGCP() + corpus := docs.New(logger, gabyDB) + + for _, url := range args { + if !strings.HasPrefix(url, "https://go.dev/") { + log.Println("ignoring unrecognized url:", url) + continue + } + + // TODO: do we need to delete crawl.Page entries too? + + for doc := range corpus.Docs(url) { + hasVector := " " + if _, ok := gabyVectorDB.Get(doc.ID); ok { + hasVector = "*" + } + fmt.Printf("%v %v", hasVector, doc.ID) + + fmt.Printf(" delete (y/N)? 
") + var a string + fmt.Scanln(&a) + if answer := strings.ToLower(strings.TrimSpace(a)); answer == "y" || answer == "yes" { + gabyVectorDB.Delete(doc.ID) + corpus.Delete(doc.ID) + if _, ok := gabyVectorDB.Get(doc.ID); ok { + log.Fatalf("error - %v not removed from vector db", doc.ID) + } + fmt.Print(" ↪ deleted") + } else { + fmt.Print(" ↪ skipped") + } + fmt.Println() + } + } +} + +func initGCP() (storage.DB, storage.VectorDB) { + ctx := context.TODO() + + if flags.project == "" { + projectID, err := metadata.ProjectIDWithContext(ctx) + if err != nil { + log.Fatalf("metadata project ID: %v", err) + } + if projectID == "" { + log.Fatal("project ID from metadata is empty") + } + flags.project = projectID + } + + db, err := openDB(&dbspec.Spec{ + Kind: "firestore", + Location: flags.project, + Name: flags.firestoredb, + }) + if err != nil { + log.Fatal(err) + } + + const vectorDBNamespace = "gaby" + vdb, err := firestore.NewVectorDB(ctx, slog.Default(), flags.project, flags.firestoredb, vectorDBNamespace) + if err != nil { + log.Fatal(err) + } + return db, vdb +} + +// openDB opens the database described by spec. +func openDB(spec *dbspec.Spec) (storage.DB, error) { + switch spec.Kind { + case "mem": + return storage.MemDB(), nil + case "pebble": + return pebble.Open(logger, spec.Location) + case "firestore": + return firestore.NewDB(context.TODO(), logger, spec.Location, spec.Name) + default: + return nil, fmt.Errorf("unknown DB kind %q", spec.Kind) + } +} diff --git a/internal/docs/docs.go b/internal/docs/docs.go index 33de6f9..3644c42 100644 --- a/internal/docs/docs.go +++ b/internal/docs/docs.go @@ -77,7 +77,7 @@ func (c *Corpus) Get(id string) (doc *Doc, ok bool) { // Add adds a document with the given id, title, and text. // If the document already exists in the corpus with the same title and text, -// Add is an no-op. +// Add is a no-op. // Otherwise, if the document already exists in the corpus, it is replaced. 
func (c *Corpus) Add(id, title, text string) { old, ok := c.Get(id) @@ -89,6 +89,18 @@ func (c *Corpus) Add(id, title, text string) { b.Apply() } +// Delete deletes a document with the given id. +// If the document does not exist in the corpus, Delete is a no-op. +func (c *Corpus) Delete(id string) { + doc, ok := c.Get(id) + if !ok { + return + } + b := c.db.Batch() + timed.Delete(c.db, b, docsKind, ordered.Encode(doc.ID)) + b.Apply() +} + // Docs returns an iterator over all documents in the corpus // with IDs starting with a given prefix. // The documents are ordered by ID. diff --git a/internal/docs/docs_test.go b/internal/docs/docs_test.go index db969a7..d1dcb3b 100644 --- a/internal/docs/docs_test.go +++ b/internal/docs/docs_test.go @@ -117,4 +117,16 @@ func TestCorpus(t *testing.T) { if !slices.Equal(ids, want) { t.Errorf("DocsAfter(0, id1) = %v, want %v", ids, want) } + + // After Delete id1. + corpus.Delete("id1") + corpus.Delete("id1111") // doesn't exist + ids = nil + for d := range corpus.Docs("id1") { + do(d) + } + want = []string{"id11"} + if !slices.Equal(ids, want) { + t.Errorf("DocsAfter(0, id1) = %v, want %v", ids, want) + } }