diff --git a/internal/devtools/cmd/rmdoc/main.go b/internal/devtools/cmd/rmdoc/main.go
new file mode 100644
index 0000000..26ccbbf
--- /dev/null
+++ b/internal/devtools/cmd/rmdoc/main.go
@@ -0,0 +1,151 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// rmdoc interactively deletes documents from the corpus (including the vector db).
+// Each argument is a URL prefix: every document whose ID starts with it is listed,
+// marked with * if it also has a vector db entry, and deleted upon confirmation.
+//
+// Usage: go run . -project oscar-go-1 -firestoredb devel https://go.dev/x/y/z
+package main
+
+import (
+	"bufio"
+	"context"
+	"flag"
+	"fmt"
+	"log"
+	"log/slog"
+	"os"
+	"strings"
+
+	"cloud.google.com/go/compute/metadata"
+	"golang.org/x/oscar/internal/dbspec"
+	"golang.org/x/oscar/internal/docs"
+	"golang.org/x/oscar/internal/gcp/firestore"
+	"golang.org/x/oscar/internal/pebble"
+	"golang.org/x/oscar/internal/storage"
+)
+
+// flags holds the values of the command-line flags.
+var flags = struct {
+	project     string
+	firestoredb string
+}{}
+
+func init() {
+	flag.StringVar(&flags.project, "project", "", "name of the Google Cloud Project")
+	flag.StringVar(&flags.firestoredb, "firestoredb", "", "name of the firestore db")
+}
+
+var logger = slog.Default()
+
+// main asks, for every document whose ID starts with one of the URL arguments,
+// whether to delete it, and removes confirmed documents from both databases.
+func main() {
+	flag.Parse()
+
+	args := flag.Args()
+	if len(args) == 0 {
+		log.Fatal("no args")
+	}
+
+	gabyDB, gabyVectorDB := initGCP()
+	corpus := docs.New(logger, gabyDB)
+	stdin := bufio.NewReader(os.Stdin)
+
+	for _, url := range args {
+		if !strings.HasPrefix(url, "https://go.dev/") {
+			log.Println("ignoring unrecognized url:", url)
+			continue
+		}
+
+		// TODO: do we need to delete crawl.Page entries too?
+
+		for doc := range corpus.Docs(url) {
+			hasVector := " "
+			if _, ok := gabyVectorDB.Get(doc.ID); ok {
+				hasVector = "*"
+			}
+			fmt.Printf("%v %v", hasVector, doc.ID)
+
+			fmt.Printf(" delete (y/N)? ")
+			if confirm(stdin) {
+				gabyVectorDB.Delete(doc.ID)
+				corpus.Delete(doc.ID)
+				if _, ok := gabyVectorDB.Get(doc.ID); ok {
+					log.Fatalf("error - %v not removed from vector db", doc.ID)
+				}
+				fmt.Print(" ↪ deleted")
+			} else {
+				fmt.Print(" ↪ skipped")
+			}
+			fmt.Println()
+		}
+	}
+}
+
+// confirm reads one line from r and reports whether it is "y" or "yes",
+// ignoring case and surrounding white space.
+// It consumes the whole line so that extra words typed after an answer
+// cannot be taken as the answer to a later prompt.
+func confirm(r *bufio.Reader) bool {
+	// On a read error (such as EOF) line holds whatever was read before the
+	// error, possibly nothing; anything other than y/yes means no.
+	line, _ := r.ReadString('\n')
+	switch strings.ToLower(strings.TrimSpace(line)) {
+	case "y", "yes":
+		return true
+	}
+	return false
+}
+
+// initGCP opens the firestore database and the firestore vector database
+// selected by the -project and -firestoredb flags.
+// If -project is empty, the project ID is obtained from the metadata package.
+// Any failure is fatal.
+func initGCP() (storage.DB, storage.VectorDB) {
+	ctx := context.TODO()
+
+	if flags.project == "" {
+		projectID, err := metadata.ProjectIDWithContext(ctx)
+		if err != nil {
+			log.Fatalf("metadata project ID: %v", err)
+		}
+		if projectID == "" {
+			log.Fatal("project ID from metadata is empty")
+		}
+		flags.project = projectID
+	}
+
+	db, err := openDB(&dbspec.Spec{
+		Kind:     "firestore",
+		Location: flags.project,
+		Name:     flags.firestoredb,
+	})
+	if err != nil {
+		log.Fatal(err)
+	}
+
+	const vectorDBNamespace = "gaby"
+	vdb, err := firestore.NewVectorDB(ctx, logger, flags.project, flags.firestoredb, vectorDBNamespace)
+	if err != nil {
+		log.Fatal(err)
+	}
+	return db, vdb
+}
+
+// openDB opens the database described by spec.
+// The supported kinds are "mem", "pebble", and "firestore".
+func openDB(spec *dbspec.Spec) (storage.DB, error) {
+	switch spec.Kind {
+	case "mem":
+		return storage.MemDB(), nil
+	case "pebble":
+		return pebble.Open(logger, spec.Location)
+	case "firestore":
+		return firestore.NewDB(context.TODO(), logger, spec.Location, spec.Name)
+	default:
+		return nil, fmt.Errorf("unknown DB kind %q", spec.Kind)
+	}
+}
diff --git a/internal/docs/docs.go b/internal/docs/docs.go
index 33de6f9..3644c42 100644
--- a/internal/docs/docs.go
+++ b/internal/docs/docs.go
@@ -77,7 +77,7 @@ func (c *Corpus) Get(id string) (doc *Doc, ok bool) {
 
 // Add adds a document with the given id, title, and text.
 // If the document already exists in the corpus with the same title and text,
-// Add is an no-op.
+// Add is a no-op.
 // Otherwise, if the document already exists in the corpus, it is replaced.
 func (c *Corpus) Add(id, title, text string) {
 	old, ok := c.Get(id)
@@ -89,6 +89,18 @@ func (c *Corpus) Add(id, title, text string) {
 	b.Apply()
 }
 
+// Delete deletes the document with the given id from the corpus.
+// If the document does not exist in the corpus, Delete is a no-op.
+func (c *Corpus) Delete(id string) {
+	doc, ok := c.Get(id)
+	if !ok {
+		return
+	}
+	b := c.db.Batch()
+	timed.Delete(c.db, b, docsKind, ordered.Encode(doc.ID))
+	b.Apply()
+}
+
 // Docs returns an iterator over all documents in the corpus
 // with IDs starting with a given prefix.
 // The documents are ordered by ID.
diff --git a/internal/docs/docs_test.go b/internal/docs/docs_test.go
index db969a7..d1dcb3b 100644
--- a/internal/docs/docs_test.go
+++ b/internal/docs/docs_test.go
@@ -117,4 +117,19 @@ func TestCorpus(t *testing.T) {
 	if !slices.Equal(ids, want) {
 		t.Errorf("DocsAfter(0, id1) = %v, want %v", ids, want)
 	}
+
+	// After Delete id1.
+	corpus.Delete("id1")
+	corpus.Delete("id1111") // doesn't exist
+	if _, ok := corpus.Get("id1"); ok {
+		t.Error("Get(id1) succeeded after Delete(id1)")
+	}
+	ids = nil
+	for d := range corpus.Docs("id1") {
+		do(d)
+	}
+	want = []string{"id11"}
+	if !slices.Equal(ids, want) {
+		t.Errorf("Docs(id1) after Delete(id1) = %v, want %v", ids, want)
+	}
 }