Skip to content

Commit

Permalink
internal/devtools/cmd/rmdoc: delete crawled pages from corpus
Browse files Browse the repository at this point in the history
Gaby splits each crawled webpage into docs for embedding, computes
the embeddings, and stores them in the vector db. This tool deletes
all of those docs and their embeddings.

This is meant to be run after the webpage is excluded from
crawling with Crawler.Deny.

For #63

Change-Id: I095a65b9a834ccf48062facc3654f40b43562e15
Reviewed-on: https://go-review.googlesource.com/c/oscar/+/635176
LUCI-TryBot-Result: Go LUCI <[email protected]>
Reviewed-by: Jonathan Amsterdam <[email protected]>
  • Loading branch information
hyangah committed Dec 15, 2024
1 parent 5e25bc0 commit e238ae9
Show file tree
Hide file tree
Showing 3 changed files with 151 additions and 1 deletion.
126 changes: 126 additions & 0 deletions internal/devtools/cmd/rmdoc/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// rmdoc deletes the documents from the corpus (including the vector db).
//
// Usage: go run . -project oscar-go-1 -firestoredb devel https://go.dev/x/y/z
package main

import (
	"bufio"
	"context"
	"flag"
	"fmt"
	"log"
	"log/slog"
	"os"
	"strings"

	"cloud.google.com/go/compute/metadata"
	"golang.org/x/oscar/internal/dbspec"
	"golang.org/x/oscar/internal/docs"
	"golang.org/x/oscar/internal/gcp/firestore"
	"golang.org/x/oscar/internal/pebble"
	"golang.org/x/oscar/internal/storage"
)

// flags holds the values of the command-line options.
// All fields start out as the empty string.
var flags struct {
	// project is the Google Cloud project; when left empty,
	// initGCP fills it in from the metadata lookup.
	project string
	// firestoredb is the name of the Firestore database to open.
	firestoredb string
	// overlay is not bound to any flag in this file and is not read here.
	overlay string
}

// init registers the -project and -firestoredb string flags,
// both defaulting to the empty string, on the default flag set.
func init() {
	registrations := []struct {
		dst         *string
		name, usage string
	}{
		{&flags.project, "project", "name of the Google Cloud Project"},
		{&flags.firestoredb, "firestoredb", "name of the firestore db"},
	}
	for _, r := range registrations {
		flag.StringVar(r.dst, r.name, "", r.usage)
	}
}

// logger is the default slog logger. It is handed to docs.New and to
// the pebble and firestore DB constructors used in this file.
var logger = slog.Default()

// main parses the command line, opens the Gaby databases, and then walks
// every corpus document whose ID starts with one of the URL arguments,
// asking on the terminal whether that document should be deleted.
//
// Arguments that do not start with "https://go.dev/" are logged and ignored.
// In the listing, a leading "*" marks a document that currently has an
// entry in the vector db. A confirmed document is removed from the vector
// db and from the corpus; the program aborts if the vector db entry is
// still present afterwards.
func main() {
	flag.Parse()

	args := flag.Args()
	if len(args) == 0 {
		log.Fatal("no args")
	}

	gabyDB, gabyVectorDB := initGCP()
	corpus := docs.New(logger, gabyDB)

	// Answers are consumed one full line at a time. fmt.Scanln stores the
	// first token before reporting "expected newline" and leaves the rest
	// of the line unread, so a reply such as "y oops" would delete, and
	// leftover text could answer the next prompt. For a destructive tool
	// every prompt must be answered by exactly one line.
	stdin := bufio.NewReader(os.Stdin)

	for _, url := range args {
		if !strings.HasPrefix(url, "https://go.dev/") {
			log.Println("ignoring unrecognized url:", url)
			continue
		}

		// TODO: do we need to delete crawl.Page entries too?

		for doc := range corpus.Docs(url) {
			hasVector := " "
			if _, ok := gabyVectorDB.Get(doc.ID); ok {
				hasVector = "*"
			}
			fmt.Printf("%v %v", hasVector, doc.ID)
			fmt.Printf(" delete (y/N)? ")

			if !confirm(stdin) {
				fmt.Println(" ↪ skipped")
				continue
			}

			gabyVectorDB.Delete(doc.ID)
			corpus.Delete(doc.ID)
			// Read back to make sure the vector entry is really gone.
			if _, ok := gabyVectorDB.Get(doc.ID); ok {
				log.Fatalf("error - %v not removed from vector db", doc.ID)
			}
			fmt.Println(" ↪ deleted")
		}
	}
}

// confirm reads one line from r and reports whether the whole line,
// ignoring case and surrounding white space, is "y" or "yes".
// Anything else, including an empty line or end of input, means no.
func confirm(r *bufio.Reader) bool {
	// The error is deliberately ignored: on failure ReadString still
	// returns the bytes read so far, so a final line without a trailing
	// newline is honoured and an empty read (EOF) falls through to "no".
	line, _ := r.ReadString('\n')
	switch strings.ToLower(strings.TrimSpace(line)) {
	case "y", "yes":
		return true
	}
	return false
}

// initGCP opens the two stores the tool works on: the Firestore-backed
// storage.DB and the vector DB in the "gaby" namespace of the same
// project and database.
//
// When the -project flag is empty, the project ID is obtained through the
// compute metadata package and stored back into flags.project.
// Any failure terminates the program via log.Fatal.
func initGCP() (storage.DB, storage.VectorDB) {
	ctx := context.TODO()

	if flags.project == "" {
		id, err := metadata.ProjectIDWithContext(ctx)
		switch {
		case err != nil:
			log.Fatalf("metadata project ID: %v", err)
		case id == "":
			log.Fatal("project ID from metadata is empty")
		}
		flags.project = id
	}

	spec := dbspec.Spec{
		Kind:     "firestore",
		Location: flags.project,
		Name:     flags.firestoredb,
	}
	gabyDB, err := openDB(&spec)
	if err != nil {
		log.Fatal(err)
	}

	const vectorDBNamespace = "gaby"
	gabyVectorDB, err := firestore.NewVectorDB(ctx, slog.Default(), flags.project, flags.firestoredb, vectorDBNamespace)
	if err != nil {
		log.Fatal(err)
	}
	return gabyDB, gabyVectorDB
}

// openDB returns the storage.DB selected by spec.Kind.
//
// The recognized kinds are "mem" (storage.MemDB), "pebble" (opened with
// spec.Location) and "firestore" (opened with spec.Location and spec.Name).
// Any other kind is reported as an error.
func openDB(spec *dbspec.Spec) (storage.DB, error) {
	kind := spec.Kind
	if kind == "firestore" {
		return firestore.NewDB(context.TODO(), logger, spec.Location, spec.Name)
	}
	if kind == "pebble" {
		return pebble.Open(logger, spec.Location)
	}
	if kind == "mem" {
		return storage.MemDB(), nil
	}
	return nil, fmt.Errorf("unknown DB kind %q", kind)
}
14 changes: 13 additions & 1 deletion internal/docs/docs.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ func (c *Corpus) Get(id string) (doc *Doc, ok bool) {

// Add adds a document with the given id, title, and text.
// If the document already exists in the corpus with the same title and text,
// Add is an no-op.
// Add is a no-op.
// Otherwise, if the document already exists in the corpus, it is replaced.
func (c *Corpus) Add(id, title, text string) {
old, ok := c.Get(id)
Expand All @@ -89,6 +89,18 @@ func (c *Corpus) Add(id, title, text string) {
b.Apply()
}

// Delete removes the document with the given id from the corpus.
// If the corpus has no document with that id, Delete does nothing.
func (c *Corpus) Delete(id string) {
	found, exists := c.Get(id)
	if !exists {
		return
	}
	key := ordered.Encode(found.ID)
	batch := c.db.Batch()
	timed.Delete(c.db, batch, docsKind, key)
	batch.Apply()
}

// Docs returns an iterator over all documents in the corpus
// with IDs starting with a given prefix.
// The documents are ordered by ID.
Expand Down
12 changes: 12 additions & 0 deletions internal/docs/docs_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -117,4 +117,16 @@ func TestCorpus(t *testing.T) {
if !slices.Equal(ids, want) {
t.Errorf("DocsAfter(0, id1) = %v, want %v", ids, want)
}

// After Delete id1.
corpus.Delete("id1")
corpus.Delete("id1111") // doesn't exist
ids = nil
for d := range corpus.Docs("id1") {
do(d)
}
want = []string{"id11"}
if !slices.Equal(ids, want) {
t.Errorf("DocsAfter(0, id1) = %v, want %v", ids, want)
}
}

0 comments on commit e238ae9

Please sign in to comment.