-
Notifications
You must be signed in to change notification settings - Fork 20
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
internal/devtools/cmd/rmdoc: delete crawled pages from corpus
Gaby splits each crawled webpage into docs for embedding, computes embeddings, and stores them in the vector db. Delete all the docs and their embeddings. This is meant to be run after the webpage is excluded from crawling with Crawler.Deny. For #63 Change-Id: I095a65b9a834ccf48062facc3654f40b43562e15 Reviewed-on: https://go-review.googlesource.com/c/oscar/+/635176 LUCI-TryBot-Result: Go LUCI <[email protected]> Reviewed-by: Jonathan Amsterdam <[email protected]>
- Loading branch information
Showing
3 changed files
with
151 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,126 @@ | ||
// Copyright 2024 The Go Authors. All rights reserved. | ||
// Use of this source code is governed by a BSD-style | ||
// license that can be found in the LICENSE file. | ||
|
||
// rmdoc deletes the documents from the corpus (including the vector db). | ||
// | ||
// Usage: go run . -project oscar-go-1 -firestoredb devel https://go.dev/x/y/z | ||
package main | ||
|
||
import (
	"bufio"
	"context"
	"flag"
	"fmt"
	"log"
	"log/slog"
	"os"
	"strings"

	"cloud.google.com/go/compute/metadata"
	"golang.org/x/oscar/internal/dbspec"
	"golang.org/x/oscar/internal/docs"
	"golang.org/x/oscar/internal/gcp/firestore"
	"golang.org/x/oscar/internal/pebble"
	"golang.org/x/oscar/internal/storage"
)
|
||
var (
	// flags collects the values of the command-line options.
	// The project and firestoredb fields are bound to the -project and
	// -firestoredb flags in init; overlay is not bound to any flag here.
	flags struct {
		project     string
		firestoredb string
		overlay     string
	}

	// logger is the structured logger used throughout this command.
	logger = slog.Default()
)

// init registers the command-line flags on the default flag set.
func init() {
	flag.StringVar(&flags.firestoredb, "firestoredb", "", "name of the firestore db")
	flag.StringVar(&flags.project, "project", "", "name of the Google Cloud Project")
}
|
||
func main() { | ||
flag.Parse() | ||
|
||
args := flag.Args() | ||
if len(args) == 0 { | ||
log.Fatal("no args") | ||
} | ||
|
||
gabyDB, gabyVectorDB := initGCP() | ||
corpus := docs.New(logger, gabyDB) | ||
|
||
for _, url := range args { | ||
if !strings.HasPrefix(url, "https://go.dev/") { | ||
log.Println("ignoring unrecognized url:", url) | ||
continue | ||
} | ||
|
||
// TODO: do we need to delete crawl.Page entries too? | ||
|
||
for doc := range corpus.Docs(url) { | ||
hasVector := " " | ||
if _, ok := gabyVectorDB.Get(doc.ID); ok { | ||
hasVector = "*" | ||
} | ||
fmt.Printf("%v %v", hasVector, doc.ID) | ||
|
||
fmt.Printf(" delete (y/N)? ") | ||
var a string | ||
fmt.Scanln(&a) | ||
if answer := strings.ToLower(strings.TrimSpace(a)); answer == "y" || answer == "yes" { | ||
gabyVectorDB.Delete(doc.ID) | ||
corpus.Delete(doc.ID) | ||
if _, ok := gabyVectorDB.Get(doc.ID); ok { | ||
log.Fatalf("error - %v not removed from vector db", doc.ID) | ||
} | ||
fmt.Print(" ↪ deleted") | ||
} else { | ||
fmt.Print(" ↪ skipped") | ||
} | ||
fmt.Println() | ||
} | ||
} | ||
} | ||
|
||
func initGCP() (storage.DB, storage.VectorDB) { | ||
ctx := context.TODO() | ||
|
||
if flags.project == "" { | ||
projectID, err := metadata.ProjectIDWithContext(ctx) | ||
if err != nil { | ||
log.Fatalf("metadata project ID: %v", err) | ||
} | ||
if projectID == "" { | ||
log.Fatal("project ID from metadata is empty") | ||
} | ||
flags.project = projectID | ||
} | ||
|
||
db, err := openDB(&dbspec.Spec{ | ||
Kind: "firestore", | ||
Location: flags.project, | ||
Name: flags.firestoredb, | ||
}) | ||
if err != nil { | ||
log.Fatal(err) | ||
} | ||
|
||
const vectorDBNamespace = "gaby" | ||
vdb, err := firestore.NewVectorDB(ctx, slog.Default(), flags.project, flags.firestoredb, vectorDBNamespace) | ||
if err != nil { | ||
log.Fatal(err) | ||
} | ||
return db, vdb | ||
} | ||
|
||
// openDB opens the database described by spec. | ||
func openDB(spec *dbspec.Spec) (storage.DB, error) { | ||
switch spec.Kind { | ||
case "mem": | ||
return storage.MemDB(), nil | ||
case "pebble": | ||
return pebble.Open(logger, spec.Location) | ||
case "firestore": | ||
return firestore.NewDB(context.TODO(), logger, spec.Location, spec.Name) | ||
default: | ||
return nil, fmt.Errorf("unknown DB kind %q", spec.Kind) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters