Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# FileNest .gitignore

# Local files to ignore
tasks/mnist_classifier/src/ray_tune.py

# Environment variables
.env
.env.local
Expand Down Expand Up @@ -273,7 +276,7 @@ uploads/
downloads/
cache/
embeddings/
models/
# models/
vectors/

# FileNest configuration files (sensitive)
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ FileNest/

```bash
# Clone the repository
git clone https://github.com/lakshyajain-0291/FileNest.git
git clone https://github.com/AISocietyIITJ/FileNest.git
cd FileNest

# Set up backend dependencies
Expand Down
17 changes: 17 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
module filenest

go 1.24.4

require (
github.com/jackc/pgpassfile v1.0.0 // indirect
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect
github.com/jackc/pgx/v5 v5.7.5 // indirect
github.com/jackc/puddle/v2 v2.2.2 // indirect
github.com/jinzhu/inflection v1.0.0 // indirect
github.com/jinzhu/now v1.1.5 // indirect
golang.org/x/crypto v0.39.0 // indirect
golang.org/x/sync v0.15.0 // indirect
golang.org/x/text v0.26.0 // indirect
gorm.io/driver/postgres v1.6.0 // indirect
gorm.io/gorm v1.30.0 // indirect
)
29 changes: 29 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsIM=
github.com/jackc/pgpassfile v1.0.0/go.mod h1:CEx0iS5ambNFdcRtxPj5JhEz+xB6uRky5eyVu/W2HEg=
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 h1:iCEnooe7UlwOQYpKFhBabPMi4aNAfoODPEFNiAnClxo=
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761/go.mod h1:5TJZWKEWniPve33vlWYSoGYefn3gLQRzjfDlhSJ9ZKM=
github.com/jackc/pgx/v5 v5.7.5 h1:JHGfMnQY+IEtGM63d+NGMjoRpysB2JBwDr5fsngwmJs=
github.com/jackc/pgx/v5 v5.7.5/go.mod h1:aruU7o91Tc2q2cFp5h4uP3f6ztExVpyVv88Xl/8Vl8M=
github.com/jackc/puddle/v2 v2.2.2 h1:PR8nw+E/1w0GLuRFSmiioY6UooMp6KJv0/61nB7icHo=
github.com/jackc/puddle/v2 v2.2.2/go.mod h1:vriiEXHvEE654aYKXXjOvZM39qJ0q+azkZFrfEOc3H4=
github.com/jinzhu/inflection v1.0.0 h1:K317FqzuhWc8YvSVlFMCCUb36O/S9MCKRDI7QkRKD/E=
github.com/jinzhu/inflection v1.0.0/go.mod h1:h+uFLlag+Qp1Va5pdKtLDYj+kHp5pxUVkryuEj+Srlc=
github.com/jinzhu/now v1.1.5 h1:/o9tlHleP7gOFmsnYNz3RGnqzefHA47wQpKrrdTIwXQ=
github.com/jinzhu/now v1.1.5/go.mod h1:d3SSVoowX0Lcu0IBviAWJpolVfI5UJVZZ7cO71lE/z8=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
golang.org/x/crypto v0.39.0 h1:SHs+kF4LP+f+p14esP5jAoDpHU8Gu/v9lFRK6IT5imM=
golang.org/x/crypto v0.39.0/go.mod h1:L+Xg3Wf6HoL4Bn4238Z6ft6KfEpN0tJGo53AAPC632U=
golang.org/x/sync v0.15.0 h1:KWH3jNZsfyT6xfAfKiz6MRNmd46ByHDYaZ7KSkCtdW8=
golang.org/x/sync v0.15.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA=
golang.org/x/text v0.26.0 h1:P42AVeLghgTYr4+xUnTRKDMqpar+PtX7KWuNQL21L8M=
golang.org/x/text v0.26.0/go.mod h1:QK15LZJUUQVJxhz7wXgxSy/CJaTFjd0G+YLonydOVQA=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gorm.io/driver/postgres v1.6.0 h1:2dxzU8xJ+ivvqTRph34QX+WrRaJlmfyPqXmoGVjMBa4=
gorm.io/driver/postgres v1.6.0/go.mod h1:vUw0mrGgrTK+uPHEhAdV4sfFELrByKVGnaVRkXDhtWo=
gorm.io/gorm v1.30.0 h1:qbT5aPv1UH8gI99OsRlvDToLxW5zR7FzS9acZDOZcgs=
gorm.io/gorm v1.30.0/go.mod h1:8Z33v652h4//uMA76KjeDH8mJXPm1QNCYrMeatR0DOE=
78 changes: 78 additions & 0 deletions tasks/backend_prototype/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# FileNest Backend Prototype

This project is a backend prototype for indexing and storing file embeddings using Go, PostgreSQL (with GORM), and a worker pool architecture. It demonstrates concurrent file processing, embedding generation, and database storage.

## Features

- Walks a directory and processes all `.txt` files
- Generates random embeddings for each file (placeholder for real embedding models)
- Calculates cosine similarity to a set of reference embeddings (D1TV)
- Stores file metadata and embeddings in PostgreSQL using GORM
- Uses a worker pool for concurrent file processing
- Graceful shutdown on interrupt

## Project Structure

```
backend_prototype/
├── db/
│   └── database.go      # Database connection and initialization
├── embedding.go         # Embedding and similarity functions
├── go.mod               # Go module definition
├── go.sum               # Go dependencies
├── main.go              # Application entry point
├── models/
│   └── model.go         # FileIndex model definition
├── process.go           # File processing logic
├── worker.go            # Worker pool and job processing
├── sample_texts/
│   └── a.txt ... j.txt  # Sample text files for indexing
└── README.md            # Project documentation
```

## Prerequisites

- Go 1.24.4 or higher (the version declared in `go.mod`; Go 1.21+ can download the matching toolchain automatically)
- PostgreSQL database
- Set the `POSTGRES_DSN` environment variable with your PostgreSQL connection string

## Getting Started

1. **Install dependencies:**
```sh
go mod tidy
```

2. **Set up your PostgreSQL DSN:**
```sh
export POSTGRES_DSN="host=localhost user=youruser password=yourpass dbname=yourdb port=5432 sslmode=disable"
```

3. **Run the application:**
```sh
go run . -dir ./sample_texts
```

By default, it will process all `.txt` files in `sample_texts/` using 5 workers.

4. **Command-line options:**
- `-w`: Number of concurrent workers (default: 5)
- `-dir`: Directory to index (default: ./sample_texts)
- `-timeout`: Per-file processing timeout (default: 5s)

Example:
```sh
go run . -w 10 -dir ./sample_texts -timeout 10s
```

## How It Works

- The app walks through the specified directory, sending `.txt` files to a pool of workers.
- Each worker reads the file, generates a random embedding, finds the most similar D1TV embedding, and stores the result in PostgreSQL.
- Embeddings are stored as arrays using the `pq.Float64Array` type.

## Notes

- The embedding generation is a placeholder; replace `generateEmbedding` in `embedding.go` with your actual model.
- The database schema is auto-migrated on startup.
- Sample text files are provided in `sample_texts/`.
23 changes: 23 additions & 0 deletions tasks/backend_prototype/db/database.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
package db

import (
"log"
"os" //to read environment variables

"backend_prototype/models"

"gorm.io/driver/postgres"
"gorm.io/gorm" //main GORM library
)

// DB is the package-level GORM connection handle. It is assigned by InitDB.
var DB *gorm.DB

// InitDB opens the PostgreSQL connection described by the POSTGRES_DSN
// environment variable, stores it in the package-level DB, and migrates the
// schema so that it matches models.FileIndex.
//
// It terminates the process via log.Fatalf if the connection cannot be
// opened or if the schema migration fails.
func InitDB() {
	dsn := os.Getenv("POSTGRES_DSN")

	var err error
	DB, err = gorm.Open(postgres.Open(dsn), &gorm.Config{})
	if err != nil {
		log.Fatalf("Failed to connect DB: %v", err)
	}

	// AutoMigrate reports failures through its return value; ignoring it
	// would let the program continue against a missing or stale table.
	if err := DB.AutoMigrate(&models.FileIndex{}); err != nil {
		log.Fatalf("Failed to migrate DB schema: %v", err)
	}
}
26 changes: 26 additions & 0 deletions tasks/backend_prototype/embedding.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
package main

import (
"math"
"math/rand"
)

// embeddingDim is the number of components in every vector returned by
// generateEmbedding.
const embeddingDim = 128

// generateEmbedding returns a vector of embeddingDim pseudo-random values in
// [0, 1). The content argument is currently ignored: this is a placeholder
// for a real embedding model, which would derive the vector from the text.
func generateEmbedding(content string) []float64 {
	out := make([]float64, 0, embeddingDim)
	for len(out) < embeddingDim {
		out = append(out, rand.Float64())
	}
	return out
}

// cosineSimilarity returns the cosine of the angle between a and b: their dot
// product divided by the product of their Euclidean norms.
//
// All components of both vectors are used, whatever their length. The result
// is 0 when the vectors differ in length, when they are empty, or when either
// one has zero magnitude, so the function neither panics on short input nor
// returns NaN from a division by zero.
func cosineSimilarity(a, b []float64) float64 {
	if len(a) == 0 || len(a) != len(b) {
		return 0
	}

	var dot, normA, normB float64
	for i, x := range a {
		y := b[i]
		dot += x * y   // running dot product
		normA += x * x // squared magnitude of a
		normB += y * y // squared magnitude of b
	}

	// A zero-magnitude vector has no direction; report "no similarity"
	// instead of dividing by zero.
	if normA == 0 || normB == 0 {
		return 0
	}
	return dot / (math.Sqrt(normA) * math.Sqrt(normB))
}
21 changes: 21 additions & 0 deletions tasks/backend_prototype/go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
module backend_prototype

go 1.24.4

require (
github.com/lib/pq v1.10.9
gorm.io/driver/postgres v1.6.0
gorm.io/gorm v1.30.0
)

require (
github.com/jackc/pgpassfile v1.0.0 // indirect
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect
github.com/jackc/pgx/v5 v5.6.0 // indirect
github.com/jackc/puddle/v2 v2.2.2 // indirect
github.com/jinzhu/inflection v1.0.0 // indirect
github.com/jinzhu/now v1.1.5 // indirect
golang.org/x/crypto v0.31.0 // indirect
golang.org/x/sync v0.10.0 // indirect
golang.org/x/text v0.21.0 // indirect
)
38 changes: 38 additions & 0 deletions tasks/backend_prototype/go.sum
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsIM=
github.com/jackc/pgpassfile v1.0.0/go.mod h1:CEx0iS5ambNFdcRtxPj5JhEz+xB6uRky5eyVu/W2HEg=
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 h1:iCEnooe7UlwOQYpKFhBabPMi4aNAfoODPEFNiAnClxo=
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761/go.mod h1:5TJZWKEWniPve33vlWYSoGYefn3gLQRzjfDlhSJ9ZKM=
github.com/jackc/pgx/v5 v5.6.0 h1:SWJzexBzPL5jb0GEsrPMLIsi/3jOo7RHlzTjcAeDrPY=
github.com/jackc/pgx/v5 v5.6.0/go.mod h1:DNZ/vlrUnhWCoFGxHAG8U2ljioxukquj7utPDgtQdTw=
github.com/jackc/puddle/v2 v2.2.2 h1:PR8nw+E/1w0GLuRFSmiioY6UooMp6KJv0/61nB7icHo=
github.com/jackc/puddle/v2 v2.2.2/go.mod h1:vriiEXHvEE654aYKXXjOvZM39qJ0q+azkZFrfEOc3H4=
github.com/jinzhu/inflection v1.0.0 h1:K317FqzuhWc8YvSVlFMCCUb36O/S9MCKRDI7QkRKD/E=
github.com/jinzhu/inflection v1.0.0/go.mod h1:h+uFLlag+Qp1Va5pdKtLDYj+kHp5pxUVkryuEj+Srlc=
github.com/jinzhu/now v1.1.5 h1:/o9tlHleP7gOFmsnYNz3RGnqzefHA47wQpKrrdTIwXQ=
github.com/jinzhu/now v1.1.5/go.mod h1:d3SSVoowX0Lcu0IBviAWJpolVfI5UJVZZ7cO71lE/z8=
github.com/lib/pq v1.10.9 h1:YXG7RB+JIjhP29X+OtkiDnYaXQwpS4JEWq7dtCCRUEw=
github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
golang.org/x/crypto v0.31.0 h1:ihbySMvVjLAeSH1IbfcRTkD/iNscyz8rGzjF/E5hV6U=
golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk=
golang.org/x/sync v0.10.0 h1:3NQrjDixjgGwUOCaF8w2+VYHv0Ve/vGYSbdkTa98gmQ=
golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo=
golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gorm.io/driver/postgres v1.6.0 h1:2dxzU8xJ+ivvqTRph34QX+WrRaJlmfyPqXmoGVjMBa4=
gorm.io/driver/postgres v1.6.0/go.mod h1:vUw0mrGgrTK+uPHEhAdV4sfFELrByKVGnaVRkXDhtWo=
gorm.io/gorm v1.30.0 h1:qbT5aPv1UH8gI99OsRlvDToLxW5zR7FzS9acZDOZcgs=
gorm.io/gorm v1.30.0/go.mod h1:8Z33v652h4//uMA76KjeDH8mJXPm1QNCYrMeatR0DOE=
72 changes: 72 additions & 0 deletions tasks/backend_prototype/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
package main

import (
"backend_prototype/db" //custom package for database operations (InitDB)
"context"
"flag" //for command-line flags like -w, -dir, -timeout
"log"
"os" //for file operations and environment variables (setting up postgres data source name (DSN))
"os/signal" //for ctrl+c handling
"path/filepath" //for walking the directory structure
"sync" //for managing concurrent workers
"time"
)

// main indexes every .txt file found under the -dir directory. File paths
// are sent over the jobs channel to a pool of -w goroutines, each of which
// runs startWorker.
//
// Flags:
//
//	-w        number of concurrent workers (default 5, must be at least 1)
//	-dir      directory to index (default ./sample_texts)
//	-timeout  per-file processing timeout (default 5s)
//
// An interrupt (Ctrl+C) cancels the directory walk; main then closes the jobs
// channel and waits for the workers to return before exiting. If the root
// directory itself cannot be read, main exits with a fatal "Walk error".
func main() {
	numWorkers := flag.Int("w", 5, "number of concurrent workers")
	dirPath := flag.String("dir", "./sample_texts", "directory to index")
	timeout := flag.Duration("timeout", 5*time.Second, "per-file processing timeout")
	flag.Parse()

	// With no workers nothing would ever receive from the unbuffered jobs
	// channel, and the walk below would block forever on its first send.
	if *numWorkers < 1 {
		log.Fatalf("invalid -w %d: at least one worker is required", *numWorkers)
	}

	db.InitDB()

	// Reference ("D1TV") embeddings handed to every worker. generateEmbedding
	// is defined in embedding.go.
	const numD1TV = 10
	d1tvs := make([][]float64, 0, numD1TV)
	for i := 0; i < numD1TV; i++ {
		d1tvs = append(d1tvs, generateEmbedding("D1TV"))
	}

	jobs := make(chan FileJob) // unbuffered: a send completes only when a worker receives

	ctx, stop := context.WithCancel(context.Background())
	defer stop()

	// Cancel ctx on Ctrl+C. The handler is registered before the goroutine
	// starts so that an early interrupt is not missed, and the goroutine also
	// exits when ctx is cancelled for any other reason.
	sigCh := make(chan os.Signal, 1)
	signal.Notify(sigCh, os.Interrupt)
	defer signal.Stop(sigCh)
	go func() {
		select {
		case <-sigCh:
			log.Println("Interrupt received, shutting down...")
			stop()
		case <-ctx.Done():
		}
	}()

	// Start the worker pool; wg tracks when every worker has returned.
	var wg sync.WaitGroup
	for i := 1; i <= *numWorkers; i++ {
		wg.Add(1)
		go func(id int) {
			defer wg.Done()
			startWorker(id, jobs, d1tvs, *timeout)
		}(i)
	}

	// Walk the directory and queue every .txt file.
	walkErr := filepath.Walk(*dirPath, func(path string, info os.FileInfo, err error) error {
		if ctx.Err() != nil {
			return ctx.Err()
		}
		if err != nil {
			// An unreadable root means there is nothing to index: fail.
			// Anything deeper is reported and skipped.
			if path == *dirPath {
				return err
			}
			log.Printf("Skipping %s: %v", path, err)
			return nil
		}
		if info.IsDir() || filepath.Ext(path) != ".txt" {
			return nil
		}
		// Stay responsive to cancellation while waiting for a free worker.
		select {
		case jobs <- FileJob{Path: path}:
			return nil
		case <-ctx.Done():
			return ctx.Err()
		}
	})

	// Signal "no more jobs" and wait for the workers before reporting the
	// outcome, so they are not cut off by log.Fatalf.
	close(jobs)
	wg.Wait()

	if walkErr != nil && walkErr != context.Canceled {
		log.Fatalf("Walk error: %v", walkErr)
	}
	log.Println("Indexing complete.")
}
16 changes: 16 additions & 0 deletions tasks/backend_prototype/models/model.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
package models

import (
"time"

"github.com/lib/pq" //provides pq.Float64Array for []float64 type
)

// FileIndex is the GORM model for one indexed file together with its
// embedding vector.
type FileIndex struct {
	// ID is the primary key (declared through the gorm tag).
	ID uint `gorm:"primaryKey"`
	// Filename is presumably the file's base name; it is set by code outside
	// this file. TODO confirm.
	Filename string
	// Filepath is presumably the path the file was read from. TODO confirm.
	Filepath string
	// Embedding is stored in a float8[] column; pq.Float64Array is used so the
	// []float64 data can be stored as a PostgreSQL array.
	Embedding pq.Float64Array `gorm:"type:float8[]"`
	// D1TVID presumably identifies the most similar D1TV reference embedding.
	// TODO confirm against the worker code.
	D1TVID int
	// IndexedAt is presumably the time the file was indexed. TODO confirm.
	IndexedAt time.Time
}
Loading