diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..a55c894
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,73 @@
+name: CI
+
+on:
+  push:
+    branches: [ main ]
+
+jobs:
+  test:
+    name: Test
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Set up Go
+        uses: actions/setup-go@v4
+        with:
+          go-version: '1.21'
+
+      - name: Cache Go modules
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.cache/go-build
+            ~/go/pkg/mod
+          key: ${{ runner.os }}-go-${{ hashFiles('**/go.sum') }}
+          restore-keys: |
+            ${{ runner.os }}-go-
+
+      - name: Run tests
+        run: |
+          go test ./... -v -coverprofile=coverage.out
+
+      - name: Run vet
+        run: |
+          go vet ./...
+
+      - name: Upload coverage
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: coverage-report
+          path: coverage.out
+
+  build:
+    name: Build
+    needs: test
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        goos: [linux, windows, darwin]
+        goarch: [amd64]
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Set up Go
+        uses: actions/setup-go@v4
+        with:
+          go-version: '1.21'
+
+      - name: Build binary
+        run: |
+          mkdir -p build
+          BIN_NAME=mediarizer2
+          if [ "${{ matrix.goos }}" = "windows" ]; then BIN_NAME=mediarizer2.exe; fi
+          env GOOS=${{ matrix.goos }} GOARCH=${{ matrix.goarch }} go build -o build/${BIN_NAME}-${{ matrix.goos }}-${{ matrix.goarch }} ./app
+
+      - name: Upload build artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: mediarizer2-${{ matrix.goos }}-${{ matrix.goarch }}
+          path: build/
diff --git a/app/consumer.go b/app/consumer.go
index 440a040..39666cd 100644
--- a/app/consumer.go
+++ b/app/consumer.go
@@ -19,12 +19,15 @@ func consumer(
 	geoLocation bool,
 	format string,
 	verbose bool,
-	duplicateStrategy string,
 	processedFiles *int64,
-	done chan<- struct{}) {
+	done chan<- struct{},
+	dupChecker *duplicate.DuplicateChecker) {
 	var wg sync.WaitGroup
 
 	numWorkers := runtime.NumCPU() / 2
+	if numWorkers < 1 {
+		numWorkers = 1
+	}
 
 	for i := 0; i < numWorkers; i++ {
 		wg.Add(1)
@@ -38,7 +41,7 @@ func consumer(
 				geoLocation,
 				format,
 				verbose,
-				duplicateStrategy,
+				dupChecker,
 			)
 
 			atomic.AddInt64(processedFiles, 1)
@@ -57,55 +60,64 @@ func processFileInfo(
 	geoLocation bool,
 	format string,
 	verbose bool,
-	duplicateStrategy string,
+	dupChecker *duplicate.DuplicateChecker,
 ) {
-	var generatedPath string
-	var err error
+	// Check for duplicates first.
+	isDup, originalPath, err := dupChecker.CheckAndTrack(fileInfo.Path)
+	if err != nil {
+		errorQueue <- fmt.Errorf("failed to check duplicate for %s: %v", fileInfo.Path, err)
+		return
+	}
+
+	if isDup {
+		duplicatesDir := filepath.Join(destinationPath, "duplicates")
+		if err := duplicate.MoveDuplicate(fileInfo.Path, originalPath, duplicatesDir); err != nil {
+			errorQueue <- fmt.Errorf("failed to move duplicate %s: %v", fileInfo.Path, err)
+		}
+		return
+	}
 
+	var generatedPath string
 	generatedPath, err = getDestinationPath(destinationPath, fileInfo, geoLocation, format)
 	if err != nil {
 		errorQueue <- err
 		return
	}
 
-	if fileInfo.isDuplicate {
-		generatedPath, err = duplicate.CreateDuplicateFolder(generatedPath, "DUPLICATE")
+	// Check whether the file already exists and add a numeric suffix if needed.
+	_, err = os.Stat(generatedPath)
+	if !os.IsNotExist(err) {
+		generatedPath, err = generateUniquePathName(generatedPath)
 		if err != nil {
 			errorQueue <- err
 			return
 		}
-		generatedPath = filepath.Join(generatedPath, filepath.Base(fileInfo.Path))
-	} else {
-		_, err = os.Stat(generatedPath)
-		if !os.IsNotExist(err) {
-			generatedPath, err = generateUniquePathName(generatedPath)
-			if err != nil {
-				errorQueue <- err
-				return
-			}
-		}
 	}
 
 	err = moveFile(
 		fileInfo.Path,
 		generatedPath,
 		verbose,
-		fileInfo.isDuplicate,
-		duplicateStrategy,
 	)
 	if err != nil {
 		errorQueue <- fmt.Errorf("failed to move %s to %s: %v", fileInfo.Path, generatedPath, err)
+	} else {
+		// Track the new file in the index.
+		if err := dupChecker.TrackNewFile(generatedPath); err != nil {
+			// Warn but do not fail the already-completed move.
+			logger(LoggerTypeWarning, fmt.Sprintf("failed to track new file %s: %v", generatedPath, err))
+		}
 	}
 }
 
-func moveFile(sourcePath, destinationPath string, verbose bool, isDuplicate bool, duplicateStrategy string) error {
+func moveFile(sourcePath, destinationPath string, verbose bool) error {
 	destPath := filepath.Dir(destinationPath)
 	if err := os.MkdirAll(destPath, os.ModePerm); err != nil {
 		return fmt.Errorf("failed to create destination directory %s: %v", destPath, err)
 	}
 
 	if verbose {
-		moveActionLog, err := logMoveAction(sourcePath, destPath, isDuplicate, duplicateStrategy)
+		moveActionLog, err := logMoveAction(sourcePath, destPath)
 		if err != nil {
 			return err
 		}
diff --git a/app/consumer_test.go b/app/consumer_test.go
new file mode 100644
index 0000000..1e7a9b2
--- /dev/null
+++ b/app/consumer_test.go
@@ -0,0 +1,26 @@
+package main
+
+import (
+	"os"
+	"path/filepath"
+	"testing"
+)
+
+func TestGenerateUniquePathName_AppendsCounter(t *testing.T) {
+	dir := t.TempDir()
+	p := filepath.Join(dir, "file.jpg")
+	if err := os.WriteFile(p, []byte("x"), 0644); err != nil {
+		t.Fatalf("write: %v", err)
+	}
+
+	p2, err := generateUniquePathName(p)
+	if err != nil {
+		t.Fatalf("generateUniquePathName: %v", err)
+	}
+	if p2 == p {
+		t.Fatalf("expected different path")
+	}
+	if filepath.Ext(p2) != ".jpg" {
+		t.Fatalf("expected same extension")
+	}
+}
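Reviewer note: the consumer now resolves name collisions through `generateUniquePathName`, whose contract the new test pins down (a different path, same extension). The helper itself is not part of this diff; below is a hypothetical sketch of a suffix-counter implementation that would satisfy that contract. The function body and behavior are assumptions, not the repo's actual code.

```go
// Hypothetical sketch consistent with TestGenerateUniquePathName_AppendsCounter;
// the repo's real generateUniquePathName is not shown in this diff and may differ.
package main

import (
	"fmt"
	"os"
	"path/filepath"
	"strings"
)

func generateUniquePathName(path string) (string, error) {
	ext := filepath.Ext(path)             // e.g. ".jpg"
	base := strings.TrimSuffix(path, ext) // path without the extension
	for i := 1; ; i++ {
		candidate := fmt.Sprintf("%s_%d%s", base, i, ext)
		_, err := os.Stat(candidate)
		if os.IsNotExist(err) {
			return candidate, nil // first free name wins
		}
		if err != nil {
			return "", err // unexpected stat error
		}
	}
}

func main() {
	p, _ := generateUniquePathName("photos/file.jpg")
	fmt.Println(p) // e.g. photos/file_1.jpg
}
```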
diff --git a/app/creator.go b/app/creator.go
index 8d1b8ce..95bba7b 100644
--- a/app/creator.go
+++ b/app/creator.go
@@ -10,8 +10,6 @@ import (
 	"sync"
 	"time"
 
-	"github.com/keybraker/mediarizer-2/duplicate"
-
 	"github.com/rwcarlsen/goexif/exif"
 )
 
@@ -27,7 +25,6 @@ func creator(
 	fileTypesToInclude []string,
 	organisePhotos bool,
 	organiseVideos bool,
-	duplicateStrategy string,
 	fileHashMap *sync.Map,
 	hashCache *sync.Map,
 ) {
@@ -36,6 +33,9 @@ func creator(
 	var wg sync.WaitGroup
 
 	numWorkers := runtime.NumCPU() / 2
+	if numWorkers < 1 {
+		numWorkers = 1
+	}
 
 	for i := 0; i < numWorkers; i++ {
 		wg.Add(1)
@@ -52,7 +52,6 @@ func creator(
 				fileTypesToInclude,
 				organisePhotos,
 				organiseVideos,
-				duplicateStrategy,
 				fileHashMap,
 				hashCache,
 			)
@@ -93,7 +92,6 @@ func processFile(
 	fileTypesToInclude []string,
 	organisePhotos bool,
 	organiseVideos bool,
-	duplicateStrategy string,
 	fileHashMap *sync.Map,
 	hashCache *sync.Map,
 ) {
@@ -110,28 +108,6 @@ func processFile(
 		return
 	}
 
-	isDuplicate, err := duplicate.IsDuplicate(path, duplicateStrategy, fileHashMap, hashCache)
-	if err != nil {
-		errorQueue <- err
-		return
-	}
-
-	if isDuplicate {
-		switch duplicateStrategy {
-		case "skip":
-			fmt.Printf("Skipped duplicate file: %v\n", path)
-			logMoveAction(path, "", true, duplicateStrategy)
-			return
-		case "delete":
-			if err := os.Remove(path); err != nil {
-				errorQueue <- fmt.Errorf("failed to delete duplicate file: %v", err)
-			} else {
-				logMoveAction(path, "", true, duplicateStrategy)
-			}
-			return
-		}
-	}
-
 	if geoLocation {
 		country, err := getCountry(path)
 		if err != nil {
@@ -141,7 +117,7 @@ func processFile(
 			warnQueue <- fmt.Sprintf("no country found for file: %v", path)
 		}
 
-		fileQueue <- FileInfo{Path: path, FileType: fileType, isDuplicate: isDuplicate, Country: country}
+		fileQueue <- FileInfo{Path: path, FileType: fileType, Country: country}
 	} else {
 		createdDate, hasCreationDate, err := getCreatedTime(path)
 		if err != nil {
@@ -152,7 +128,6 @@ func processFile(
 		fileQueue <- FileInfo{
 			Path:            path,
 			FileType:        fileType,
-			isDuplicate:     isDuplicate,
 			Created:         createdDate,
 			HasCreationDate: hasCreationDate,
 		}
 	}
 }
 
@@ -160,23 +135,6 @@ func getFileType(path string, fileTypesToInclude []string, organisePhotos bool, organiseVideos bool) FileType {
-	file, err := os.Open(path)
-	if err != nil {
-		logger(LoggerTypeWarning, fmt.Sprintf("failed to open file %v: %v", path, err))
-		return FileTypeUnknown
-	}
-	defer file.Close()
-
-	fileInfo, err := file.Stat()
-	if err != nil {
-		logger(LoggerTypeWarning, fmt.Sprintf("failed to get file info: %v", err))
-		return FileTypeUnknown
-	}
-
-	if fileInfo.IsDir() {
-		return FileTypeFolder
-	}
-
 	fileType := FileTypeUnknown
 	if fileTypesToInclude != nil {
 		fileType = FileTypeExcluded
+ if got := getFileType("C:/tmp/a.bin", nil, true, true); got != FileTypeUnknown { + t.Fatalf("expected unknown, got %v", got) + } +} diff --git a/app/file.go b/app/file.go index fb8ab26..6d2e7cd 100644 --- a/app/file.go +++ b/app/file.go @@ -33,6 +33,10 @@ func getPhotoType(fileExt string) PhotoType { return PNG case ".gif": return GIF + case ".dng": + return DNG + case ".nef": + return NEF default: return -1 } diff --git a/app/file_test.go b/app/file_test.go new file mode 100644 index 0000000..78681a2 --- /dev/null +++ b/app/file_test.go @@ -0,0 +1,22 @@ +package main + +import "testing" + +func TestPhotoAndVideoDetection(t *testing.T) { + if !isPhoto(".JPG") { + t.Fatalf("expected .JPG to be photo") + } + if !isPhoto(".jpeg") { + t.Fatalf("expected .jpeg to be photo") + } + if isPhoto(".txt") { + t.Fatalf("expected .txt to not be photo") + } + + if !isVideo(".mp4") { + t.Fatalf("expected .mp4 to be video") + } + if isVideo(".jpg") { + t.Fatalf("expected .jpg to not be video") + } +} diff --git a/app/logger.go b/app/logger.go index 25d3871..d88c08a 100644 --- a/app/logger.go +++ b/app/logger.go @@ -4,6 +4,7 @@ import ( "fmt" "os" "path/filepath" + "strings" "sync" ) @@ -52,31 +53,12 @@ func infoHandler(infoQueue chan string) { } } -func logMoveAction(sourcePath, destinationDirectory string, isDuplicate bool, duplicateStrategy string) (string, error) { +func logMoveAction(sourcePath, destinationDirectory string) (string, error) { colorCode := "\033[32m" - actionName := "Moved (original)" + actionName := "Moved" fileName := filepath.Base(sourcePath) - if isDuplicate { - switch duplicateStrategy { - case "move": - colorCode = "\033[33m" - actionName = "Moved (duplicate)" - case "skip": - colorCode = "\033[34m" - actionName = "Skipped (duplicate)" - return fmt.Sprintf("\033[1m%s%s\033[0m %s\n", colorCode, actionName, fileName), nil - case "delete": - colorCode = "\033[31m" - actionName = "Deleted (duplicate)" - return fmt.Sprintf("\033[1m%s%s\033[0m %s\n", colorCode, actionName, fileName), nil - default: - colorCode = "\033[35m" - actionName = "Unknown Operation" - } - } - const maxPathLength = 90 var source, destination string @@ -126,3 +108,13 @@ func logger(loggerType string, message string) { ErrorLogger.Println("Unknown logger type:", loggerType) } } + +func loggerWithDots(loggerType string, message string, totalWidth int) { + dotsCount := totalWidth - len(message) - 12 + if dotsCount < 0 { + dotsCount = 0 + } + dots := strings.Repeat(".", dotsCount) + formattedMessage := message + dots + "done" + logger(loggerType, formattedMessage) +} diff --git a/app/main.go b/app/main.go index 8145720..712a6f4 100644 --- a/app/main.go +++ b/app/main.go @@ -6,12 +6,12 @@ import ( "log" "os" "path/filepath" - "strconv" "strings" "sync" "sync/atomic" "time" + "github.com/keybraker/mediarizer-2/duplicate" "github.com/keybraker/mediarizer-2/hash" ) @@ -28,6 +28,7 @@ var ( showHelp *bool verbose *bool showVersion *bool + skipIndex *bool InfoLogger *log.Logger VerboseLogger *log.Logger @@ -35,6 +36,13 @@ var ( ErrorLogger *log.Logger ) +// ExecutionStep represents a step in the execution process with timing information +type ExecutionStep struct { + Name string + Duration time.Duration + Order int +} + func main() { l0 := " __ ___ ___ _ ___ " l1 := " / |/ /__ ___/ (_)__ _____(_)__ ___ ____ |_ |" @@ -42,7 +50,8 @@ func main() { l3 := "/_/ /_/\\__/\\_,_/_/\\_,_/_/ /_//__/\\__/_/ /____/ (v1.0.2)" fmt.Println("\n" + l0 + "\n" + l1 + "\n" + l2 + "\n" + l3 + "\n\n\t\t\t\tby Keybraker\n") - start := time.Now() + 
diff --git a/app/main.go b/app/main.go
index 8145720..712a6f4 100644
--- a/app/main.go
+++ b/app/main.go
@@ -6,12 +6,12 @@ import (
 	"log"
 	"os"
 	"path/filepath"
-	"strconv"
 	"strings"
 	"sync"
 	"sync/atomic"
 	"time"
 
+	"github.com/keybraker/mediarizer-2/duplicate"
 	"github.com/keybraker/mediarizer-2/hash"
 )
 
@@ -28,6 +28,7 @@ var (
 	showHelp    *bool
 	verbose     *bool
 	showVersion *bool
+	skipIndex   *bool
 
 	InfoLogger    *log.Logger
 	VerboseLogger *log.Logger
@@ -35,6 +36,13 @@ var (
 	ErrorLogger *log.Logger
 )
 
+// ExecutionStep represents a step in the execution process with timing information
+type ExecutionStep struct {
+	Name     string
+	Duration time.Duration
+	Order    int
+}
+
 func main() {
 	l0 := " __ ___ ___ _ ___ "
 	l1 := " / |/ /__ ___/ (_)__ _____(_)__ ___ ____ |_ |"
 	l3 := "/_/ /_/\\__/\\_,_/_/\\_,_/_/ /_//__/\\__/_/ /____/ (v1.0.2)"
 	fmt.Println("\n" + l0 + "\n" + l1 + "\n" + l2 + "\n" + l3 + "\n\n\t\t\t\tby Keybraker\n")
 
-	start := time.Now()
+	startTotal := time.Now()
+	executionSteps := []ExecutionStep{}
 
 	flag.Parse()
 	fileTypes := flagProcessor()
@@ -58,42 +67,46 @@ func main() {
 	startLoggerHandlers(&wg, infoQueue, warnQueue, errorQueue)
 
-	logger(LoggerTypeInfo, "Counting files in path.")
+	stepStart := time.Now()
+	loggerWithDots(LoggerTypeInfo, "Counting files in path.", 80)
 	totalFilesToMove := countFiles(sourcePath, fileTypes, *organisePhotos, *organiseVideos)
+	stepDuration := time.Since(stepStart)
+	executionSteps = append(executionSteps, ExecutionStep{Name: "Count source files", Duration: stepDuration, Order: 1})
 
 	if totalFilesToMove == 0 {
-		logger(LoggerTypeInfo, "No files in path, exiting.")
+		loggerWithDots(LoggerTypeInfo, "No files in path, exiting.", 80)
 		return
 	} else {
-		logger(LoggerTypeInfo, fmt.Sprintf("%d files to be processed.", totalFilesToMove))
+		loggerWithDots(LoggerTypeInfo, fmt.Sprintf("%d files to be processed.", totalFilesToMove), 80)
 	}
 
+	stepStart = time.Now()
 	hashCache, err := hash.InitHashCache("")
 	if err != nil {
 		logger(LoggerTypeWarning, fmt.Sprintf("Failed to load hash cache: %v. Using empty cache.", err))
 		hashCache = &sync.Map{}
 	} else {
-		logger(LoggerTypeInfo, "Hash cache loaded successfully.")
+		loggerWithDots(LoggerTypeInfo, "Hash cache loaded successfully.", 80)
 	}
-
-	logger(LoggerTypeInfo, "Creating file hash-map on the destination path.")
-	totalFilesInDestination := countFiles(destinationPath, fileTypes, *organisePhotos, *organiseVideos)
-
-	var hashedFiles int64
-	stopHashSpinner := make(chan bool)
-	go spinner(stopHashSpinner, "Hashing:", &hashedFiles, totalFilesInDestination)
-
-	fileHashMap, err := hash.HashImagesInPath(destinationPath, hashCache, &hashedFiles)
-	if err != nil {
-		stopHashSpinner <- true
-		logger(LoggerTypeInfo, "Failed to create file hash map.")
-		logger(LoggerTypeFatal, err.Error())
+	stepDuration = time.Since(stepStart)
+	executionSteps = append(executionSteps, ExecutionStep{Name: "Load hash cache", Duration: stepDuration, Order: 2})
+
+	// Initialize the duplicate checker.
+	stepStart = time.Now()
+	dupChecker := duplicate.NewDuplicateChecker(destinationPath, hashCache, *skipIndex)
+	var indexedFiles int64
+	if !*skipIndex {
+		stopIndexSpinner := make(chan bool)
+		go spinner(stopIndexSpinner, "Indexing destination:", &indexedFiles, 0)
+		if err := dupChecker.Initialize(&indexedFiles); err != nil {
+			logger(LoggerTypeWarning, fmt.Sprintf("Failed to initialize duplicate checker: %v", err))
+		}
+		stopIndexSpinner <- true
 	}
+	stepDuration = time.Since(stepStart)
+	executionSteps = append(executionSteps, ExecutionStep{Name: "Index destination", Duration: stepDuration, Order: 3})
 
-	stopHashSpinner <- true
-	elapsed := time.Since(start)
-	logger(LoggerTypeInfo, fmt.Sprintf("File hash-map created in %.2f seconds.", elapsed.Seconds()))
-
+	stepStart = time.Now()
 	var processedFiles int64
 
 	stopSpinner := make(chan bool)
@@ -101,6 +114,8 @@ func main() {
 
 	done := make(chan struct{})
 
+	fileHashMap := &sync.Map{}
+
 	go creator(
 		sourcePath,
 		fileQueue,
@@ -111,7 +126,6 @@ func main() {
 		fileTypes,
 		*organisePhotos,
 		*organiseVideos,
-		*duplicateStrategy,
 		fileHashMap,
 		hashCache,
 	)
@@ -123,26 +137,27 @@ func main() {
 		*geoLocation,
 		*format,
 		*verbose,
-		*duplicateStrategy,
 		&processedFiles,
 		done,
+		dupChecker,
 	)
 
 	<-done
 	stopSpinner <- true
+	stepDuration = time.Since(stepStart)
+	executionSteps = append(executionSteps, ExecutionStep{Name: "Process and move files", Duration: stepDuration, Order: 4})
 
-	// Save the hash cache to disk before exiting
+	stepStart = time.Now()
 	if err := hash.SaveHashCache(hashCache, hash.DefaultCacheFilePath); err != nil {
 		logger(LoggerTypeWarning, fmt.Sprintf("Failed to save hash cache: %v", err))
 	} else {
-		logger(LoggerTypeInfo, "Hash cache saved successfully.")
+		loggerWithDots(LoggerTypeInfo, "Hash cache saved successfully.", 80)
 	}
+	stepDuration = time.Since(stepStart)
+	executionSteps = append(executionSteps, ExecutionStep{Name: "Save hash cache", Duration: stepDuration, Order: 5})
 
-	elapsed = time.Since(start)
-	elapsedString := formatElapsedTime(elapsed)
-
-	logger(LoggerTypeInfo, strconv.Itoa(totalFilesToMove)+" files processed.")
-	logger(LoggerTypeInfo, fmt.Sprintf("Processing completed in %s.", elapsedString))
+	totalElapsed := time.Since(startTotal)
+	displayExecutionSummary(totalElapsed, executionSteps, totalFilesToMove)
 }
 
@@ -152,12 +167,29 @@ func formatElapsedTime(elapsed time.Duration) string {
 
 	if minutes > 0 {
 		if minutes == 1 {
-			return fmt.Sprintf("%d minute and %d seconds", minutes, seconds)
+			return fmt.Sprintf("%d min and %d secs", minutes, seconds)
 		}
-		return fmt.Sprintf("%d minutes and %d seconds", minutes, seconds)
+		return fmt.Sprintf("%d mins and %d secs", minutes, seconds)
 	}
 
-	return fmt.Sprintf("%.2f seconds", elapsed.Seconds())
+	return fmt.Sprintf("%.2f secs", elapsed.Seconds())
+}
+
+func displayExecutionSummary(totalElapsed time.Duration, steps []ExecutionStep, filesProcessed int) {
+	fmt.Println("\n" + strings.Repeat("=", 80))
+	fmt.Printf("Total files processed: %d\n", filesProcessed)
+	fmt.Printf("Total execution time: %s\n", formatElapsedTime(totalElapsed))
+	fmt.Println(strings.Repeat("-", 80))
+	fmt.Printf("%-40s | %20s | %14s\n", "Step", "Duration", "Percentage")
+	fmt.Println(strings.Repeat("-", 80))
+
+	for _, step := range steps {
+		percentage := (float64(step.Duration.Milliseconds()) / float64(totalElapsed.Milliseconds())) * 100
+		durationStr := formatElapsedTime(step.Duration)
+		fmt.Printf("%-40s | %20s | %13.2f%%\n", step.Name, durationStr, percentage)
+	}
+
+	fmt.Println(strings.Repeat("=", 80))
 }
 
@@ -170,8 +202,14 @@ func spinner(stopSpinner chan bool, verb string, processedFiles *int64, totalFiles int) {
 			return
 		default:
 			processed := atomic.LoadInt64(processedFiles)
-			percentage := float64(processed) / float64(totalFiles) * 100
-			fmt.Printf("\r%c | %s: %d/%d (%.2f%%)", spinChars[i], verb, processed, totalFiles, percentage)
+			var output string
+			if totalFiles > 0 {
+				percentage := float64(processed) / float64(totalFiles) * 100
+				output = fmt.Sprintf("\r%c | %s: %d/%d (%.2f%%)", spinChars[i], verb, processed, totalFiles, percentage)
+			} else {
+				output = fmt.Sprintf("\r%c | %s: %d files", spinChars[i], verb, processed)
+			}
+			fmt.Print(output)
 			i = (i + 1) % len(spinChars)
 			time.Sleep(100 * time.Millisecond)
 		}
@@ -228,6 +266,7 @@ func init() {
 	showHelp = flag.Bool("help", false, "Display usage guide")
 	verbose = flag.Bool("verbose", false, "Display progress information in console")
 	showVersion = flag.Bool("version", false, "Display version information")
+	skipIndex = flag.Bool("skip-index", false, "Skip indexing destination files (faster startup, but may miss duplicates)")
 
 	InfoLogger = log.New(os.Stdout, "\033[1m\033[34minfo\033[0m:\t", log.Lmsgprefix)
 	VerboseLogger = log.New(os.Stdout, "\033[1m\033[36mverbose\033[0m:\t", log.Ldate|log.Ltime)
@@ -271,7 +310,11 @@ func flagProcessor() []string {
 	}
 
 	if *geoLocation {
-		loadFeatureCollection()
+		fc, err := loadFeatureCollection()
+		if err != nil {
+			logger(LoggerTypeFatal, fmt.Sprintf("failed to load countries.json: %v", err))
+		}
+		featureCollection = fc
 	}
 
 	return fileTypes
diff --git a/app/types.go b/app/types.go
index 6642414..98422d4 100644
--- a/app/types.go
+++ b/app/types.go
@@ -34,6 +34,8 @@ const (
 	JPEG
 	PNG
 	GIF
+	DNG
+	NEF
 )
 
 type VideoType int
diff --git a/docs/README.md b/docs/README.md
index 6225610..76c4a6a 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -3,7 +3,7 @@
 
 Mediarizer2 is a command-line tool for organizing your media files. It allows you to easily sort your photos and videos into folders based on date, location, file type, and other criteria.
 
-> As speed is prioritized instead of copying a file it gets transferred to a different location; this means the input and output folders shall be on the same physical drive in order to assure maximum performance.
+> To maximize speed, files are moved (renamed) instead of copied. For best performance, the input and output directories should be on the same physical drive; moving between different drives requires a copy and is significantly slower.
 
 ## Installation
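Reviewer note: the reworded README claim is the standard behavior of rename-based moves. On Linux, `os.Rename` across filesystems fails with EXDEV, and a mover must fall back to copy-and-delete. The sketch below shows that common idiom in isolation; it is not necessarily what this repo's `moveFile` does, since its move mechanism is not fully shown in the diff.

```go
// A minimal move-with-copy-fallback sketch, assuming the usual cross-device
// failure mode of os.Rename; not necessarily Mediarizer2's own moveFile.
package main

import (
	"fmt"
	"io"
	"os"
)

func moveOrCopy(src, dst string) error {
	if err := os.Rename(src, dst); err == nil {
		return nil // same-device fast path: a rename, no data copied
	}
	// Cross-device (or other rename failure): copy, then remove the source.
	in, err := os.Open(src)
	if err != nil {
		return err
	}
	defer in.Close()

	out, err := os.Create(dst)
	if err != nil {
		return err
	}
	if _, err := io.Copy(out, in); err != nil {
		out.Close()
		return err
	}
	if err := out.Close(); err != nil {
		return err
	}
	return os.Remove(src)
}

func main() {
	if err := moveOrCopy("in/a.jpg", "out/a.jpg"); err != nil {
		fmt.Println("move failed:", err)
	}
}
```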
diff --git a/duplicate/duplicate.go b/duplicate/duplicate.go
index 5b2087a..f67e45a 100644
--- a/duplicate/duplicate.go
+++ b/duplicate/duplicate.go
@@ -4,47 +4,104 @@ import (
 	"fmt"
 	"os"
 	"path/filepath"
-	"strings"
 	"sync"
 
 	"github.com/keybraker/mediarizer-2/hash"
 )
 
-// createDuplicateFolder creates a folder for storing duplicates of the file.
-func CreateDuplicateFolder(destinationPath, duplicateFileName string) (string, error) {
-	ext := filepath.Ext(duplicateFileName)
-	nameWithoutExt := strings.TrimSuffix(duplicateFileName, ext)
-	underscoreExt := strings.ReplaceAll(ext, ".", "_")
-	duplicatesFolder := filepath.Join(filepath.Dir(destinationPath), fmt.Sprintf("%s%s", nameWithoutExt, underscoreExt))
+// DuplicateChecker handles duplicate detection logic
+type DuplicateChecker struct {
+	destIndex *sync.Map // Map[hashString]filePath
+	hashCache *sync.Map
+	destPath  string
+	skipIndex bool
+}
 
-	err := os.MkdirAll(duplicatesFolder, 0755)
+// NewDuplicateChecker creates a new duplicate checker
+func NewDuplicateChecker(destPath string, hashCache *sync.Map, skipIndex bool) *DuplicateChecker {
+	return &DuplicateChecker{
+		destIndex: &sync.Map{},
+		hashCache: hashCache,
+		destPath:  destPath,
+		skipIndex: skipIndex,
+	}
+}
+
+// Initialize builds the initial index of the destination folder
+func (dc *DuplicateChecker) Initialize(progress *int64) error {
+	if dc.skipIndex {
+		return nil
+	}
+
+	fmt.Println("Building destination index...")
+	index, err := hash.BuildDestinationHashIndex(dc.destPath, dc.hashCache, progress)
 	if err != nil {
-		return "", fmt.Errorf("failed to create duplicates folder: %v", err)
+		return fmt.Errorf("failed to build destination index: %v", err)
 	}
+	dc.destIndex = index
+	return nil
+}
 
-	return duplicatesFolder, nil
+// CheckAndTrack checks if a file is a duplicate and tracks it if not.
+// Returns (isDuplicate, originalPath, error).
+func (dc *DuplicateChecker) CheckAndTrack(sourcePath string) (bool, string, error) {
+	// Calculate the hash of the source file.
+	hashStr, err := hash.GetFileHashString(sourcePath, dc.hashCache)
+	if err != nil {
+		return false, "", err
+	}
+
+	// Check whether the hash exists in the index.
+	if existingPath, found := dc.destIndex.Load(hashStr); found {
+		// Verify the file still exists at that path (handle a stale cache).
+		if _, err := os.Stat(existingPath.(string)); err == nil {
+			return true, existingPath.(string), nil
+		}
+		// The indexed file no longer exists: drop it from the index and continue.
+		dc.destIndex.Delete(hashStr)
+	}
+
+	// When indexing was skipped we could lazily check the destination here;
+	// for now a file is treated as unique, and it is still tracked for the
+	// current session so later source files can match against it.
+
+	return false, "", nil
 }
 
-// isDuplicate checks if the file is a duplicate and handles it based on the strategy.
-func IsDuplicate(
-	path string,
-	duplicateStrategy string,
-	fileHashMap *sync.Map,
-	hashCache *sync.Map,
-) (bool, error) {
-	hashValue, err := hash.GetFileHash(path, hashCache)
+// TrackNewFile adds a newly moved file to the index
+func (dc *DuplicateChecker) TrackNewFile(filePath string) error {
+	hashStr, err := hash.GetFileHashString(filePath, dc.hashCache)
 	if err != nil {
-		return false, err
+		return err
 	}
+	dc.destIndex.Store(hashStr, filePath)
+	return nil
+}
+
+// MoveDuplicate moves a duplicate file to the duplicates folder (originalPath is currently unused)
+func MoveDuplicate(sourcePath, originalPath, duplicatesDir string) error {
+	fileName := filepath.Base(sourcePath)
+	destPath := filepath.Join(duplicatesDir, fileName)
 
-	hashStr := string(hashValue)
+	// Ensure a unique filename in the duplicates folder.
+	ext := filepath.Ext(fileName)
+	name := fileName[:len(fileName)-len(ext)]
+	counter := 1
+	for {
+		if _, err := os.Stat(destPath); os.IsNotExist(err) {
+			break
+		}
+		destPath = filepath.Join(duplicatesDir, fmt.Sprintf("%s_%d%s", name, counter, ext))
+		counter++
+	}
 
-	_, exists := fileHashMap.Load(hashStr)
-	if exists {
-		return true, nil
+	if err := os.MkdirAll(filepath.Dir(destPath), 0755); err != nil {
+		return fmt.Errorf("failed to create duplicates directory: %v", err)
 	}
 
-	fileHashMap.Store(hashStr, true)
+	if err := os.Rename(sourcePath, destPath); err != nil {
+		return fmt.Errorf("failed to move duplicate file: %v", err)
+	}
 
-	return false, nil
+	return nil
 }
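Reviewer note: for readers who want the new API in one place, here is the `DuplicateChecker` lifecycle condensed from how `main.go` and `consumer.go` wire it together above. Paths are placeholders and error handling is trimmed for brevity.

```go
// Condensed lifecycle of the new DuplicateChecker; the calls mirror the
// diff above, but the paths here are illustrative placeholders.
package main

import (
	"path/filepath"
	"sync"

	"github.com/keybraker/mediarizer-2/duplicate"
)

func main() {
	hashCache := &sync.Map{}
	dest := "/photos/organized"

	dc := duplicate.NewDuplicateChecker(dest, hashCache, false /* skipIndex */)

	var indexed int64
	_ = dc.Initialize(&indexed) // hash every destination file once, up front

	src := "/photos/incoming/IMG_0001.jpg"
	if isDup, original, _ := dc.CheckAndTrack(src); isDup {
		// Identical content already lives at `original`: park the new copy.
		_ = duplicate.MoveDuplicate(src, original, filepath.Join(dest, "duplicates"))
		return
	}

	// Otherwise move src into place (elided), then record its new location
	// so later source files can match against it.
	_ = dc.TrackNewFile(filepath.Join(dest, "2024/IMG_0001.jpg"))
}
```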
t.Fatalf("CheckAndTrack unique: %v", err) + } + if isDup { + t.Fatalf("expected unique file not to be duplicate") + } + + // Track the unique file (simulating move) + // We need to move it first because TrackNewFile expects file to exist at path? + // Actually TrackNewFile just hashes the file at path and adds to index. + // In real usage, we move then track. + if err := dc.TrackNewFile(uniqueFile); err != nil { + t.Fatalf("TrackNewFile: %v", err) + } + + // Check unique again (should now be duplicate) + isDup, _, err = dc.CheckAndTrack(uniqueFile) + if err != nil { + t.Fatalf("CheckAndTrack unique 2nd time: %v", err) + } + if !isDup { + t.Fatalf("expected tracked file to be detected as duplicate") + } +} + +func TestMoveDuplicate(t *testing.T) { + dir := t.TempDir() + srcFile := filepath.Join(dir, "dup.jpg") + if err := os.WriteFile(srcFile, []byte("dup"), 0644); err != nil { + t.Fatalf("write src: %v", err) + } + + duplicatesDir := filepath.Join(dir, "duplicates") + // duplicatesDir doesn't exist yet + + err := MoveDuplicate(srcFile, "original/path/ignored", duplicatesDir) + if err != nil { + t.Fatalf("MoveDuplicate: %v", err) + } + + destFile := filepath.Join(duplicatesDir, "dup.jpg") + if _, err := os.Stat(destFile); os.IsNotExist(err) { + t.Fatalf("expected file to be moved to %s", destFile) + } + + // Test collision handling + srcFile2 := filepath.Join(dir, "dup.jpg") // Recreate source + if err := os.WriteFile(srcFile2, []byte("dup2"), 0644); err != nil { + t.Fatalf("write src2: %v", err) + } + + err = MoveDuplicate(srcFile2, "original/path/ignored", duplicatesDir) + if err != nil { + t.Fatalf("MoveDuplicate 2: %v", err) + } + + destFile2 := filepath.Join(duplicatesDir, "dup_1.jpg") + if _, err := os.Stat(destFile2); os.IsNotExist(err) { + t.Fatalf("expected file to be renamed to %s", destFile2) + } +} diff --git a/hash/hash.go b/hash/hash.go index a78be56..b8cbf23 100644 --- a/hash/hash.go +++ b/hash/hash.go @@ -27,18 +27,63 @@ type CachedFile struct { Hash []byte `json:"hash"` } +type DirectoryHash struct { + LastScanned time.Time `json:"last_scanned"` + ModTime time.Time `json:"mod_time"` + FileCount int `json:"file_count"` + Files []string `json:"files"` +} + type hashCacheFile struct { - Files map[string]serializedCachedFile `json:"files"` + Files map[string]serializedCachedFile `json:"files"` + Directories map[string]DirectoryHash `json:"directories"` } type serializedCachedFile struct { Size int64 `json:"size"` ModTime time.Time `json:"mod_time"` - Hash string `json:"hash"` // Hex string representation + Hash string `json:"hash"` } const DefaultCacheFilePath = "hash_cache.json" +var ( + supportedImageExts = map[string]bool{ + ".jpg": true, ".jpeg": true, ".png": true, ".gif": true, + ".bmp": true, ".tiff": true, ".dng": true, ".nef": true, + } + supportedVideoExts = map[string]bool{ + ".mp4": true, ".avi": true, ".mov": true, ".mkv": true, + } + skippableDirs = map[string]bool{ + "videos": true, "unknown": true, ".git": true, + ".cache": true, "node_modules": true, "duplicate": true, + } +) + +// IsSupportedMediaFile checks if the file is a supported media type +func IsSupportedMediaFile(filePath string) bool { + ext := strings.ToLower(filepath.Ext(filePath)) + return supportedImageExts[ext] || supportedVideoExts[ext] +} + +// IsImageFile checks if the file is an image +func IsImageFile(filePath string) bool { + ext := strings.ToLower(filepath.Ext(filePath)) + return supportedImageExts[ext] +} + +// IsVideoFile checks if the file is a video +func IsVideoFile(filePath string) 
 type readerAtWrapper struct {
 	readerAt io.ReaderAt
 	offset   int64
@@ -54,15 +99,7 @@ func (r *readerAtWrapper) Read(p []byte) (n int, err error) {
 	return n, err
 }
 
-// isImageFile checks if the file is an image based on its extension.
-func isImageFile(filePath string) bool {
-	lowerFilePath := strings.ToLower(filePath)
-	return strings.HasSuffix(lowerFilePath, ".jpg") || strings.HasSuffix(lowerFilePath, ".jpeg") ||
-		strings.HasSuffix(lowerFilePath, ".png") || strings.HasSuffix(lowerFilePath, ".gif") ||
-		strings.HasSuffix(lowerFilePath, ".bmp") || strings.HasSuffix(lowerFilePath, ".tiff")
-}
-
-// calculateFileHash calculates the SHA-256 hash of the file at the given filePath.
+// calculateFileHash calculates the SHA-256 hash of the file
 func calculateFileHash(filePath string) ([]byte, error) {
 	readerAt, err := mmap.Open(filePath)
 	if err != nil {
@@ -74,23 +111,22 @@ func calculateFileHash(filePath string) ([]byte, error) {
 	if err != nil {
 		return nil, fmt.Errorf("failed to stat file %s: %v", filePath, err)
 	}
-	fileSize := fileInfo.Size()
 
 	reader := &readerAtWrapper{
 		readerAt: readerAt,
 		offset:   0,
-		size:     fileSize,
+		size:     fileInfo.Size(),
 	}
 
-	hash := sha256.New()
-	if _, err := io.Copy(hash, reader); err != nil {
+	h := sha256.New()
+	if _, err := io.Copy(h, reader); err != nil {
 		return nil, fmt.Errorf("failed to calculate hash for file %s: %v", filePath, err)
 	}
 
-	return hash.Sum(nil), nil
+	return h.Sum(nil), nil
 }
 
-// GetFileHash retrieves or calculates the hash of the file at filePath.
+// GetFileHash retrieves or calculates the hash of the file
 func GetFileHash(filePath string, hashCache *sync.Map) ([]byte, error) {
 	info, err := os.Stat(filePath)
 	if err != nil {
@@ -99,9 +135,10 @@ func GetFileHash(filePath string, hashCache *sync.Map) ([]byte, error) {
 	meta := FileMeta{Size: info.Size(), ModTime: info.ModTime()}
 
 	if cached, found := hashCache.Load(filePath); found {
-		cachedFile := cached.(CachedFile)
-		if cachedFile.Size == meta.Size && cachedFile.ModTime.Equal(meta.ModTime) {
-			return cachedFile.Hash, nil
+		if cachedFile, ok := cached.(CachedFile); ok {
+			if cachedFile.Size == meta.Size && cachedFile.ModTime.Equal(meta.ModTime) {
+				return cachedFile.Hash, nil
+			}
 		}
 	}
 
@@ -110,16 +147,28 @@ func GetFileHash(filePath string, hashCache *sync.Map) ([]byte, error) {
 		return nil, err
 	}
 
-	cachedFile := CachedFile{
-		FileMeta: meta,
-		Hash:     hashValue,
+	hashCache.Store(filePath, CachedFile{FileMeta: meta, Hash: hashValue})
+	return hashValue, nil
+}
+
+// GetFileHashString returns the hex-encoded hash string
+func GetFileHashString(filePath string, hashCache *sync.Map) (string, error) {
+	h, err := GetFileHash(filePath, hashCache)
+	if err != nil {
+		return "", err
 	}
-	hashCache.Store(filePath, cachedFile)
+	return hex.EncodeToString(h), nil
+}
 
-	return hashValue, nil
+// InitHashCache initializes the hash cache from file
+func InitHashCache(cachePath string) (*sync.Map, error) {
+	if cachePath == "" {
+		cachePath = DefaultCacheFilePath
+	}
+	return LoadHashCache(cachePath)
 }
 
-// LoadHashCache loads the hash cache from the specified JSON file.
+// LoadHashCache loads the hash cache from the JSON file
 func LoadHashCache(cachePath string) (*sync.Map, error) {
 	hashCache := &sync.Map{}
 
@@ -133,52 +182,54 @@ func LoadHashCache(cachePath string) (*sync.Map, error) {
 	}
 
 	var cacheFile hashCacheFile
-	err = json.Unmarshal(data, &cacheFile)
-	if err != nil {
+	if err := json.Unmarshal(data, &cacheFile); err != nil {
 		return hashCache, fmt.Errorf("failed to unmarshal hash cache: %v", err)
 	}
 
 	for filePath, serialized := range cacheFile.Files {
 		hashBytes, err := hex.DecodeString(serialized.Hash)
 		if err != nil {
-			continue // Skip invalid entries
-		}
-
-		cachedFile := CachedFile{
-			FileMeta: FileMeta{
-				Size:    serialized.Size,
-				ModTime: serialized.ModTime,
-			},
-			Hash: hashBytes,
+			continue
 		}
+		hashCache.Store(filePath, CachedFile{
+			FileMeta: FileMeta{Size: serialized.Size, ModTime: serialized.ModTime},
+			Hash:     hashBytes,
+		})
+	}
 
-		hashCache.Store(filePath, cachedFile)
+	for dirPath, dirHash := range cacheFile.Directories {
+		hashCache.Store("dir:"+dirPath, dirHash)
 	}
 
 	return hashCache, nil
 }
 
-// SaveHashCache saves the hash cache to a JSON file.
+// SaveHashCache saves the hash cache to the JSON file
 func SaveHashCache(hashCache *sync.Map, cachePath string) error {
 	cacheFile := hashCacheFile{
-		Files: make(map[string]serializedCachedFile),
+		Files:       make(map[string]serializedCachedFile),
+		Directories: make(map[string]DirectoryHash),
 	}
 
 	hashCache.Range(func(key, value interface{}) bool {
-		filePath, ok := key.(string)
+		strKey, ok := key.(string)
 		if !ok {
 			return true
 		}
 
-		cachedFile, ok := value.(CachedFile)
-		if !ok {
+		if strings.HasPrefix(strKey, "dir:") {
+			if dirHash, ok := value.(DirectoryHash); ok {
+				cacheFile.Directories[strings.TrimPrefix(strKey, "dir:")] = dirHash
+			}
 			return true
 		}
 
-		cacheFile.Files[filePath] = serializedCachedFile{
-			Size:    cachedFile.Size,
-			ModTime: cachedFile.ModTime,
-			Hash:    hex.EncodeToString(cachedFile.Hash),
+		if cachedFile, ok := value.(CachedFile); ok {
+			cacheFile.Files[strKey] = serializedCachedFile{
+				Size:    cachedFile.Size,
+				ModTime: cachedFile.ModTime,
+				Hash:    hex.EncodeToString(cachedFile.Hash),
+			}
 		}
 
 		return true
 	})
@@ -188,87 +239,91 @@ func SaveHashCache(hashCache *sync.Map, cachePath string) error {
 		return fmt.Errorf("failed to marshal hash cache: %v", err)
 	}
 
-	err = os.WriteFile(cachePath, data, 0644)
-	if err != nil {
-		return fmt.Errorf("failed to write hash cache to file: %v", err)
+	tmpPath := cachePath + ".tmp"
+	if err := os.WriteFile(tmpPath, data, 0644); err != nil {
+		return fmt.Errorf("failed to write hash cache temp file: %v", err)
 	}
 
-	return nil
-}
-
-// InitHashCache initializes the hash cache, loading from file if it exists.
-func InitHashCache(cachePath string) (*sync.Map, error) {
-	if cachePath == "" {
-		cachePath = DefaultCacheFilePath
+	_ = os.Remove(cachePath)
+	if err := os.Rename(tmpPath, cachePath); err != nil {
+		return fmt.Errorf("failed to replace hash cache file: %v", err)
 	}
 
-	return LoadHashCache(cachePath)
+	return nil
 }
 
-// HashImagesInPath hashes all images in the given path and updates the fileHashMap.
-func HashImagesInPath(path string, hashCache *sync.Map, hashedFiles *int64) (*sync.Map, error) {
+// HashFilesInPath hashes files incrementally using a worker pool
+func HashFilesInPath(path string, hashCache *sync.Map, hashedFiles *int64, fileFilter func(string) bool) (*sync.Map, error) {
 	fileHashMap := &sync.Map{}
-	fileChan := make(chan string)
-	errChan := make(chan error)
+	fileChan := make(chan string, 500)
+	errChan := make(chan error, 1)
 
 	var wg sync.WaitGroup
-	numWorkers := runtime.NumCPU() * 4
+	numWorkers := runtime.NumCPU() * 2
+	if numWorkers < 4 {
+		numWorkers = 4
+	}
 
+	// Start the workers.
 	for i := 0; i < numWorkers; i++ {
 		wg.Add(1)
 		go func() {
 			defer wg.Done()
 			for filePath := range fileChan {
-				if isImageFile(filePath) {
-					hashValue, err := GetFileHash(filePath, hashCache)
-					if err != nil {
-						errChan <- fmt.Errorf("failed to get file hash for %s: %v", filePath, err)
-						return
-					}
-
-					hashStr := hex.EncodeToString(hashValue)
-					fileHashMap.Store(hashStr, true)
+				if fileFilter != nil && !fileFilter(filePath) {
+					continue
+				}
+				if !IsSupportedMediaFile(filePath) {
+					continue
+				}
 
-					atomic.AddInt64(hashedFiles, 1)
+				hashStr, err := GetFileHashString(filePath, hashCache)
+				if err != nil {
+					select {
+					case errChan <- fmt.Errorf("failed to hash %s: %v", filePath, err):
+					default:
+					}
+					continue // record the first error but keep draining so the walker is never blocked
 				}
+
+				fileHashMap.Store(hashStr, filePath)
+				atomic.AddInt64(hashedFiles, 1)
 			}
 		}()
 	}
 
+	// Walk the directory tree.
	go func() {
 		defer close(fileChan)
-		err := filepath.Walk(path, func(filePath string, info os.FileInfo, err error) error {
+		_ = filepath.WalkDir(path, func(p string, d os.DirEntry, err error) error {
 			if err != nil {
-				errChan <- fmt.Errorf("failed to walk path %s: %v", filePath, err)
-				return err
+				return nil
 			}
-
-			if !info.IsDir() {
-				fileChan <- filePath
+			if d.IsDir() {
+				if IsSkippableDirectory(d.Name()) {
+					return filepath.SkipDir
+				}
+				return nil
+			}
+			if d.Type().IsRegular() {
+				fileChan <- p
 			}
-
 			return nil
 		})
-
-		if err != nil {
-			errChan <- err
-		}
 	}()
 
-	go func() {
-		wg.Wait()
-		close(errChan)
-	}()
-
-	for err := range errChan {
-		if err != nil {
-			return nil, err
-		}
-	}
+	wg.Wait()
+	close(errChan)
 
-	if err := SaveHashCache(hashCache, DefaultCacheFilePath); err != nil {
-		fmt.Printf("Warning: Failed to save hash cache: %v\n", err)
+	if err := <-errChan; err != nil {
+		return nil, err
 	}
 
 	return fileHashMap, nil
 }
+
+// BuildDestinationHashIndex builds a hash -> path index for destination files.
+// Per-file hashes are served from the cache when size and mtime still match.
+func BuildDestinationHashIndex(destPath string, hashCache *sync.Map, progress *int64) (*sync.Map, error) {
+	return HashFilesInPath(destPath, hashCache, progress, nil)
+}
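Reviewer note: `SaveHashCache` above now writes to a temp file and renames it over the old cache, so a crash mid-write cannot leave a truncated `hash_cache.json` behind. The idiom in isolation (on POSIX the rename atomically replaces the target; the explicit `Remove` mirrors the diff and sidesteps platforms or filesystems where a rename may refuse to overwrite an existing file):

```go
// Write-temp-then-rename in isolation, mirroring SaveHashCache above.
package main

import "os"

func writeFileAtomic(path string, data []byte) error {
	tmp := path + ".tmp"
	if err := os.WriteFile(tmp, data, 0644); err != nil {
		return err
	}
	_ = os.Remove(path) // defensive; see note above about overwrite semantics
	return os.Rename(tmp, path)
}

func main() {
	_ = writeFileAtomic("hash_cache.json", []byte(`{"files":{},"directories":{}}`))
}
```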
diff --git a/hash/hash_test.go b/hash/hash_test.go
new file mode 100644
index 0000000..ff369cd
--- /dev/null
+++ b/hash/hash_test.go
@@ -0,0 +1,155 @@
+package hash
+
+import (
+	"crypto/sha256"
+	"encoding/hex"
+	"os"
+	"path/filepath"
+	"sync"
+	"testing"
+	"time"
+)
+
+func TestGetFileHash_CachesAndInvalidates(t *testing.T) {
+	dir := t.TempDir()
+	p := filepath.Join(dir, "a.jpg")
+
+	if err := os.WriteFile(p, []byte("hello"), 0644); err != nil {
+		t.Fatalf("write file: %v", err)
+	}
+
+	cache := &sync.Map{}
+
+	h1, err := GetFileHash(p, cache)
+	if err != nil {
+		t.Fatalf("GetFileHash #1: %v", err)
+	}
+	h2, err := GetFileHash(p, cache)
+	if err != nil {
+		t.Fatalf("GetFileHash #2: %v", err)
+	}
+	if hex.EncodeToString(h1) != hex.EncodeToString(h2) {
+		t.Fatalf("expected cached hash to match")
+	}
+
+	if err := os.WriteFile(p, []byte("hello2"), 0644); err != nil {
+		t.Fatalf("rewrite file: %v", err)
+	}
+	// Ensure the modtime differs on coarse-grained filesystems.
+	now := time.Now().Add(2 * time.Second)
+	_ = os.Chtimes(p, now, now)
+
+	h3, err := GetFileHash(p, cache)
+	if err != nil {
+		t.Fatalf("GetFileHash #3: %v", err)
+	}
+	if hex.EncodeToString(h1) == hex.EncodeToString(h3) {
+		t.Fatalf("expected hash to change after modification")
+	}
+}
+
+func TestSaveLoadHashCache_RoundTrip(t *testing.T) {
+	dir := t.TempDir()
+	cachePath := filepath.Join(dir, "cache.json")
+
+	filePath := filepath.Join(dir, "b.jpg")
+	if err := os.WriteFile(filePath, []byte("abc"), 0644); err != nil {
+		t.Fatalf("write file: %v", err)
+	}
+
+	info, err := os.Stat(filePath)
+	if err != nil {
+		t.Fatalf("stat: %v", err)
+	}
+	h := sha256.Sum256([]byte("abc"))
+
+	cache := &sync.Map{}
+	cache.Store(filePath, CachedFile{FileMeta: FileMeta{Size: info.Size(), ModTime: info.ModTime()}, Hash: h[:]})
+
+	dirInfo, err := os.Stat(dir)
+	if err != nil {
+		t.Fatalf("stat dir: %v", err)
+	}
+	cache.Store("dir:"+dir, DirectoryHash{LastScanned: time.Now(), ModTime: dirInfo.ModTime(), FileCount: 1, Files: []string{filepath.Base(filePath)}})
+
+	if err := SaveHashCache(cache, cachePath); err != nil {
+		t.Fatalf("SaveHashCache: %v", err)
+	}
+	if _, err := os.Stat(cachePath + ".tmp"); err == nil {
+		t.Fatalf("expected temp file to be removed")
+	}
+
+	loaded, err := LoadHashCache(cachePath)
+	if err != nil {
+		t.Fatalf("LoadHashCache: %v", err)
+	}
+
+	v, ok := loaded.Load(filePath)
+	if !ok {
+		t.Fatalf("expected file entry")
+	}
+	cf := v.(CachedFile)
+	if cf.Size != info.Size() {
+		t.Fatalf("size mismatch")
+	}
+	if !cf.ModTime.Equal(info.ModTime()) {
+		t.Fatalf("modtime mismatch")
+	}
+	if hex.EncodeToString(cf.Hash) != hex.EncodeToString(h[:]) {
+		t.Fatalf("hash mismatch")
+	}
+
+	v2, ok := loaded.Load("dir:" + dir)
+	if !ok {
+		t.Fatalf("expected dir entry")
+	}
+	dh := v2.(DirectoryHash)
+	if dh.FileCount != 1 || len(dh.Files) != 1 {
+		t.Fatalf("expected directory metadata")
+	}
+}
+
+func TestBuildDestinationHashIndex_IndexesFilesWithSeededCache(t *testing.T) {
+	// Run from a temp working directory so any cache file written as a
+	// side effect stays out of the repository.
+	cwd, _ := os.Getwd()
+	defer func() { _ = os.Chdir(cwd) }()
+	_ = os.Chdir(t.TempDir())
+
+	root := t.TempDir()
+	p := filepath.Join(root, "c.jpg")
+	if err := os.WriteFile(p, []byte("payload"), 0644); err != nil {
+		t.Fatalf("write: %v", err)
+	}
+
+	cache := &sync.Map{}
+	hashBytes, err := GetFileHash(p, cache)
+	if err != nil {
+		t.Fatalf("GetFileHash: %v", err)
+	}
+
+	rootInfo, err := os.Stat(root)
+	if err != nil {
+		t.Fatalf("stat root: %v", err)
+	}
+	cache.Store("dir:"+root, DirectoryHash{LastScanned: time.Now(), ModTime: rootInfo.ModTime(), FileCount: 1, Files: []string{filepath.Base(p)}})
+
+	var hashed int64
+	m, err := BuildDestinationHashIndex(root, cache, &hashed)
+	if err != nil {
+		t.Fatalf("BuildDestinationHashIndex: %v", err)
+	}
+	if hashed != 1 {
+		t.Fatalf("expected hashedFiles=1, got %d", hashed)
+	}
+
+	exists := false
+	hashStr := hex.EncodeToString(hashBytes)
+	if val, ok := m.Load(hashStr); ok {
+		if val.(string) == p {
+			exists = true
+		}
+	}
+	if !exists {
+		t.Fatalf("expected returned map to contain hash pointing to file")
+	}
+}
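Reviewer note: taken together, the cache round trip these tests exercise looks like this from a caller's point of view. The calls are the APIs introduced in this diff; the paths are illustrative placeholders.

```go
// End-to-end cache round trip: load (or create) the cache, index a
// destination tree into a hash -> path map, then persist the cache so the
// next run can skip rehashing unchanged files.
package main

import (
	"fmt"

	"github.com/keybraker/mediarizer-2/hash"
)

func main() {
	cache, err := hash.InitHashCache("") // "" falls back to hash_cache.json
	if err != nil {
		fmt.Println("starting with an empty cache:", err)
	}

	var progress int64
	index, err := hash.BuildDestinationHashIndex("/photos/organized", cache, &progress)
	if err != nil {
		panic(err)
	}

	index.Range(func(hashStr, path interface{}) bool {
		fmt.Printf("%s  %s\n", hashStr, path)
		return true
	})
	fmt.Printf("%d files hashed\n", progress)

	if err := hash.SaveHashCache(cache, hash.DefaultCacheFilePath); err != nil {
		fmt.Println("warning: cache not saved:", err)
	}
}
```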