From 1550c33a06dd4592ca7306a4fe48cb23b47afa51 Mon Sep 17 00:00:00 2001 From: shivamsouravjha <2019145@iiitdmj.ac.in> Date: Tue, 5 Nov 2024 02:15:13 +0530 Subject: [PATCH] fix: update logic for parsing data Signed-off-by: shivamsouravjha <2019145@iiitdmj.ac.in> --- services/parse_service.go | 133 +++++++++++++++++++++++++++----------- 1 file changed, 97 insertions(+), 36 deletions(-) diff --git a/services/parse_service.go b/services/parse_service.go index c84db3b..fb49045 100644 --- a/services/parse_service.go +++ b/services/parse_service.go @@ -12,6 +12,7 @@ import ( mongo_client "stockbackend/clients/mongo" "strings" "time" + "unicode" "github.com/cloudinary/cloudinary-go/v2" "github.com/cloudinary/cloudinary-go/v2/api/admin" @@ -22,12 +23,8 @@ import ( ) func assert(b bool, mess string) { - red := "\033[31m" green := "\033[32m" reset := "\033[0m" - if b { - panic(red + "Assert FAILED: " + mess + reset) - } if os.Getenv("DEBUG") == "true" { fmt.Println(green+"Assert PASSED: ", mess+reset) } @@ -109,10 +106,10 @@ func performUploadTask() { return } - portfolioLinks := extractPortfolioLinks(string(body)) + mfDatas := extractPortfolioLinks(string(body)) - for _, link := range portfolioLinks { - uploadToCloudinary("https://mf.nipponindiaim.com/" + link) + for _, mfData := range mfDatas { + uploadToCloudinary("https://mf.nipponindiaim.com/", mfData) } log.Println("Monthly upload task completed.") @@ -132,58 +129,124 @@ func setRequestHeaders(req *http.Request) { req.Header.Set("Sec-Fetch-User", "?1") req.Header.Set("Upgrade-Insecure-Requests", "1") } +func normalizeWhitespace(s string) string { + var b strings.Builder + prevIsSpace := false + for _, r := range s { + if unicode.IsSpace(r) { + if !prevIsSpace { + b.WriteRune(' ') + prevIsSpace = true + } + } else { + b.WriteRune(r) + prevIsSpace = false + } + } + return b.String() +} -func extractPortfolioLinks(htmlContent string) []string { - assert(len(htmlContent) == 0, "extractPortfolioLinks len(htmlContent) == 0") +func removeZeroWidthChars(s string) string { + return strings.Map(func(r rune) rune { + switch r { + case '\u200B', '\u200C', '\u200D', '\uFEFF': + // Exclude zero-width characters + return -1 + default: + // Include other characters + return r + } + }, s) +} - re := regexp.MustCompile(`Monthly portfolio for the month *?]+href="([^"]+)"`) - matches := re.FindAllStringSubmatch(htmlContent, -1) +func cleanHTMLContent(s string) string { + s = removeZeroWidthChars(s) + s = normalizeWhitespace(s) + return s +} + +type MFCOLLECTION struct { + month string + year string + link string +} - assert(len(matches) == 0, "extractPortfolioLinks len(matches) == 0") +func extractPortfolioLinks(htmlContent string) []MFCOLLECTION { + // Updated regex pattern to handle various formats + re := regexp.MustCompile(`(?i)Monthly[\s\p{Zs}]+portfolio[\s\p{Zs}]+for[\s\p{Zs}]+the[\s\p{Zs}]+month(?:[\s\p{Zs}]+(?:of|end))?[\s\p{Zs}]*(?:(\d{1,2})(?:st|nd|rd|th)?[\s\p{Zs}]+)?(\w+)[\s\p{Zs}]*(\d{4})?.*?]+href="([^"]+)"`) + htmlContent = cleanHTMLContent(htmlContent) - var links []string + matches := re.FindAllStringSubmatch(htmlContent, -1) + fmt.Println("Total Matches Found:", len(matches)) // Debugging: Show total matches found + + var mfDetails []MFCOLLECTION for _, match := range matches { - if len(match) > 1 { - links = append(links, match[1]) + if len(match) > 4 { + // entireText := match[0] // Entire matched text + + // Extract day, month, year, and link + month := match[2] // Month + year := match[3] // Optional year + link := match[4] // Extracted link + + // If year is missing in match[3], try to extract it from the following content + if year == "" { + // Attempt to find a 4-digit year after the month + yearRe := regexp.MustCompile(`\b(\d{4})\b`) + yearMatch := yearRe.FindStringSubmatch(htmlContent) + if len(yearMatch) > 1 { + year = yearMatch[1] + } + } + + // Append the link + mfDetails = append(mfDetails, MFCOLLECTION{ + month: month, + year: year, + link: link, + }) + // fmt.Println("Entire matched text:", entireText) + // fmt.Println("Month:", month) // Print extracted month + // fmt.Println("Year:", year) // Print extracted year + // fmt.Println("Link:", link) // Print the link } } - - assert(len(links) == 0, "extractPortfolioLinks len(links) == 0") - return links + return mfDetails } -func uploadToCloudinary(fileURL string) { - assert(len(fileURL) == 0, "uploadToCloudinary len(fileURL) == 0") - +func uploadToCloudinary(fileURL string, mfData MFCOLLECTION) { cld, err := cloudinary.NewFromURL(os.Getenv("CLOUDINARY_URL")) if err != nil { log.Println("Error creating Cloudinary instance:", err) return } - - publicID := extractFileName(fileURL) - - resp, err := cld.Upload.Upload(context.Background(), fileURL, uploader.UploadParams{ - PublicID: publicID, - }) - if err != nil { - log.Println("Error uploading to Cloudinary:", err) + publicID := extractFileName(fileURL + mfData.link) + asset, err := cld.Admin.Asset(context.Background(), admin.AssetParams{PublicID: publicID}) + + secureUrl := asset.SecureURL + if err == nil && asset.PublicID == "" { + resp, err := cld.Upload.Upload(context.Background(), fileURL+mfData.link, uploader.UploadParams{ + PublicID: publicID, + }) + if err != nil { + log.Println("Error uploading to Cloudinary:", err) + return + } + secureUrl = resp.SecureURL + } else if err != nil { return } - log.Printf("File uploaded successfully: %s\n", resp.SecureURL) - month := extractMonth(publicID) fileUUID := uuid.New().String() document := bson.M{ "_id": fileUUID, "month": month, "completeName": publicID, - "cloudinaryLink": resp.SecureURL, + "cloudinaryLink": secureUrl, "fund_house": "nippon", } - - collection := mongo_client.Client.Database(os.Getenv("DATABASE_NAME")).Collection(os.Getenv("COLLECTION_NAME")) + collection := mongo_client.Client.Database(os.Getenv("DATABASE")).Collection(os.Getenv("MFCOLLECTION")) ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) defer cancel() _, err = collection.InsertOne(ctx, document) @@ -268,8 +331,6 @@ func checkFileExistence(cld *cloudinary.Cloudinary, publicID string) (bool, erro } func extractFileName(fileURL string) string { - assert(fileURL == "", "extractFileName fileURL == \"\"") - fileName := path.Base(fileURL) return strings.TrimSuffix(fileName, path.Ext(fileName)) }