From 1550c33a06dd4592ca7306a4fe48cb23b47afa51 Mon Sep 17 00:00:00 2001
From: shivamsouravjha <2019145@iiitdmj.ac.in>
Date: Tue, 5 Nov 2024 02:15:13 +0530
Subject: [PATCH] fix: update logic for parsing data
Signed-off-by: shivamsouravjha <2019145@iiitdmj.ac.in>
---
services/parse_service.go | 133 +++++++++++++++++++++++++++-----------
1 file changed, 97 insertions(+), 36 deletions(-)
diff --git a/services/parse_service.go b/services/parse_service.go
index c84db3b..fb49045 100644
--- a/services/parse_service.go
+++ b/services/parse_service.go
@@ -12,6 +12,7 @@ import (
mongo_client "stockbackend/clients/mongo"
"strings"
"time"
+ "unicode"
"github.com/cloudinary/cloudinary-go/v2"
"github.com/cloudinary/cloudinary-go/v2/api/admin"
@@ -22,12 +23,8 @@ import (
)
func assert(b bool, mess string) {
- red := "\033[31m"
green := "\033[32m"
reset := "\033[0m"
- if b {
- panic(red + "Assert FAILED: " + mess + reset)
- }
if os.Getenv("DEBUG") == "true" {
fmt.Println(green+"Assert PASSED: ", mess+reset)
}
@@ -109,10 +106,10 @@ func performUploadTask() {
return
}
- portfolioLinks := extractPortfolioLinks(string(body))
+ mfDatas := extractPortfolioLinks(string(body))
- for _, link := range portfolioLinks {
- uploadToCloudinary("https://mf.nipponindiaim.com/" + link)
+ for _, mfData := range mfDatas {
+ uploadToCloudinary("https://mf.nipponindiaim.com/", mfData)
}
log.Println("Monthly upload task completed.")
@@ -132,58 +129,124 @@ func setRequestHeaders(req *http.Request) {
req.Header.Set("Sec-Fetch-User", "?1")
req.Header.Set("Upgrade-Insecure-Requests", "1")
}
+// normalizeWhitespace returns s with every run of consecutive whitespace
+// runes (as classified by unicode.IsSpace) replaced by one ASCII space.
+// Runs at the start or end of s are collapsed to a space too, not removed.
+func normalizeWhitespace(s string) string {
+	var out strings.Builder
+	inRun := false // true while the previous rune was whitespace
+	for _, ch := range s {
+		if !unicode.IsSpace(ch) {
+			out.WriteRune(ch)
+			inRun = false
+			continue
+		}
+		if inRun {
+			// The single space for this run has already been written.
+			continue
+		}
+		out.WriteRune(' ')
+		inRun = true
+	}
+	return out.String()
+}
-func extractPortfolioLinks(htmlContent string) []string {
- assert(len(htmlContent) == 0, "extractPortfolioLinks len(htmlContent) == 0")
+// removeZeroWidthChars returns s without the runes U+200B, U+200C, U+200D
+// and U+FEFF. Every other rune is passed through unchanged.
+func removeZeroWidthChars(s string) string {
+	const zeroWidth = "\u200B\u200C\u200D\uFEFF"
+	dropZeroWidth := func(r rune) rune {
+		if strings.ContainsRune(zeroWidth, r) {
+			// A negative result tells strings.Map to omit the rune.
+			return -1
+		}
+		return r
+	}
+	return strings.Map(dropZeroWidth, s)
+}
- re := regexp.MustCompile(`Monthly portfolio for the month *?]+href="([^"]+)"`)
- matches := re.FindAllStringSubmatch(htmlContent, -1)
+// cleanHTMLContent prepares scraped text for matching: it first drops
+// zero-width characters via removeZeroWidthChars, then collapses each
+// whitespace run to a single space via normalizeWhitespace.
+func cleanHTMLContent(s string) string {
+	return normalizeWhitespace(removeZeroWidthChars(s))
+}
+
+// MFCOLLECTION holds the values captured for one regex match in
+// extractPortfolioLinks.
+type MFCOLLECTION struct {
+	month, year string // month token and year text from the match; year may be empty
+	link        string // href value captured by the match
+}
- assert(len(matches) == 0, "extractPortfolioLinks len(matches) == 0")
+// extractPortfolioLinks scans htmlContent for "Monthly portfolio for the
+// month ..." headings and returns one MFCOLLECTION per regex match, carrying
+// the month token, the year and the captured href value.
+//
+// The input is passed through cleanHTMLContent before matching. When the
+// heading's own year group is empty, the year is taken from the first
+// standalone 4-digit number inside that same match (heading text through the
+// href); it stays empty if the match contains none.
+func extractPortfolioLinks(htmlContent string) []MFCOLLECTION {
+	// Capture groups: 1 = optional day, 2 = month token, 3 = optional year,
+	// 4 = href value.
+	// NOTE(review): `]+href` requires a literal ']' right before "href";
+	// confirm this is intended and not a truncated `<a[^>]+href`.
+	re := regexp.MustCompile(`(?i)Monthly[\s\p{Zs}]+portfolio[\s\p{Zs}]+for[\s\p{Zs}]+the[\s\p{Zs}]+month(?:[\s\p{Zs}]+(?:of|end))?[\s\p{Zs}]*(?:(\d{1,2})(?:st|nd|rd|th)?[\s\p{Zs}]+)?(\w+)[\s\p{Zs}]*(\d{4})?.*?]+href="([^"]+)"`)
+	// Fallback for matches whose year group is empty; compiled once rather
+	// than on every loop iteration.
+	yearRe := regexp.MustCompile(`\b(\d{4})\b`)
+	htmlContent = cleanHTMLContent(htmlContent)
-	var links []string
+	matches := re.FindAllStringSubmatch(htmlContent, -1)
+	fmt.Println("Total Matches Found:", len(matches)) // Debugging: Show total matches found
+
+	var mfDetails []MFCOLLECTION
 	for _, match := range matches {
-		if len(match) > 1 {
-			links = append(links, match[1])
+		if len(match) > 4 {
+			month := match[2]
+			year := match[3]
+			link := match[4]
+
+			if year == "" {
+				// Search only this match's own text. Scanning the whole
+				// document would give every entry the first 4-digit number
+				// on the page, regardless of which heading it belongs to.
+				if yearMatch := yearRe.FindStringSubmatch(match[0]); len(yearMatch) > 1 {
+					year = yearMatch[1]
+				}
+			}
+
+			mfDetails = append(mfDetails, MFCOLLECTION{
+				month: month,
+				year:  year,
+				link:  link,
+			})
 		}
 	}
-
-	assert(len(links) == 0, "extractPortfolioLinks len(links) == 0")
-	return links
+	return mfDetails
 }
-func uploadToCloudinary(fileURL string) {
- assert(len(fileURL) == 0, "uploadToCloudinary len(fileURL) == 0")
-
+func uploadToCloudinary(fileURL string, mfData MFCOLLECTION) {
cld, err := cloudinary.NewFromURL(os.Getenv("CLOUDINARY_URL"))
if err != nil {
log.Println("Error creating Cloudinary instance:", err)
return
}
-
- publicID := extractFileName(fileURL)
-
- resp, err := cld.Upload.Upload(context.Background(), fileURL, uploader.UploadParams{
- PublicID: publicID,
- })
- if err != nil {
- log.Println("Error uploading to Cloudinary:", err)
+ publicID := extractFileName(fileURL + mfData.link)
+ asset, err := cld.Admin.Asset(context.Background(), admin.AssetParams{PublicID: publicID})
+
+ secureUrl := asset.SecureURL
+ if err == nil && asset.PublicID == "" {
+ resp, err := cld.Upload.Upload(context.Background(), fileURL+mfData.link, uploader.UploadParams{
+ PublicID: publicID,
+ })
+ if err != nil {
+ log.Println("Error uploading to Cloudinary:", err)
+ return
+ }
+ secureUrl = resp.SecureURL
+ } else if err != nil {
return
}
- log.Printf("File uploaded successfully: %s\n", resp.SecureURL)
-
month := extractMonth(publicID)
fileUUID := uuid.New().String()
document := bson.M{
"_id": fileUUID,
"month": month,
"completeName": publicID,
- "cloudinaryLink": resp.SecureURL,
+ "cloudinaryLink": secureUrl,
"fund_house": "nippon",
}
-
- collection := mongo_client.Client.Database(os.Getenv("DATABASE_NAME")).Collection(os.Getenv("COLLECTION_NAME"))
+ collection := mongo_client.Client.Database(os.Getenv("DATABASE")).Collection(os.Getenv("MFCOLLECTION"))
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
_, err = collection.InsertOne(ctx, document)
@@ -268,8 +331,6 @@ func checkFileExistence(cld *cloudinary.Cloudinary, publicID string) (bool, erro
}
+// extractFileName returns the last path element of fileURL (path.Base) with
+// the extension reported by path.Ext trimmed from its end.
func extractFileName(fileURL string) string {
- assert(fileURL == "", "extractFileName fileURL == \"\"")
-
fileName := path.Base(fileURL)
return strings.TrimSuffix(fileName, path.Ext(fileName))
}