i #284 Updated documentation and modified function for download_piper…

…mail() - Modified helix.yml to use [[“mailing_list”]][[“pipermail”]][[“project_key_1”]] - Added project_key_2 to helix.yml - Created /vignettes/download_mail.Rmd to document information about pipermail downloader - Made function calls explicit for external libraries - ISSUE: Build -> Check is not passing. Seems to be having issues with utags_path, even though I changed the path to the one for universal-ctags in tools.yml
sailuh · Sep 17, 2024 · 69ca163 · carlosparadis · Sep 17, 2024 · 69ca163
1 parent 7c585ae
commit 69ca163
Show file tree

Hide file tree

Showing 4 changed files with 126 additions and 22 deletions.
diff --git a/R/mail.R b/R/mail.R
@@ -20,42 +20,56 @@ download_pipermail <- function(mailing_list, start_year_month, end_year_month, s
     dir.create(save_folder_path, recursive = TRUE)
   }
 
+  # Ensure mailing_list URL ends with a slash
+  if (!stringi::stri_endswith_fixed(mailing_list, "/")) {
+    mailing_list <- paste0(mailing_list, "/")
+  }
+
   # Get mailing list contents
-  response <- GET(mailing_list)
+  response <- httr::GET(mailing_list)
 
   # Parse the response
-  parsed_response <- content(response, "text")
-  doc_obj <- htmlParse(parsed_response, asText = TRUE)
+  parsed_response <- httr::content(response, "text")
+  doc_obj <- XML::htmlParse(parsed_response, asText = TRUE)
 
   # Table rows
-  rows <- getNodeSet(doc_obj, "//tr")
+  rows <- XML::getNodeSet(doc_obj, "//tr")
 
   # Skip header row
   data_rows <- rows[-1]
 
   # Vector for link storage
-  links = c()
+  links <- c()
 
   # Extract the date and link from each row
   for (row in data_rows) {
     # Date in YYYYMM format
-    date_extracted <- xpathSApply(row, ".//td[1]", xmlValue)
-    date_cleaned <- stri_replace_last_regex(date_extracted, pattern = ":$", replacement = "")
-    date_cleaned <- stri_trim_both(date_cleaned)
+    date_extracted <- XML::xpathSApply(row, ".//td[1]", XML::xmlValue)
+    date_cleaned <- stringi::stri_replace_last_regex(date_extracted, pattern = ":$", replacement = "")
+    date_cleaned <- stringi::stri_trim_both(date_cleaned)
     # Parse the date
     # Add 01 as dummy to make it a valid date
     date_parsed <- as.Date(paste0("01 ", date_cleaned), format = "%d %B %Y")
+    if (is.na(date_parsed)) {
+      warning("Date could not be parsed: ", date_cleaned)
+      next
+    }
     year_month <- format(date_parsed, "%Y%m")
 
     # Check if date is within range
     if (year_month >= start_year_month & year_month <= end_year_month) {
-      # get href from column 3
-      link_nodes <- xpathSApply(row, ".//td[3]/a", xmlGetAttr, 'href')
+      # Get href from column 3
+      link_nodes <- XML::xpathSApply(row, ".//td[3]/a", XML::xmlGetAttr, 'href')
+      if (length(link_nodes) == 0) {
+        warning("No link found in row for date: ", date_cleaned)
+        next
+      }
       # Store the link in links
       link <- link_nodes[1]
       links <- c(links, link)
     }
   }
+
   # Vector for downloaded files
   downloaded_files <- c()
   for (i in seq_along(links)) {
@@ -66,6 +80,10 @@ download_pipermail <- function(mailing_list, start_year_month, end_year_month, s
 
     # Parse the date from the base name
     date_parsed <- as.Date(paste0("01-", base_name), format = "%d-%Y-%B")
+    if (is.na(date_parsed)) {
+      warning("Could not parse date from link: ", link)
+      next
+    }
     year_month_clean <- format(date_parsed, "%Y%m")
 
     # Download URL
@@ -78,7 +96,7 @@ download_pipermail <- function(mailing_list, start_year_month, end_year_month, s
 
     # Download the gz mbox file
     cat("Downloading:", download_url, "\n")
-    GET(download_url, write_disk(dest_gz, overwrite = TRUE))
+    httr::GET(download_url, httr::write_disk(dest_gz, overwrite = TRUE))
 
     # Unzip the file
     gz_con <- gzfile(dest_gz, open = "rb")
@@ -100,10 +118,10 @@ download_pipermail <- function(mailing_list, start_year_month, end_year_month, s
 
   # Return downloaded files
   return(downloaded_files)
-
 }
 
 
+
 #' Convert pipermail archive files (.txt and .txt.gz) into an mbox format for use with \code{\link{parse_mbox}}
 #' @param filelist A vector of pipermail archive files from \code{\link{download_pipermail}}
 #' @return Returns `output`, the name of the resulting .mbox file in the current working directory
@@ -417,10 +435,10 @@ refresh_pipermail <- function(archive_url, mailing_list, archive_type, save_fold
     if (verbose) {
       message("The folder is empty. Downloading all pipermail files. \n")
     }
-    download_pipermail(archive_url = archive_url,
-                                mailing_list = mailing_list,
-                                archive_type = archive_type,
-                                save_folder_path = save_folder_path)
+    download_pipermail(mailing_list = mailing_list,
+                       start_year_month = start_year_month,
+                       end_year_month = end_year_month,
+                       save_folder_path = save_folder_path)
   } else {
     latest_file_name <- parse_mbox_latest_date(save_folder_path)
     extracted_year_month <- sub("[^_]*_[^_]*_", "", sub(".mbox", "", latest_file_name))

diff --git a/conf/helix.yml b/conf/helix.yml
@@ -59,14 +59,19 @@ mailing_list:
       mbox: ../../rawdata/helix/mod_mbox/helix-user/
       mailing_list: helix-user
       archive_type: apache
-    # Using for testing R/mail.R/pipermail_downloader()
-    pipermail_key:
-      archive_url: https://mta.openssl.org/mailman/listinfo/
+  pipermail:
+    project_key_1:
+      # archive_url: https://mta.openssl.org/mailman/listinfo/
       mailing_list: https://mta.openssl.org/pipermail/openssl-users/
-      # archive_type
       start_year_month: 202310
       end_year_month: 202405
-      save_folder_path: "save_folder_mail"
+      save_folder_path: "../save_folder_mail"
+    project_key_2:
+      # archive_url: https://mta.openssl.org/mailman/listinfo/
+      mailing_list: https://mta.openssl.org/pipermail/openssl-project/
+      start_year_month: 201903
+      end_year_month: 202103
+      save_folder_path: "../save_folder_mail_2"
 
 issue_tracker:
   jira:

diff --git a/tools.yml b/tools.yml
@@ -7,7 +7,7 @@ refactoring_miner: ~/RefactoringMiner-1.0/bin/RefactoringMiner
 # https://github.com/boyter/scc
 scc: ~/scc/scc
 # universal-ctags
-utags: /usr/local/Cellar/universal-ctags/HEAD-62f0144/bin/ctags
+utags: /usr/local/Cellar/universal-ctags/p6.1.20240901.0/bin/ctags
 # https://archdia.com/
 dv8: /Applications/DV84/bin/dv8-console
 # OSLOM: http://oslom.org/

diff --git a/vignettes/download_mail.Rmd b/vignettes/download_mail.Rmd
@@ -0,0 +1,81 @@
+---
+title: "Download Mod Mbox and Pipermail Mailing List Archives"
+output: 
+  html_document:
+    toc: true
+    number_sections: true
+vignette: >
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteIndexEntry{Download Mod Mbox and Pipermail Mailing List Archives}
+  %\VignetteEncoding{UTF-8}
+---
+
+
+```{r}
+rm(list = ls())
+seed <- 1
+set.seed(seed)
+
+# Load libraries
+  library(kaiaulu)
+  library(data.table)
+  library(yaml)
+  library(stringi)
+  library(XML)
+  library(httr)
+```
+
+
+# Introduction
+
+Mailing list data is stored in a variety of archives. See:
+- Mod Mbox: [Apache Geronimo](https://geronimo.apache.org/mailing-lists.html).
+- Pipermail: [OpenSSL](https://mta.openssl.org/mailman/listinfo/).
+This notebook demonstrates how to download and refresh mailing list archives from Mod Mbox and Pipermail.
+
+## Mailing List Organization
+
+Mailing lists are typically organized by topic or purpose. For example, the [OpenSSL project](https://www.openssl.org/community/mailinglists.html) maintains several mailing lists, each serving a different group:
+
+- **openssl-announce**: For important announcements.
+- **openssl-commits**: For commit messages.
+- **openssl-project**: For project discussions.
+- **openssl-users**: For general user questions and discussions.
+
+Each mailing list maintains archives of past messages, often organized by month and year. These archives can be accessed and downloaded for analysis.
+
+# Project Configuration File 
+
+To start, we load the project configuration file, which contains parameters for downloading the mailing list archives.
+
+<!-- # Project Configuration File -->
+
+```{r}
+conf <- yaml::read_yaml("conf/helix.yml")
+mailing_list <- conf[["mailing_list"]][["pipermail"]][["project_key_1"]][["mailing_list"]]
+start_year_month <- conf[["mailing_list"]][["pipermail"]][["project_key_1"]][["start_year_month"]]
+end_year_month <- conf[["mailing_list"]][["pipermail"]][["project_key_1"]][["end_year_month"]]
+save_folder_path <- conf[["mailing_list"]][["pipermail"]][["project_key_1"]][["save_folder_path"]]
+```
+
+## Explanation of Configuration Parameters
+- mailing_list: The URL of the mailing list archive index page (e.g., https://mta.openssl.org/pipermail/openssl-users/).
+- start_year_month: The starting date for downloading archives (in YYYYMM format).
+- end_year_month: The ending date for downloading archives (in YYYYMM format).
+- save_folder_path: The local directory where the downloaded archives will be saved.
+
+
+# Pipermail Downloader
+
+```{r}
+# Download archives
+download_pipermail(
+  mailing_list = mailing_list,
+  start_year_month = start_year_month,
+  end_year_month = end_year_month,
+  save_folder_path = save_folder_path
+)
+
+```
+After running this function, the .mbox files will be saved in the specified directory with filenames like kaiaulu_202310.mbox, kaiaulu_202311.mbox, etc.
+