Skip to content

Commit

Permalink
Get rights information from sherpa/romeo (#6) [minor]
Browse files Browse the repository at this point in the history
  • Loading branch information
joecorall authored Apr 3, 2024
1 parent cdc3c3a commit b08fbac
Show file tree
Hide file tree
Showing 8 changed files with 522 additions and 146 deletions.
117 changes: 16 additions & 101 deletions cmd/doi.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,27 +2,23 @@ package cmd

import (
"bufio"
"crypto/md5"
"encoding/csv"
"encoding/hex"
"encoding/json"
"fmt"
"log"
"os"
"path/filepath"
"regexp"
"strings"

"github.com/lehigh-university-libraries/papercut/internal/utils"
"github.com/lehigh-university-libraries/papercut/pkg/doi"
"github.com/lehigh-university-libraries/papercut/pkg/romeo"
"github.com/spf13/cobra"
)

var (
// used for flags.
filePath string

doiCmd = &cobra.Command{
filePath string
downloadPdfs bool
doiCmd = &cobra.Command{
Use: "doi",
Short: "Get DOI metadata and PDF",
Run: func(cmd *cobra.Command, args []string) {
Expand Down Expand Up @@ -64,19 +60,10 @@ var (
}
for scanner.Scan() {
var doiObject doi.Article
line := strings.TrimSpace(scanner.Text())
dirPath := filepath.Join("dois", line)
dirPath, err = utils.MkTmpDir(dirPath)
doiStr := strings.TrimSpace(scanner.Text())
doiObject, err := doi.GetDoi(doiStr, url)
if err != nil {
log.Printf("Unable to create cached file directory: %v", err)
continue
}

d := filepath.Join(dirPath, "doi.json")
result := getResult(d, url, line, "application/json")
err = json.Unmarshal(result, &doiObject)
if err != nil {
log.Printf("Could not unmarshal JSON for %s: %v", line, err)
log.Println(err)
continue
}

Expand All @@ -90,8 +77,12 @@ var (
identifiers := []string{
fmt.Sprintf(`{"attr0":"doi","value":"%s"}`, doiObject.DOI),
}
fieldRights := ""
for _, i := range doiObject.ISSN {
identifiers = append(identifiers, fmt.Sprintf(`{"attr0":"issn","value":"%s"}`, i))
if fieldRights == "" {
fieldRights = romeo.FindIssnLicense(i)
}
}

partDetail := []string{}
Expand All @@ -111,53 +102,17 @@ var (
extent = fmt.Sprintf(`{"attr0": "page", "number": "%s"}`, doiObject.Page)
}

pdfUrl := ""
pdf := ""
for _, l := range doiObject.Link {
if l.ContentType == "application/pdf" || strings.Contains(strings.ToLower(l.URL), "pdf") {
pdfUrl = l.URL

}
}
if pdfUrl == "" {
d = filepath.Join(dirPath, "doi.html")
result = getResult(d, url, line, "text/html")
pattern := `<meta name="citation_pdf_url" content="([^"]+)".*>`
re := regexp.MustCompile(pattern)
matches := re.FindAllSubmatch(result, -1)
var pdfURLs []string
for _, match := range matches {
if len(match) >= 2 {
log.Println(string(match[1]))
pdfURLs = append(pdfURLs, string(match[1]))
}
}
for _, url := range pdfURLs {
pdfUrl = url
break
}
}
if pdfUrl != "" {
hash := md5.Sum([]byte(line))
hashStr := hex.EncodeToString(hash[:])

pdf = fmt.Sprintf("papers/dois/%s.pdf", hashStr)
err = utils.DownloadPdf(pdfUrl, pdf)
if err != nil {
err = os.Remove(pdf)
if err != nil {
log.Println("Error deleting file:", err)
}
pdf = pdfUrl
}
if downloadPdfs {
pdf = doiObject.DownloadPdf()
}

fullTitle := ""
if len(doiObject.Title) > 255 {
fullTitle = doiObject.Title
}
err = wr.Write([]string{
line,
doiStr,
doi.JoinDate(doiObject.Issued),
utils.TrimToMaxLen(doiObject.Title, 255),
fullTitle,
Expand All @@ -169,7 +124,7 @@ var (
strings.Join(relatedItem, "|"),
extent,
doiObject.Language,
"",
fieldRights,
strings.Join(doiObject.Subject, "|"),
pdf,
})
Expand All @@ -192,45 +147,5 @@ func init() {

doiCmd.Flags().StringP("url", "u", "https://dx.doi.org", "The DOI API url")
doiCmd.Flags().StringVarP(&filePath, "file", "f", "", "path to file containing one DOI per line")
}

func getResult(d, url, line, acceptContentType string) []byte {
var err error

// see if we can just get the cached file
if _, err := os.Stat(d); err == nil {
content, err := os.ReadFile(d)
if err != nil {
fmt.Println("Error reading cached file:", err)
} else {
var a doi.Affiliation
err = json.Unmarshal(content, &a)
if err == nil || acceptContentType == "text/html" {
return content
}
log.Println("Error unmarshalling cached file:", err)
}
}

apiURL := fmt.Sprintf("%s/%s", url, line)

log.Printf("Accessing %s\n", apiURL)

doiObject, err := doi.GetObject(apiURL, acceptContentType)
if err != nil {
log.Fatal(err)
}
cacheFile, err := os.Create(d)
if err != nil {
fmt.Println("Error creating file:", err)
return nil
}
defer cacheFile.Close()

_, err = cacheFile.WriteString(string(doiObject))
if err != nil {
fmt.Println("Error caching DOI JSON:", err)
}

return doiObject
doiCmd.Flags().BoolVarP(&downloadPdfs, "download-pdfs", "d", true, "whether to download the PDFs")
}
85 changes: 85 additions & 0 deletions cmd/license.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
package cmd

import (
"bufio"
"encoding/csv"
"fmt"
"log"
"os"
"strings"

"github.com/lehigh-university-libraries/papercut/pkg/doi"
"github.com/lehigh-university-libraries/papercut/pkg/romeo"
"github.com/spf13/cobra"
)

var (
	// licenseFilePath holds the --file flag value: path to a file with one DOI per line.
	licenseFilePath string

	// licenseCmd reads DOIs from a file and emits a CSV (id, field_rights) on
	// stdout, resolving each DOI's license through Sherpa Romeo ISSN lookups.
	licenseCmd = &cobra.Command{
		Use:   "license",
		Short: "Get license for a DOI",
		Run: func(cmd *cobra.Command, args []string) {
			f, err := os.Open(licenseFilePath)
			if err != nil {
				fmt.Println("Error opening file:", err)
				return
			}
			defer f.Close()

			// Read the DOI list one line at a time.
			lines := bufio.NewScanner(f)
			url, err := cmd.Flags().GetString("url")
			if err != nil {
				log.Fatal(err)
			}
			out := csv.NewWriter(os.Stdout)

			// CSV header
			if err = out.Write([]string{"id", "field_rights"}); err != nil {
				log.Fatalf("Unable to write to CSV: %v", err)
			}
			for lines.Scan() {
				id := strings.TrimSpace(lines.Text())
				article, err := doi.GetDoi(id, url)
				if err != nil {
					log.Println(err)
					continue
				}

				// First ISSN that resolves to a license wins.
				license := ""
				for _, issn := range article.ISSN {
					if license = romeo.FindIssnLicense(issn); license != "" {
						break
					}
				}

				if err = out.Write([]string{id, license}); err != nil {
					log.Fatalf("Unable to write to CSV: %v", err)
				}
				// Flush per row so output streams as DOIs are processed.
				out.Flush()
			}

			if err := lines.Err(); err != nil {
				fmt.Println("Error scanning file:", err)
				return
			}
		},
	}
)

// init registers the license subcommand on the parent get command and
// declares its flags (DOI API base URL and the input file of DOIs).
func init() {
	getCmd.AddCommand(licenseCmd)

	licenseCmd.Flags().StringP("url", "u", "https://dx.doi.org", "The DOI API url")
	licenseCmd.Flags().StringVarP(&licenseFilePath, "file", "f", "", "path to file containing one DOI per line")
}
93 changes: 92 additions & 1 deletion internal/utils/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,6 @@ func DownloadPdf(url, filePath string) error {
}

if _, err := os.Stat(filePath); os.IsNotExist(err) {

file, err := os.Create(filePath)
if err != nil {
fmt.Println("Error creating file:", err)
Expand Down Expand Up @@ -120,3 +119,95 @@ func DownloadPdf(url, filePath string) error {

return nil
}

// StrInSlice reports whether s is present in sl.
func StrInSlice(s string, sl []string) bool {
	for i := range sl {
		if sl[i] == s {
			return true
		}
	}
	return false
}

// GetResult returns the response body for url, using the file at d as a
// cache. A readable cached copy is returned directly; otherwise the URL is
// fetched with the given Accept header, written to the cache at d, and
// returned. Returns nil when the fetch fails (the failure is logged).
func GetResult(d, url, acceptContentType string) []byte {
	// Serve from the on-disk cache when possible to avoid re-hitting the API.
	if content := CheckCachedFile(d); content != nil {
		return content
	}

	log.Printf("Accessing %s\n", url)

	r, err := getResult(url, acceptContentType)
	if err != nil {
		log.Printf("Failed to get %s: %v", url, err)
		return nil
	}
	WriteCachedFile(d, string(r))

	return r
}

func CheckCachedFile(d string) []byte {
// see if we can just get the cached file
if _, err := os.Stat(d); err == nil {
content, err := os.ReadFile(d)
if err != nil {
log.Println("Error reading cached file:", err)
return nil
}
return content
}
return nil
}

func WriteCachedFile(f, c string) {
cacheFile, err := os.Create(f)
if err != nil {
fmt.Println("Error creating file:", err)
return
}
defer cacheFile.Close()

_, err = cacheFile.WriteString(c)
if err != nil {
log.Println("Error caching DOI JSON:", err)
}
}

func getResult(url, acceptContentType string) ([]byte, error) {
client := &http.Client{
Transport: &http.Transport{
Proxy: http.ProxyFromEnvironment,
},
}

req, err := http.NewRequest("GET", url, nil)
if err != nil {
fmt.Println("Error creating request:", err)
return nil, err
}

req.Header.Set("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36")
req.Header.Set("Accept", acceptContentType)
req.Header.Set("Accept-Language", "en-US")
req.Header.Set("Connection", "keep-alive")
req.Header.Set("Cache-Control", "no-cache")

resp, err := client.Do(req)
if err != nil {
return nil, err
}
defer resp.Body.Close()

if resp.StatusCode > 299 {
return nil, fmt.Errorf("%s returned a non-200 status code: %d", url, resp.StatusCode)
}

body, err := io.ReadAll(resp.Body)
if err != nil {
return nil, err
}

return body, nil
}
24 changes: 24 additions & 0 deletions internal/utils/helpers_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -122,3 +122,27 @@ func TestTrimToMaxLen(t *testing.T) {
t.Errorf("TrimToMaxLen(%q, 20) = %q; want %q", inputExact, resultExact, expectedExact)
}
}

// TestStrInSlice exercises StrInSlice across present, absent, empty-slice,
// empty-string, and duplicate-element inputs.
func TestStrInSlice(t *testing.T) {
	cases := []struct {
		name     string
		needle   string
		haystack []string
		want     bool
	}{
		{"StringInSlice", "hello", []string{"hello", "world", "foo", "bar"}, true},
		{"StringNotInSlice", "goodbye", []string{"hello", "world", "foo", "bar"}, false},
		{"EmptySlice", "foo", []string{}, false},
		{"EmptyString", "", []string{"hello", "world", "foo", "bar"}, false},
		{"StringInSliceMultipleTimes", "foo", []string{"hello", "world", "foo", "bar", "foo"}, true},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			if got := StrInSlice(tc.needle, tc.haystack); got != tc.want {
				t.Errorf("Expected StrInSlice(%q, %v) to be %v, but got %v", tc.needle, tc.haystack, tc.want, got)
			}
		})
	}
}
Loading

0 comments on commit b08fbac

Please sign in to comment.