Commit 20678a4

Merge branch 'master' into feat/test-framework

2 parents 3173a66 + cee1b6f

18 files changed: +399, -198 lines changed

server/preprocessing/other-scripts/metrics.R

Lines changed: 49 additions & 34 deletions
@@ -5,9 +5,50 @@ library("plyr")
 mlog <- getLogger("metrics")
 
 
-enrich_metadata_metrics <- function(metadata) {
+enrich_metadata_metrics <- function(metadata, metrics_sources=c("altmetric", "crossref")) {
   start.time <- Sys.time()
 
+  original_sorting <- metadata$id
+
+  if ("altmetric" %in% metrics_sources) {
+    metadata <- add_altmetrics(metadata)
+  }
+  if ("crossref" %in% metrics_sources) {
+    metadata <- add_citations(metadata)
+  }
+
+  # Remove duplicate lines - TODO: check for root of this problem
+  metadata <- unique(metadata)
+
+  # restore original sorting
+  metadata <- metadata[match(original_sorting, metadata$id), ]
+
+  end.time <- Sys.time()
+  time.taken <- end.time - start.time
+  mlog$info(paste("vis_id:", .GlobalEnv$VIS_ID, "Time taken:", time.taken, sep = " "))
+
+  return(metadata)
+}
+
+get_altmetrics <- function(dois) {
+  valid_dois <- unique(dois[which(dois != "")])
+  results <- data.frame()
+  for (doi in valid_dois) {
+    tryCatch(
+      {
+        metrics <- altmetric_data(altmetrics(doi = doi, apikey = ""))
+        results <- rbind.fill(results, metrics)
+      },
+      error = function(err) {
+        mlog$debug(gsub("[\r\n]", "", paste(err, doi, sep = " ")))
+      }
+    )
+  }
+  return(results)
+}
+
+add_altmetrics <- function(metadata) {
   results <- get_altmetrics(metadata$doi)
   requested_metrics <- c(
     "cited_by_wikipedia_count",
@@ -36,45 +77,19 @@ enrich_metadata_metrics <- function(metadata) {
     # merge the metadata with the results of the altmetrics
     # don't remove any rows from the metadata, just add the altmetrics to the
     # output
-    output <- merge(x = metadata, y = results, by = "doi", all.x = TRUE, all.y = FALSE)
+    result <- merge(x = metadata, y = results, by = "doi", all.x = TRUE, all.y = FALSE)
   } else {
     for (metric in requested_metrics) {
       metadata[[metric]] <- NA
     }
     mlog$info("No altmetrics found for any paper in this dataset.")
-    output <- metadata
+    result <- metadata
   }
-  output <- add_citations(output)
-
-  # Remove duplicate lines - TODO: check for root of this problem
-  output <- unique(output)
-
-  end.time <- Sys.time()
-  time.taken <- end.time - start.time
-  mlog$info(paste("vis_id:", .GlobalEnv$VIS_ID, "Time taken:", time.taken, sep = " "))
-
-  return(output)
-}
-
-get_altmetrics <- function(dois) {
-  valid_dois <- unique(dois[which(dois != "")])
-  results <- data.frame()
-  for (doi in valid_dois) {
-    tryCatch(
-      {
-        metrics <- altmetric_data(altmetrics(doi = doi, apikey = ""))
-        results <- rbind.fill(results, metrics)
-      },
-      error = function(err) {
-        mlog$debug(gsub("[\r\n]", "", paste(err, doi, sep = " ")))
-      }
-    )
-  }
-  return(results)
+  return(result)
 }
 
-add_citations <- function(output) {
-  dois <- output$doi
+add_citations <- function(metadata) {
+  dois <- metadata$doi
   valid_dois <- unique(dois[which(dois != "")])
 
   cc <- tryCatch(
@@ -87,6 +102,6 @@ add_citations <- function(output) {
     }
   )
   names(cc)[names(cc) == "count"] <- "citation_count"
-  output <- merge(x = output, y = cc, by = "doi", all.x = TRUE)
-  return(output)
+  result <- merge(x = metadata, y = cc, by = "doi", all.x = TRUE)
+  return(result)
 }

server/preprocessing/other-scripts/run_metrics.R

Lines changed: 7 additions & 1 deletion
@@ -47,6 +47,12 @@ if (!is.null(params$lang_id)) {
   lang_id <- 'all'
 }
 
+if (!is.null(params$metrics_sources)) {
+  metrics_sources <- params$metrics_sources
+} else {
+  metrics_sources <- c("altmetric", "crossref")
+}
+
 source('metrics.R')
 
 registerDoParallel(detectCores(all.tests = FALSE, logical = TRUE)-1)
@@ -57,7 +63,7 @@ tryCatch({
   if ("doi" %in% names(metadata)) {
     # only enrich metadata with metrics if at least one DOI is present
     if (!all(is.na(metadata$doi))) {
-      output <- enrich_metadata_metrics(metadata)
+      output <- enrich_metadata_metrics(metadata, metrics_sources)
    }
  } else {
    mlog$warn("No DOIs found in metadata")
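
The new metrics_sources value reaches run_metrics.R through the params object that the metrics worker pipes to the R runner on stdin (see the execute_search diff below, where the payload is built as {"params": ..., "metadata": ...}). A minimal sketch of such a payload, assuming the worker forwards params unchanged; the query, service, and source values are purely illustrative:

import json

# Illustrative payload for the R runner's stdin; "metrics_sources" is the
# new, optional field. When it is absent, run_metrics.R falls back to
# both sources, c("altmetric", "crossref").
payload = json.dumps({
    "params": {
        "q": "covid-19",                   # illustrative query
        "service": "pubmed",               # illustrative service id
        "metrics_sources": ["altmetric"],  # skip the Crossref lookup
    },
    "metadata": "[]",                      # JSON-encoded metadata records
})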

server/services/snapshot/data-config_pubmed.js

Lines changed: 11 additions & 2 deletions
@@ -3,14 +3,23 @@ var data_config = {
   mode: "search_repos",
 
   service: "pubmed",
+  bubble_min_scale: 1.2,
+  bubble_max_scale: 1,
+  paper_min_scale: 1,
+  paper_max_scale: 1,
+  showLanguage: true,
 
-  title: "",
+  // Configuring papers scaling
   base_unit: "citations",
+  initial_sort: "citations",
+  scale_by: "citations",
+
+  title: "",
   use_area_uri: true,
   show_multiples: false,
   show_dropdown: false,
   preview_type: "pdf",
-  sort_options: ["readers", "title", "authors", "year"],
+  sort_options: ["citations", "title", "authors", "year"],
   is_force_areas: true,
   language: "eng_pubmed",
   area_force_alpha: 0.015,

server/workers/common/common/utils.py

Lines changed: 154 additions & 5 deletions
@@ -1,13 +1,16 @@
+import re
 import os
 import json
 import time
 import uuid
-from dateutil.parser import parse
-from datetime import timedelta
-import re
 import redis
-import pandas as pd
 import pathlib
+import numpy as np
+import pandas as pd
+from datetime import timedelta
+from dateutil.parser import parse
+from typing import Dict, List, Union
+from typing_extensions import Literal
 
 
 redis_config = {
@@ -29,7 +32,7 @@ def get_key(store, key, timeout=180):
     result = {
         "k": key,
         "status": "error",
-        "error": "timeout"
+        "error": "timeout"
     }
     while tries <= max_tries:
         res = store.get(key+"_output")
@@ -119,3 +122,149 @@ def get_nested_value(data, keys, default=None):
     if data is None:
         return default
     return data
+
+
+def push_metadata_to_queue(
+    redis_store: redis.Redis,
+    params: Dict[str, Union[str, List[str]]],
+    metadata: pd.DataFrame,
+    source_list: List[str]
+) -> str:
+    """
+    Send metadata for processing to the Redis queue and return the request_id.
+
+    :param redis_store: Object of the Redis store.
+    :param params: Request params.
+    :param metadata: DataFrame with default metadata.
+    :param source_list: Defines from which services additional metadata will be received (available values: "crossref", "altmetric").
+    :return: request_id for retrieving the request result.
+    """
+    # Check that only valid values are specified in the source list
+    check_metadata_enrichment_source(source_list)
+
+    # Create a new unique request identifier that is later used to retrieve the result
+    request_id = str(uuid.uuid4())
+
+    # Specify from which sources to obtain information
+    params["metrics_sources"] = source_list
+
+    # Payload object creation
+    task_data = json.dumps({
+        "id": request_id,
+        "params": params,
+        "metadata": metadata.to_json(orient="records"),
+    })
+
+    # Push the request to Redis and return the request id
+    redis_store.rpush("metrics", task_data)
+    return request_id
+
+
+def check_metadata_enrichment_source(source_list: List[str]) -> None:
+    """
+    Check that only valid values are specified in the source list.
+
+    :param source_list: List of sources from which metadata will be enriched.
+    :return: None.
+    """
+    if not all(source in ("crossref", "altmetric") for source in source_list):
+        raise ValueError("Source list must contain only 'crossref' or 'altmetric'")
+
+
+def fetch_enriched_metadata(redis_store: redis.Redis, request_id: str, timeout: int = 600) -> pd.DataFrame:
+    """
+    Get enriched metadata from Redis.
+
+    :param redis_store: Object of the Redis store.
+    :param request_id: Unique identifier of the request.
+    :param timeout: Time to wait for the result (default: 600 seconds).
+    :return: Enriched DataFrame with metadata.
+    """
+    # Get the result of the metadata enrichment from Redis
+    result = get_key(redis_store, request_id, timeout)
+    return pd.DataFrame(result["input_data"])
+
+
+def get_metadata_columns_for_source(source_list: List[str]) -> List[str]:
+    """
+    Return the required metadata columns for the given sources.
+
+    :param source_list: List of sources from which metadata is received.
+    :return: List of required metadata columns.
+    """
+    # Check that only valid values are specified in the source list
+    check_metadata_enrichment_source(source_list)
+
+    # Define the required metadata columns per source and return them
+    result = []
+
+    if "crossref" in source_list:
+        result.extend(["citation_count"])
+
+    if "altmetric" in source_list:
+        result.extend([
+            "cited_by_wikipedia_count",
+            "cited_by_msm_count",
+            "cited_by_policies_count",
+            "cited_by_patents_count",
+            "cited_by_accounts_count",
+            "cited_by_fbwalls_count",
+            "cited_by_feeds_count",
+            "cited_by_gplus_count",
+            "cited_by_rdts_count",
+            "cited_by_qna_count",
+            "cited_by_tweeters_count",
+            "cited_by_videos_count"
+        ])
+
+    return result
+
+
+def ensure_required_columns(metadata: pd.DataFrame, source_list: List[str]) -> pd.DataFrame:
+    """
+    Ensure that all required columns are present, adding missing ones with NaN values.
+
+    :param metadata: DataFrame with metadata.
+    :param source_list: List of sources from which metadata is received.
+    :return: Updated DataFrame.
+    """
+    # Check that only valid values are specified in the source list
+    check_metadata_enrichment_source(source_list)
+
+    # Get the metadata columns that must be received from the source(s)
+    columns = get_metadata_columns_for_source(source_list)
+    for column in columns:
+        if column not in metadata.columns:
+            metadata[column] = np.NaN
+
+    return metadata
+
+
+def enrich_metadata(
+    redis: redis.Redis,
+    params: Dict[str, Union[str, List[str]]],
+    metadata: pd.DataFrame,
+    source_list: List[str],
+) -> pd.DataFrame:
+    """
+    Enrich metadata by adding citation and metrics information via the Redis queue.
+
+    :param redis: Object of the Redis store.
+    :param params: Params of the request.
+    :param metadata: DataFrame with default metadata.
+    :param source_list: Defines from which services additional metadata will be received (available values: "crossref", "altmetric").
+    :return: Enriched DataFrame with metadata.
+    """
+    # Check that only valid values are specified in the source list
+    check_metadata_enrichment_source(source_list)
+
+    # Create a request to the metrics worker for metadata enrichment
+    # and get a request_id for receiving the result later
+    request_id = push_metadata_to_queue(redis, params, metadata, source_list)
+
+    # Get the result of the metadata enrichment from the metrics worker
+    enriched_metadata = fetch_enriched_metadata(redis, request_id)
+
+    # Ensure that all required columns are present, adding missing ones with NaN
+    enriched_metadata = ensure_required_columns(enriched_metadata, source_list)
+    return enriched_metadata
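
Taken together, these helpers give any worker a synchronous enrichment call over the Redis queue. A minimal usage sketch, assuming a local Redis instance, that the module is importable as common.utils given the repo layout, and illustrative params and DOIs:

import redis
import pandas as pd

from common.utils import enrich_metadata

# Illustrative setup: a local Redis and a two-row metadata frame.
redis_store = redis.Redis(host="localhost", port=6379, db=0)
metadata = pd.DataFrame({
    "id": ["p1", "p2"],
    "doi": ["10.1371/journal.pone.0000000", ""],  # hypothetical DOIs
})

# Enrich from Crossref only; ensure_required_columns guarantees that
# "citation_count" exists afterwards, NaN where nothing came back.
enriched = enrich_metadata(
    redis_store,
    params={"service": "pubmed"},  # illustrative request params
    metadata=metadata,
    source_list=["crossref"],
)
print(enriched[["id", "citation_count"]])

push_metadata_to_queue and fetch_enriched_metadata can also be called separately when a worker wants to overlap other work with the enrichment round-trip instead of blocking on the result.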

server/workers/metrics/src/metrics.py

Lines changed: 7 additions & 12 deletions
@@ -1,10 +1,11 @@
 import time
 import json
-import subprocess
 import logging
+import subprocess
 from common.r_wrapper import RWrapper
-from common.decorators import error_logging_aspect
 from common.rate_limiter import RateLimiter
+from common.decorators import error_logging_aspect
+
 
 formatter = logging.Formatter(
     fmt='%(asctime)s %(levelname)-8s %(message)s',
@@ -35,15 +36,13 @@ def next_item(self):
     @error_logging_aspect(log_level=logging.ERROR)
     def execute_search(self, params: dict, metadata: str) -> dict:
         command = [
-            self.command,
-            self.runner,
-            self.wd,
-            params.get('q'),
+            self.command,
+            self.runner,
+            self.wd,
+            params.get('q'),
             params.get('service')
         ]
 
-        self.logger.debug(f"Executing command: {command}")
-
         data = {
             "params": params,
             "metadata": metadata
@@ -59,8 +58,6 @@ def execute_search(self, params: dict, metadata: str) -> dict:
         )
         stdout, stderr = proc.communicate(json.dumps(data))
 
-        self.logger.debug(f"Stdout: {stdout}")
-
         output = [line for line in stdout.split('\n') if line]
         errors = [line for line in stderr.split('\n') if line]
 
@@ -69,8 +66,6 @@ def execute_search(self, params: dict, metadata: str) -> dict:
 
         raw_metadata = json.loads(output[-2])
 
-        self.logger.debug(f"Raw metadata: {raw_metadata}")
-
         if isinstance(raw_metadata, dict) and raw_metadata.get('status') == "error":
             return raw_metadata
 
