This repository has been archived by the owner on Apr 2, 2022. It is now read-only.

Add version v7 of the CORD-19 dataset (#14)
bmkramer authored Apr 14, 2020
1 parent c903158 commit ab11787
Showing 15 changed files with 64,802 additions and 35 deletions.
28 changes: 28 additions & 0 deletions .Rhistory
@@ -0,0 +1,28 @@
library(tidyverse)
library(lubridate)
CORD19id_date <- read_csv(
"CORD19v7_R/output/CORD19id_date_v7.csv",
col_types = cols(pmcid = col_character(),
pubmed_id = col_character())
)
#create column for year
CORD19id_date_year <- CORD19id_date %>%
mutate(year = lubridate::year(date))
#quick plot
date_plot <- CORD19id_date_year %>%
filter(!is.na(year))
ggplot(date_plot, aes(year)) +
geom_histogram(binwidth = 1, fill="steelblue") +
theme_minimal()
ave_month <- CORD19id_date_year %>%
filter(!is.na(date)) %>%
group_by(year) %>%
count() %>%
mutate(month = case_when(
year != 2020 ~ round((n/12)),
year == 2020 ~ round((n/3)))
)
ggplot(data=ave_month, aes(x=year, y=month)) +
geom_bar(stat="identity", width=0.8, fill="steelblue") +
theme_minimal() +
labs(y = "articles / month")
51 changes: 23 additions & 28 deletions Cord19_dataset/CORD19_dataset_R.Rmd
@@ -1,56 +1,56 @@
---
title: "CORD19 dates"
author: "Bianca Kramer, for ASReview"
date: "April 5, 2020"
date: "April 14, 2020"
output:
pdf_document: default
html_document: default
---

Retrieved dates for DOIs ('date created') and PMCIDs ('date first published') for records in the CORD19 dataset that do not have a date in the proper format

CORD19 retrieved from: [https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-04-03/metadata.csv](https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-04-03/metadata.csv)
Version of CORD19 used: 20200403 (v6)
Date of sampling: 20200404
CORD19 retrieved from: [https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-04-10/metadata.csv](https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-04-10/metadata.csv)
Version of CORD19 used: 20200410 (v7)
Date of sampling: 20200413

Scripts and data in:
[CORD19v6_R](/CORD19v6_R) [_fix path display_]
Scripts in:
[CORD19_processing_R](/CORD19_processing_R) [_fix path display_]

Results files:
[CORD19v6_R/output/CORD19id_date_v6.csv](/CORD19v6_R/output/CORD19id_date_v6.csv) [_fix path display_]
[CORD19v7_R/output/CORD19id_date_v7.csv](/CORD19v7_R/output/CORD19id_date_v7.csv) [_fix path display_]
(source, publication ids, date)

[CORD19v6_R/output/cord19_v6_20191201.csv](/CORD19v6_R/output/cord19_v6_20191201.csv) [_fix path display_]
[CORD19v7_R/output/cord19_v7_20191201.csv](/CORD19v7_R/output/cord19_v7_20191201.csv) [_fix path display_]
(source, publication ids, date)

## Workflow

0. **Get ids from CORD19 dataset**
+ download CORD19 [n = 47,298]
+ download CORD19 [n = 51,078]
+ (NB specify date format for publish_time; this will only retrieve dates that are in standard format)
+ keep columns for source, doi, pmcid, pmid, publish_time [n = 47,298]
+ check duplicate* [n = 47,298] <- no duplicate records
+ keep columns for cord_uid, source, doi, pmcid, pmid, publish_time [n = 51,078]
+ check duplicate* [n = 51,078] <- no duplicate records
1. **For records without proper date format that have DOIs, get date created from Crossref API**
+ select records without date in proper format [n = 1724]
+ of these, select records with DOIs [n = 1174]
+ retrieve data from Crossref [n = 982]
+ select records without date in proper format [n = 2116]
+ of these, select records with DOIs [n = 1610]
+ retrieve data from Crossref [n = 1419]
+ (NB notation of dois from CORD19 v4 onwards is harmonized)
+ match dates back to CORD19 ID data
+ (NB for match, use lowercase for both sets of dois)
2. **For remaining records with PMCID, get date first published from EuropePMC API**
+ select remaining records without date in proper format* [n = 741]
+ of these, select records with PMCID [n = 256]
+ of these, select records with PMCID [n = 258]
+ retrieve data from EuropePMC [n = 256]
+ match dates back to CORD19 ID data
3. **Analyze remaining records**
+ select remaining records without date in proper format* [n = 485]
+ of these, select records with PMID [n = 165] <- do NOT pursue at this time
+ select remaining records without date in proper format* [n = 483]
+ of these, select records with PMID [n = 163] <- do NOT pursue at this time
4. **Create CORD-19 subset by date**
+ download CORD19 [n = 47,298]
+ download CORD19 [n = 51,078]
+ (NB specify character format for publish_time; this retrieves all entries, regardless of date format)
+ copy column 'publish_time' into column 'date' as date format, (this will only keep dates that are in standard format)
+ match retrieved dates by doi, pmcid and pmid
+ filter on records with date >= 2019-12-01, or publish_time == 2020 [n = 4774]
+ filter on records with date >= 2019-12-01, or publish_time == 2020 [n = 5753]

*This includes records without DOI, and records where DOI data was not retrieved from Crossref
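The step-4 cutoff logic can be sketched on a toy tibble (column names follow the workflow; the data are illustrative):

```r
library(dplyr)
library(tibble)

# Toy records: one standard date, one year-only entry, one pre-cutoff date
records <- tibble(
  cord_uid = c("a1", "b2", "c3"),
  publish_time = c("2020-01-15", "2020", "2018-06-01"),
  date = as.Date(c("2020-01-15", NA, "2018-06-01"))  # only standard formats parse
)

# Keep records dated 2019-12-01 or later, or with a bare "2020" publish_time
subset_201912 <- records %>%
  filter(date >= as.Date("2019-12-01") | publish_time == "2020")
```

Note that the year-only record survives via the `publish_time == "2020"` branch even though its parsed date is `NA`.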

@@ -67,7 +67,7 @@ library(lubridate)
Read file with matched dates
```{r}
CORD19id_date <- read_csv(
"CORD19v6_R/output/CORD19id_date_v6.csv",
"CORD19v7_R/output/CORD19id_date_v7.csv",
col_types = cols(pmcid = col_character(),
pubmed_id = col_character())
)
@@ -117,16 +117,11 @@ ggplot(data=ave_month, aes(x=year, y=month)) +

## Outstanding issues:
1. Validate results against years in original dataset

2. Pretty pictures!
+ also plot by month for 2019-2020

3. Clean titles/abstracts

4. Add missing abstracts from open sources

5. Analyze publisher-added content to PMC
2. Clean titles/abstracts

3. Add missing abstracts from open sources

4. Analyze publisher-added content to PMC


217 changes: 217 additions & 0 deletions Cord19_dataset/CORD19_processing_R/CORD19_processing.R
@@ -0,0 +1,217 @@
#Add dates to CORD19 dataset (missing and/or non-formatted dates)
#Create subset of records from Dec 2019 onwards
#Current version: v7 dd 20200410

#info on CORD19 dataset:
#https://pages.semanticscholar.org/coronavirus-research

#install.packages("tidyverse")
#install.packages("rcrossref")
#install.packages("europepmc")
#install.packages("lubridate")
library(tidyverse)
library(rcrossref)
library(europepmc)
library(lubridate)

source("CORD19_processing_R/src/CORD19_import.R")
source("CORD19_processing_R/src/CORD19_match_dois.R")
source("CORD19_processing_R/src/CORD19_match_PMCID.R")
source("CORD19_processing_R/src/CORD19_create_subset.R")

#set email in Renviron
file.edit("~/.Renviron")
#add email address to be shared with Crossref:
#crossref_email = [email protected]


#----------------------------------------------------
#URLs for CORD19 dataset
#url3 <- "https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-03-13/all_sources_metadata_2020-03-13.csv"
#url4 <- "https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-03-20/metadata.csv"
#url5 <- "https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-03-27/metadata.csv"
#url6 <- "https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-04-03/metadata.csv"
#url7 <- "https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-04-10/metadata.csv"

#set current version number and url
version <- 7
url <- "https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-04-10/metadata.csv"



#inspect first lines of full dataset to view columns
#CORD19 <- seeCORDfull(url)
#rm(CORD19)

#read selected columns (uid, source, ids, date)
#warnings on date column are expected, only properly formatted dates are included
CORD19id <- getCORDid(url)
#51078 records
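getCORDid comes from the sourced CORD19_import.R, which is not part of this diff. A plausible sketch, assuming CORD-19's metadata column names and the date-parsing behaviour described in the comments above:

```r
library(readr)

# Hypothetical sketch of the sourced helper: read only the id/date columns,
# parsing publish_time as a Date so non-standard entries become NA (with warnings)
getCORDid <- function(url) {
  read_csv(url,
           col_types = cols_only(
             cord_uid = col_character(),
             source_x = col_character(),
             doi = col_character(),
             pmcid = col_character(),
             pubmed_id = col_character(),
             publish_time = col_date(format = "%Y-%m-%d")))
}
```

`cols_only()` drops every column not listed, which keeps the import light for the ~51k-record file.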

#------------------------------------------------------

#for records without proper date, extract dois as character vector
doi_list <- CORD19id %>%
filter(is.na(publish_time)) %>%
filter(!is.na(doi)) %>%
pull(doi) %>%
unique()
#1610 dois, 1609 unique

#get created date from Crossref for dois
#runtime approx. 1 min per 100 DOIs; a progress bar is shown
doi_date <- getCrossref(doi_list)
doi_date <- formatDateCrossref(doi_date)
doi_date <- distinct(doi_date)
#1419 retrieved

filename <- paste0("CORD19v",
version,
"_R/data/dois_date_v",
version,
".csv")
# write to csv for later use in matching
write_csv(doi_date, filename)

#create columns doi_lc in both databases for matching
CORD19id_date <- CORD19id %>%
mutate(doi_lc = str_to_lower(doi))

doi_date <- doi_date %>%
mutate(doi_lc = str_to_lower(doi))

#join dates to list of ids, create new column date
CORD19id_date <- joinDateCrossref(CORD19id_date, doi_date)
rm(doi_date, doi_list)
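joinDateCrossref is sourced and not shown here; the case-insensitive DOI match it performs (via the doi_lc columns created above) can be illustrated on toy data:

```r
library(dplyr)
library(stringr)
library(tibble)

# Toy id table and Crossref result; DOI case differs between the two
ids <- tibble(doi = c("10.1000/ABC", "10.1000/xyz")) %>%
  mutate(doi_lc = str_to_lower(doi))

doi_date <- tibble(doi = "10.1000/abc",
                   created = as.Date("2020-02-01")) %>%
  mutate(doi_lc = str_to_lower(doi))

# Match on the lowercased DOI; unmatched records keep an NA date
matched <- ids %>%
  left_join(select(doi_date, doi_lc, created), by = "doi_lc")
```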

#------------------------------------------------------------

#for records without proper date, extract pmcids as character vector
pmcid_list <- CORD19id_date %>%
filter(is.na(date)) %>%
filter(!is.na(pmcid)) %>%
pull(pmcid) %>%
unique()
#258 pmcids

#set parameter for progress bar
pb <- progress_estimated(length(pmcid_list))

# get data from EuropePMC
# approx. 1 min per 100 PMCIDs; progress bar shown
pmcid_date <- map_dfr(pmcid_list, getEuropePMC_progress)
# NB this gives a message for each result - find a way to suppress them
rm(pb)
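getEuropePMC_progress is defined in a sourced file not shown in this diff; presumably it ticks the shared pb object around each request. The pattern, with a dummy fetch standing in for the real ePMC call:

```r
library(dplyr)
library(purrr)
library(tibble)

# Dummy fetch standing in for the real EuropePMC lookup
fetch_one <- function(pmcid) {
  tibble(pmcid = pmcid, firstPublicationDate = "2020-01-01")
}

# Shared progress bar, ticked once per request, as in the script
# (progress_estimated() is soft-deprecated in newer dplyr but still works)
pb <- progress_estimated(3)
fetch_with_progress <- function(pmcid) {
  pb$tick()$print()
  fetch_one(pmcid)
}

res <- map_dfr(c("PMC1", "PMC2", "PMC3"), fetch_with_progress)
```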

#extract data and format date
#warning if columns are not included in ePMC output
pmcid_date <- extractDataEuropePMC(pmcid_date)
#258 records

filename <- paste0("CORD19v",
version,
"_R/data/pmcid_date_v",
version,
".csv")
# write to csv for later use in matching
write_csv(pmcid_date, filename)


#join dates to list of ids
CORD19id_date <- joinDateEuropePMC(CORD19id_date, pmcid_date)
rm(pmcid_date, pmcid_list)



#merge dates for doi and pmcid
CORD19id_date <- mergeDate(CORD19id_date)
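mergeDate is also sourced elsewhere; merging the Crossref and EuropePMC dates presumably amounts to a coalesce, sketched here with hypothetical column names date_doi and date_pmcid:

```r
library(dplyr)
library(tibble)

df <- tibble(date_doi   = as.Date(c("2020-01-01", NA)),
             date_pmcid = as.Date(c(NA, "2020-02-02")))

# Take the Crossref date where present, otherwise the EuropePMC date
df <- df %>% mutate(date = coalesce(date_doi, date_pmcid))
```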


#check still missing dates
count <- CORD19id_date %>%
filter(is.na(date))
# filter(!is.na(pubmed_id))
#rm(count)
#483 without proper date
#163 with pmid

filename <- paste0("CORD19v",
version,
"_R/output/CORD19id_date_v",
version,
".csv")
# write to csv
write_csv(CORD19id_date, filename)
# read file for processing at later time
#CORD19id_date <- read_csv(filename, col_types = cols(pmcid = col_character(),
# pubmed_id = col_character()))


#--------------------------------------------------

#read full dataset, set id and date columns as character for full import (to prevent parse errors)
#publish_time is read as character, so all entries are kept
CORD19full <- read_csv(url,
col_types = cols(`Microsoft Academic Paper ID` = col_character(),
`WHO #Covidence` = col_character(),
journal = col_character(),
pmcid = col_character(),
publish_time = col_character(),
pubmed_id = col_character()))


#Add date column - setting date format will only keep properly formatted entries
CORD19full_date <- CORD19full %>%
mutate(date = as.Date(publish_time))

#get counts of all non-NAs per column
count_all <- CORD19full_date %>%
summarise_all(~ sum(!is.na(.)))

count_id_date <- CORD19id_date %>%
summarise_all(~ sum(!is.na(.)))

rm(count_all, count_id_date)


#map date from CORD19_id_date, for DOI and PMCID
CORD19full_date <- CORD19full_date %>%
joinDOI(CORD19id_date) %>%
joinPMCID(CORD19id_date) %>%
distinct()

#test count (expected 483)
count <- CORD19full_date %>%
filter(is.na(date))
#n= 483
rm(count)

#create subset from 20191201
CORD19_201912 <- CORD19full_date %>%
mutate(subset = case_when(
date >= as.Date("2019-12-01") ~ TRUE,
publish_time == 2020 ~ TRUE,
TRUE ~ FALSE)) %>%
filter(subset == TRUE) %>%
select(-subset)
#n= 5753


filename <- paste0("CORD19v",
version,
"_R/output/cord19_v",
version,
"_20191201.csv")
write_csv(CORD19_201912, filename)

#----------------------------------------------------------
#for ASReview, get number of missing titles/abstracts
#rewrite as function with one output

count_abs_full <- CORD19full %>%
select(title, abstract) %>%
summarise_all(~ sum(is.na(.)))

count_abs_subset <- CORD19_201912 %>%
select(title, abstract) %>%
summarise_all(~ sum(is.na(.)))
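The "rewrite as function with one output" TODO might look like this (countMissingAbs is an illustrative name, not from the repo):

```r
library(dplyr)
library(tibble)

# Count missing titles and abstracts in one call, returned as a single row
countMissingAbs <- function(df) {
  df %>%
    select(title, abstract) %>%
    summarise_all(~ sum(is.na(.)))
}
```

This would replace both count_abs_full and count_abs_subset with one call per data frame.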
44 changes: 44 additions & 0 deletions Cord19_dataset/CORD19_processing_R/src/CORD19_create_subset.R
@@ -0,0 +1,44 @@
#Create subset of full dataset for publications from 2019-12-01 onwards


#define 2 functions to match on dois and pmcids
joinDOI <- function(x,y){

y <- y %>%
select(doi, date) %>%
rename(date2 = date)

res <- x %>%
left_join(y, by = "doi", na_matches = "never")

res <- res %>%
mutate(date = case_when(
is.na(date) ~ date2,
!is.na(date) ~ date)) %>%
select(-date2)

return(res)
}

joinPMCID <- function(x,y){

y <- y %>%
select(pmcid, date) %>%
rename(date2 = date)

res <- x %>%
left_join(y, by = "pmcid", na_matches = "never")

res <- res %>%
mutate(date = case_when(
is.na(date) ~ date2,
!is.na(date) ~ date)) %>%
select(-date2)


return(res)
}




