forked from NicoRiedel/OpenAccessWhitelist
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathJournal_Positive_List_script.R
324 lines (248 loc) · 15.5 KB
/
Journal_Positive_List_script.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
#This script creates an ‘Open Access Journal Positive_List’ by aggregating
#information on Open Access Journals from the
#Directory of Open Access Journals (DOAJ) and Pubmed Central (PMC).
#These two data sources ensure that the Journals obey certain quality standards.
#DOAJ ensures high quality standards for journals; individual journals have to apply at DOAJ and
#are checked against a list of quality criteria. PMC stores the full-text version of
#open access articles and increases the visibility of research in that way.
#The current list focuses on biomedical journals but the included subjects
#can be adjusted in the 'adjustable parameters' section
#By default, only journals whose DOAJ subjects match 'Medicine', 'Biology', 'Biotechnology' or
#'Physiology' (and do not match 'Agriculture' or 'Plant culture') and that have English or German
#as full-text language are included.
#----------------------------------------------------------------------------------------------------------------------------
# load packages
#----------------------------------------------------------------------------------------------------------------------------
source('Journal_Positive_List_functions.R')
library(tidyverse)
library(jsonlite)
library(readxl)
#----------------------------------------------------------------------------------------------------------------------------
# adjustable parameters
#----------------------------------------------------------------------------------------------------------------------------
# Filter settings that decide which DOAJ entries end up in the list.
# Accepted full-text languages, collapsed into one regex alternation.
full_text_languages <- paste(c("English", "German"), collapse = "|")
# A journal is kept when its DOAJ subject string matches any of these terms ...
subjects_included <- c(
  "Medicine",
  "Biology",
  "Biotechnology",
  "Physiology"
)
# ... and none of these more specialised terms.
subjects_excluded <- c(
  "Agriculture",
  "Plant culture"
)
# Currencies in which an APC may be quoted for the journal to be kept.
APC_currencies_included <- c("EUR", "GBP", "USD", "CHF")
#----------------------------------------------------------------------------------------------------------------------------
# load datasets
#----------------------------------------------------------------------------------------------------------------------------
# Input files live in 'Datasets/'. The DOAJ, PMC and Scimago (SJR) lists are
# re-downloaded into that folder on every run of this block.
if (!dir.exists("Datasets/")) {
  dir.create("Datasets/")
}

doaj_filename <- "Datasets/DOAJ_journal_list_upd.csv"
pmc_filename <- "Datasets/PMC_journal_list_upd.csv"
sjr_filename <- "Datasets/Scopus_SJR_2023.csv"

# Refresh the local copies of the three journal lists.
download.file("https://doaj.org/csv", destfile = doaj_filename)
# The PMC list is read line by line and written back out unchanged.
pmc_list <- read_lines("https://www.ncbi.nlm.nih.gov/pmc/journals/?format=csv")
write_lines(pmc_list, pmc_filename)
download.file(
  "http://www.scimagojr.com/journalrank.php?out=xls",
  destfile = sjr_filename,
  mode = "wb"
)

# One exchange rate per accepted APC currency, looked up by 3-letter code via
# get_exchange_rate(); the extra "-" entry (rate NA) is used for journals
# without a currency.
exchange_currencies <- str_sub(APC_currencies_included, 1, 3)
# exchange_currencies <- exchange_currencies[exchange_currencies != "EUR"]
exchange_rates <- setNames(
  c(map_dbl(exchange_currencies, get_exchange_rate), NA),
  c(exchange_currencies, "-")
)
# exchange_rates <- c(1.0, exchange_rates |> pluck("GBP"), exchange_rates |> pluck("USD"),exchange_rates |> pluck("CHF"), NA)
#----------------------------------------------------------------------------------------------------------------------------
# DOAJ dataset
#----------------------------------------------------------------------------------------------------------------------------
# Read the DOAJ journal list that was downloaded to `doaj_filename`.
doaj_data <- read_csv(doaj_filename)
# Columns of the DOAJ export that the rest of the script relies on.
useful_cols_doaj <- c(
  "Journal title", "Journal URL", "Journal ISSN (print version)",
  "Journal EISSN (online version)", "Publisher",
  "APC", "APC information URL",
  "APC amount", "Has other fees",
  "Other fees information URL",
  "Languages in which the journal accepts manuscripts",
  "Average number of weeks between article submission and publication",
  "Journal license", "Subjects"
)
#filter columns (all_of() fails loudly if DOAJ renames one of them)
doaj_data <- doaj_data |> select(all_of(useful_cols_doaj))

# Split `APC amount` ("<amount> <currency>") into a numeric amount and its
# currency. Only the first ";"-separated entry is parsed so that amount and
# currency always come from the same price: taking the first token and the
# last token of the whole cell would pair the first amount with the last
# currency when several prices are listed.
# NOTE(review): assumes multiple prices are separated by ";" - confirm against
# the current DOAJ export.
first_apc_entry <- doaj_data$`APC amount` |>
  str_split(";") |>
  map_chr(first) |>
  str_trim()
apc_parts <- str_split(first_apc_entry, " ")
APC_amount <- apc_parts |> map_chr(first) |> as.numeric()
APC_currency <- apc_parts |> map_chr(last)

doaj_data <- doaj_data |>
  mutate(
    Currency = APC_currency,
    `APC amount` = APC_amount
  ) |>
  rename(
    `Full text language` = `Languages in which the journal accepts manuscripts`,
    `Journal article processing charges (APCs)` = APC,
    `Average number of weeks between submission and publication` = `Average number of weeks between article submission and publication`
  ) |>
  filter(
    # language string must mention one of the accepted languages
    str_detect(`Full text language`, full_text_languages),
    # at least one included subject term and no excluded subject term
    grepl(paste(subjects_included, collapse = "|"), Subjects),
    !grepl(paste(subjects_excluded, collapse = "|"), Subjects),
    # keep only journals that declare no other fees
    `Has other fees` == "No",
    # accepted APC currency, or no currency given at all
    Currency %in% APC_currencies_included | is.na(Currency)
  )
#filter rows that do not easily work with the dplyr filter
#remove journals with a regional focus (using non-comprehensive list of regional terms,
# - might exclude journals with an actual international focus!)
# vapply() guarantees a logical vector (also for zero rows), whereas sapply()
# would return an empty list there and make the negation fail.
is_regional <- vapply(doaj_data$`Journal title`, is_regional_journal,
                      logical(1), USE.NAMES = FALSE)
doaj_data <- doaj_data[!is_regional, ]
# Condense the APC information: `Currency` is cut to its first three
# characters ("-" when missing) and `APC` holds the amount as text.
doaj_data <- doaj_data |>
  mutate(
    Currency = replace_na(substr(Currency, 1, 3), "-"),
    APC = as.character(`APC amount`)
  )
# For journals whose APC flag is anything but "Yes", show the flag itself
# instead of an amount.
noAPC_idx <- which(doaj_data$`Journal article processing charges (APCs)` != "Yes")
doaj_data$APC[noAPC_idx] <-
  doaj_data$`Journal article processing charges (APCs)`[noAPC_idx]

# APC expressed in EUR: amount divided by the exchange rate of its currency,
# multiplied by 1.19 (the 19% taxes named in the column) and rounded.
# A missing amount counts as 0.
doaj_data <- doaj_data |>
  mutate(
    `APC in EUR (including 19% taxes)` = round(
      (1 / exchange_rates[Currency]) * coalesce(`APC amount`, 0) * 1.19
    )
  )
# Journals flagged "No", and any journal whose conversion came out as NA
# (e.g. currency "-"), get 0 EUR.
no_APC_journal <- doaj_data$`Journal article processing charges (APCs)` == "No"
apc_in_eur <- doaj_data$`APC in EUR (including 19% taxes)`
apc_in_eur[no_APC_journal] <- 0
apc_in_eur[is.na(apc_in_eur)] <- 0
doaj_data$`APC in EUR (including 19% taxes)` <- apc_in_eur

# yes/no flag for the 2000 EUR threshold.
doaj_data <- doaj_data |>
  mutate(
    `APC below 2000 EUR` =
      logical_to_yes_no(`APC in EUR (including 19% taxes)` < 2000)
  )
# Special terms of the Charite library for specific journals.
# NOTE(review): `lib_list` and `dupes` are not referenced again in the visible
# part of this script; they appear to serve as a manual consistency check.
lib_list <- read_excel("Datasets/DOAJ-2023_library.xlsx") |>
  select(`Journal title`, contains("ISSN"), Publisher, `förderfähig oder Diamond`) |>
  mutate(
    has_special_conditions2 = if_else(
      `förderfähig oder Diamond` == "Diamond", "diamond", "no",
      missing = "no"
    )
  )

# Rows of the library list that share title, publisher and both ISSNs
# (expected to be empty).
dupes <- janitor::get_dupes(
  lib_list,
  c(`Journal title`, Publisher, `Journal ISSN (print version)`, `Journal EISSN (online version)`)
)

# Publishers and single journals covered by library special terms.
special_conditions_publishers <- c(
  "Cambridge University Press",
  "MDPI AG",
  "JMIR Publications"
)
special_conditions_journals <- c(
  "SAGE Open",
  "SAGE Open Medicine",
  "SAGE Open Medical Case Reports"
)

# Journals that are labelled "no" regardless of the rules above.
journals_not_fundable <- c(
  # "Cancers", "Cells",
  # "International Journal of Environmental Research and Public Health",
  # "International Journal of Molecular Sciences",
  # "Journal of Clinical Medicine",
  # "Molecules", "Nutrients", "Viruses"
  "Marine Drugs",
  "Materials", "Nanomaterials", "Remote Sensing", "Sensors", # not on list??
  "Toxins"
)

has_special_conditions <-
  doaj_data$`Journal title` %in% special_conditions_journals |
  doaj_data$Publisher %in% special_conditions_publishers
not_fundable <- doaj_data$`Journal title` %in% journals_not_fundable

# Order matters: the "not fundable" label overrides the special-terms label.
doaj_data$`APC below 2000 EUR`[has_special_conditions] <- "yes, library special terms"
doaj_data$`APC below 2000 EUR`[not_fundable] <- "no"

# Publishers covered by DEAL: refine plain "yes" / "no" labels only.
deal_publishers <- c(
  "SpringerOpen",
  "Nature Publishing Group",
  "Adis, Springer Healthcare",
  "Wiley"
)
has_deal_conditions <- doaj_data$Publisher %in% deal_publishers

deal_and_yes <- has_deal_conditions & doaj_data$`APC below 2000 EUR` == "yes"
doaj_data$`APC below 2000 EUR`[deal_and_yes] <- "yes, also under DEAL"
deal_and_no <- has_deal_conditions & doaj_data$`APC below 2000 EUR` == "no"
doaj_data$`APC below 2000 EUR`[deal_and_no] <- "no, but DEAL special terms"

#for some journals there is a reduction in the journal fee but they still don't fall under the 2000€ threshold
#only_discount_journals <- c("BMJ Open Diabetes Research & Care")
#is_discount_only <- doaj_data$`Journal title` %in% only_discount_journals
#doaj_data[is_discount_only,][["APC below 2000 EUR"]] <- "no, but library discount applies"
# Reduce each DOAJ subject string with subject_simplification() and store the
# first two elements of its result as separate columns.
subjects_simplified <- map(doaj_data$Subjects, subject_simplification)

doaj_data <- doaj_data |>
  add_column(
    `Subject category 1` = sapply(subjects_simplified, \(categories) categories[[1]]),
    `Subject category 2` = sapply(subjects_simplified, \(categories) categories[[2]])
  ) |>
  # Drop the helper columns, including the original yes/no APC flag ...
  select(
    -c(
      `Has other fees`, `Other fees information URL`,
      `Journal article processing charges (APCs)`, `APC amount`, Subjects
    )
  ) |>
  # ... then give the ISSN columns short names and let the condensed `APC`
  # column take over the name of the dropped flag column.
  rename(
    pISSN = `Journal ISSN (print version)`,
    eISSN = `Journal EISSN (online version)`,
    `Journal article processing charges (APCs)` = APC
  )

# Distinct labels present in the threshold column (only shown when run
# interactively).
unique(doaj_data$`APC below 2000 EUR`)
#----------------------------------------------------------------------------------------------------------------------------
# PMC dataset
#----------------------------------------------------------------------------------------------------------------------------
# Read the PMC journal list and keep only the columns needed for the joins.
pmc_data <- read_csv(pmc_filename)
useful_cols_pmc <- c("Journal title", "pISSN", "eISSN", "Publisher")
# all_of() (instead of the superseded one_of()) stops with a clear error if
# PMC renames a column, rather than warning here and failing later in the
# joins that need every one of these columns.
pmc_data <- pmc_data |> select(all_of(useful_cols_pmc))
#----------------------------------------------------------------------------------------------------------------------------
# Scopus dataset
#----------------------------------------------------------------------------------------------------------------------------
# Read the Scimago/Scopus journal ranking: ";"-delimited with "," as decimal
# mark.
scopus_data <- read_delim(sjr_filename, delim = ";", locale = locale(decimal_mark = ","))
useful_cols_scopus <- c("Title", "Issn", "SJR", "SJR Best Quartile")
scopus_data <- scopus_data |>
  # all_of() instead of the superseded one_of(): a missing column is an error
  # here rather than a warning followed by a failure further down.
  select(all_of(useful_cols_scopus)) |>
  # `Issn` holds up to two ISSNs; which one is print/electronic is not
  # reliable, the column names are only labels at this point.
  separate(Issn, into = c("eISSN", "pISSN")) |>
  rename(`SJR Impact` = SJR) |>
  mutate(
    `SJR Impact` = as.double(as.character(`SJR Impact`)),
    `SJR Best Quartile` = as.character(`SJR Best Quartile`),
    # Normalise BOTH ISSN columns the same way: empty strings become NA, and
    # a '-' is inserted in the middle to match the other data sources.
    # str_c() propagates NA, so no "NA-NA" clean-up is needed afterwards.
    # Treating only eISSN for empty strings would leave a bogus pISSN of "-"
    # on rows without any ISSN.
    across(c(eISSN, pISSN), \(issn) {
      issn <- na_if(issn, "")
      str_c(str_sub(issn, 1, 4), str_sub(issn, 5, 8), sep = "-")
    })
  ) |>
  #filter all entries that have neither an eISSN nor a pISSN
  filter(!(is.na(eISSN) & is.na(pISSN)))
#----------------------------------------------------------------------------------------------------------------------------
# join datasets
#----------------------------------------------------------------------------------------------------------------------------
#DOAJ and PMC join
# Each join keeps all DOAJ rows that have the respective key (left join); the
# other PMC key column is dropped first so the result has no duplicate key
# columns. Overlapping columns get the suffixes ".doaj" / ".pmc".
#first join on print ISSN
joined_data_pISSN <- doaj_data |>
  filter(!is.na(pISSN)) |>
  left_join(select(pmc_data, -eISSN), by = "pISSN", suffix = c(".doaj", ".pmc"))

#second join on eISSN
joined_data_eISSN <- doaj_data |>
  filter(!is.na(eISSN)) |>
  left_join(select(pmc_data, -pISSN), by = "eISSN", suffix = c(".doaj", ".pmc"))

#some entries have wrong metadata, where the eISSN in one dataset corresponds to
#the pISSN in the other dataset; both directions are checked
joined_data_mixed_ISSN <- doaj_data |>
  filter(!is.na(pISSN)) |>
  left_join(select(pmc_data, -pISSN), by = c("pISSN" = "eISSN"),
            suffix = c(".doaj", ".pmc"))
# mirror case: DOAJ eISSN recorded as pISSN in PMC
joined_data_mixed_ISSN_rev <- doaj_data |>
  filter(!is.na(eISSN)) |>
  left_join(select(pmc_data, -eISSN), by = c("eISSN" = "pISSN"),
            suffix = c(".doaj", ".pmc"))

#combine all join versions and remove exact duplicate rows
joined_data <- joined_data_pISSN |>
  union(joined_data_eISSN) |>
  union(joined_data_mixed_ISSN) |>
  union(joined_data_mixed_ISSN_rev)

#a row is PMC-listed when the join brought in a PMC journal title;
#only PMC-listed journals are kept
joined_data <- joined_data |>
  mutate(is_PMC_listed = logical_to_yes_no(!is.na(`Journal title.pmc`))) |>
  filter(is_PMC_listed == "yes")

#scopus join
#as the scopus eISSN and pISSN might be interchanged
#both have to be checked against both DOAJ pISSN/eISSN
#use self-defined function instead of regular join
SJR_vec <- map2_dbl(joined_data$eISSN, joined_data$pISSN, get_scopus_var,
                    scopus_data = scopus_data, varname = "SJR Impact")
#add the SJR quartile column
SJR_quartile_vec <- map2_chr(joined_data$eISSN, joined_data$pISSN, get_scopus_var,
                             scopus_data = scopus_data, varname = "SJR Best Quartile")

joined_data <- joined_data |>
  add_column(`SJR Impact` = SJR_vec,
             `SJR Subject Category Best Quartile` = SJR_quartile_vec)
#----------------------------------------------------------------------------------------------------------------------------
# save current version of spreadsheet
#----------------------------------------------------------------------------------------------------------------------------
# Columns of the final table, in output order.
final_col <- c(
  "Journal title.doaj",
  "SJR Impact", "SJR Subject Category Best Quartile",
  "Journal article processing charges (APCs)", "Currency",
  "APC in EUR (including 19% taxes)", "APC below 2000 EUR", "APC information URL",
  "Average number of weeks between submission and publication",
  "Subject category 1", "Subject category 2", "Journal license",
  "Journal URL", "Publisher.doaj", "pISSN", "eISSN"
)

# Select and rename the columns, then sort by subject and descending SJR.
# Sorting happens while `SJR Impact` is still numeric (see NA handling below).
joined_data <- joined_data |>
  select(all_of(final_col)) |>
  rename(`Journal title` = `Journal title.doaj`,
         Publisher = `Publisher.doaj`) |>
  arrange(`Subject category 1`, desc(`SJR Impact`))

#filter duplicated journals: a journal matched by more than one of the ISSN
#joins can appear several times; keep the first row per title
joined_data <- joined_data[!duplicated(joined_data$`Journal title`), ]

#convert NAs to "-" for final list
# (assigning "-" turns `SJR Impact` into a character column whenever at least
# one value is missing)
joined_data$`SJR Impact`[is.na(joined_data$`SJR Impact`)] <- "-"
joined_data$`SJR Subject Category Best Quartile`[is.na(joined_data$`SJR Subject Category Best Quartile`)] <- "-"

#save resulting table in two formats (.csv for e.g. loading into Excel, .rds for use in the R-Shiny app)
# Create the directory the files are actually written to; the original check
# created 'Table/' instead, so writing failed when the Shiny data folder did
# not exist yet.
output_dir <- "OAPositive_List_shiny/data"
if (!dir.exists(output_dir)) {
  dir.create(output_dir, recursive = TRUE)
}

# Use a single date for both file names so they always match.
run_date <- Sys.Date()
save_filename <- paste0(output_dir, "/Journal_Positive_List_Table_", run_date, ".csv")
write_csv(joined_data, save_filename)
save_filename_RDS <- paste0(output_dir, "/Journal_Positive_List_Table_", run_date, ".rds")
saveRDS(joined_data, save_filename_RDS, version = 2)