-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathget_indicator1_data.R
128 lines (92 loc) · 5.21 KB
/
get_indicator1_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
###
### This R function takes as input the output of the Kobo form "International Genetic Indicator testing"
### and reformats it into a data frame suitable for estimating
### Genetic Diversity Indicator 1 (the proportion of populations within species with
### a genetically effective size, Ne, greater than 500).
###
### If you use this script, please check https://github.com/AliciaMstt/GeneticIndicators
### for citation guidelines
get_indicator1_data<-function(kobo_output=kobo_output){

  ###
  ### Arguments:
  ###
  # kobo_output = a data frame object read into R from the `.csv` file
  #               resulting from exporting the Kobotoolbox data from the form
  #               "International Genetic Indicator testing" with the settings
  #               explained at https://github.com/AliciaMstt/GeneticIndicators
  #               NOTE: the default value refers to the argument itself, so it
  #               can never be evaluated; the argument must always be supplied.

  ###
  ### Value:
  ###
  # A data frame in long format with one row per population of each record
  # (taxon / assessment info, indicator 1 variables, `population`, `X_uuid`
  # and `multiassessment`). Populations with an empty `Origin` are dropped.
  # The result is returned invisibly, so assign it to an object.

  ### Needed libraries (must be attached before calling the function):
  # library(tidyr)
  # library(dplyr)
  # library(utile.tools)
  # library(stringr)

  ###
  ### Function
  ###

  ### Check input
  # fail with a clear message instead of R's "promise already under evaluation"
  # error that the self-referential default would otherwise trigger
  if (missing(kobo_output)) {
    stop("`kobo_output` must be supplied: a data frame with the exported Kobo form data",
         call. = FALSE)
  }
  stopifnot(is.data.frame(kobo_output))

  ### Separate data
  # create a variable with the full taxon name if this variable doesn't exist already
  # (raw kobo output doesn't include it, but it may exist in a "clean" version of the
  # output if run through the quality check pipeline)
  if ("taxon" %in% colnames(kobo_output)) {
    print("the data already contained a taxon column, that was used instead of creating a new one")
  } else {
    kobo_output <- kobo_output %>%
      mutate(
        # paste genus, species and subspecies/variety skipping NAs, then
        # remove white space at the end of the name
        taxon = str_trim(
          utile.tools::paste(genus, species, subspecies_variety, na.rm = TRUE),
          "right"
        )
      )
  }

  ## Add a variable to the metadata stating if the taxon was assessed multiple
  ## times or only a single time
  # duplicated() is run twice on the taxon + country key, the second time with
  # fromLast = TRUE so that the first occurrence is also accounted for, i.e. we
  # flag all records sharing the same taxon within a given country
  taxon_country <- kobo_output[c("taxon", "country_assessment")]
  is_repeated <- duplicated(taxon_country) |
    duplicated(taxon_country, fromLast = TRUE)
  repeated_uuids <- kobo_output$X_uuid[is_repeated]

  # if the record's uuid belongs to a repeated taxon/country combination tag it
  # as multiassessment, otherwise as single_assessment
  kobo_output <- kobo_output %>%
    mutate(multiassessment = if_else(X_uuid %in% repeated_uuids,
                                     "multiassessment",
                                     "single_assessment"))

  ### Process data already including taxon column and multiassessment
  indicator1_data <- kobo_output %>%
    # create variable with year in which assessment was done
    # (first 4 characters of the date the form was completed)
    mutate(year_assesment = substr(end, 1, 4)) %>%

    # make sure some variables that seem numbers are actually character,
    # because there may be character and integer values depending on how data
    # was written (IntroductionYear, NeYear, NcYear, NcRangeDetails); this also
    # lets the per-population columns be combined by pivot_longer() below
    mutate(across(starts_with(c("IntroductionYear", "NeYear",
                                "NcYear", "NcRangeDetails")),
                  as.character)) %>%

    ## select relevant columns
    dplyr::select(
      # taxon and assessment info
      country_assessment, taxonomic_group, taxon, scientific_authority,
      genus, year_assesment, name_assessor, email_assessor, kobo_tabular,
      # method to define populations
      defined_populations,
      # indicator 1 data
      time_populations, Name_pop1:Comments_pop25,
      # kobo validation status
      X_validation_status,
      # uuid, this is a unique id generated by kobo for each unique record.
      # It will be used to differentiate different records even if the same
      # species is evaluated twice
      X_uuid,
      # to know if the taxon was assessed multiple times or only a single time
      multiassessment
    ) %>%

    ### Get population data as single variables
    # columns named <variable>_pop<k> become one column per <variable> plus a
    # `population` column holding "pop<k>"
    pivot_longer(cols = matches("_pop[0-9]"),
                 names_to = c(".value", "population"),
                 names_sep = "_",
                 values_drop_na = TRUE) %>%

    # omit populations w/o data (empty because they were not filled, this is ok)
    # origin is a mandatory question, so it should be answered, if not, the pop doesn't exist
    filter(Origin != "") %>%

    # change all "" (empty) cells to NA
    # only character columns can hold "", and na_if() is type-strict since
    # dplyr 1.1.0 (it errors when asked to compare a numeric column with "")
    mutate(across(where(is.character), ~ na_if(.x, ""))) %>%

    # change -999 to NA
    mutate(across(c(Ne, NeLower, NeUpper, NcPoint, NcLower, NcUpper),
                  ~ na_if(.x, -999))) %>%

    # change missing kobo_tabular to "kobo" (missing means that question was not
    # answered because the taxon had fewer populations than the min to trigger tabular)
    mutate(kobo_tabular = ifelse(is.na(kobo_tabular), "kobo", kobo_tabular))

  # return the processed data without auto-printing it
  invisible(indicator1_data)

  # End of function
}