# get_indicator1_data.R

###
### This R function takes as input the output of the Kobo form "International Genetic Indicator testing" 
### and reformats it into a data frame useful for estimating 
### Genetic Diversity Indicator 1 (the proportion of populations within species with 
###                                a genetically effective size, Ne, greater than 500.)
### 

### If you use this script, please check https://github.com/AliciaMstt/GeneticIndicators 
### for citation guidelines

get_indicator1_data<-function(kobo_output=kobo_output){
  ###
  ### Purpose:
  ###
  
  # Reshape the Kobo export so that every row describes ONE population of one
  # assessed taxon, keeping the assessment metadata alongside the
  # per-population variables needed for Genetic Diversity Indicator 1.
  
  ###
  ### Arguments:
  ###
  
  # kobo_output = a data frame object read into R from the `.csv` file 
  # resulting from exporting the Kobotoolbox data from the form 
  # "International Genetic Indicator testing" with the settings explained at
  # https://github.com/AliciaMstt/GeneticIndicators
  #
  # NOTE(review): the default `kobo_output=kobo_output` refers to itself, so
  # the argument must always be supplied explicitly by the caller.
  
  ###
  ### Value:
  ###
  
  # A data frame in long format: the assessment-level columns selected below,
  # a `population` column holding the suffix of the original column names
  # (e.g. "pop1"), and one column per per-population variable (the part of
  # the original column name before the underscore, e.g. `Name`, `Origin`).
  
  ### Needed libraries:  
  
  #  library(tidyr)
  #  library(dplyr)
  #  library(utile.tools)
  #  library(stringr)
  #
  # dplyr (or magrittr) must be attached because the `%>%` pipe is used
  # unqualified; the other functions are called through their namespace.
  
  ###
  ### Function  
  ### 
  
  # fail early with a clear message if something other than a data frame is given
  stopifnot(is.data.frame(kobo_output))
  
  ### Taxon name
  
  # create a variable with the full taxon name if this variable doesn't exist already
  # (raw kobo output doesn't include it, but it may exist in a "clean" version of the 
  # output if run through the quality check pipeline)
  if ("taxon" %in% colnames(kobo_output)) {
    print("the data already contained a taxon column, that was used instead of creating a new one")
  } else {
    kobo_output <- kobo_output %>%
      dplyr::mutate(
        # paste the name parts skipping missing ones, then remove the white
        # space left at the end of the name
        taxon = stringr::str_trim(
          utile.tools::paste(genus, species, subspecies_variety, na.rm = TRUE),
          side = "right"
        )
      )
  }
  
  ### Flag taxa assessed multiple times within a single country
  
  # duplicated() is run twice, the second time with fromLast = TRUE, so that 
  # the first occurrence is also accounted for, i.e. every record sharing the
  # same taxon + country combination is marked
  assessment_keys <- kobo_output[c("taxon", "country_assessment")]
  is_repeated <- duplicated(assessment_keys) |
    duplicated(assessment_keys, fromLast = TRUE)
  repeated_uuids <- kobo_output[["X_uuid"]][is_repeated]
  
  # if the record belongs to a repeated taxon then tag it as multiassessment,
  # if it is not duplicated within the country then single_assessment
  kobo_output <- kobo_output %>%
    dplyr::mutate(
      multiassessment = dplyr::if_else(
        X_uuid %in% repeated_uuids, "multiassessment", "single_assessment"
      )
    )
  
  ### Process data already including taxon column and multiassessment
  
  indicator1_data <- kobo_output %>%
    
    # create variable with year in which assessment was done (first four
    # characters of the date the form was completed). The misspelled column
    # name `year_assesment` is kept because it is part of the output.
    dplyr::mutate(year_assesment = substr(end, 1, 4)) %>%
    
    # make sure some variables that seem numbers are actually character,
    # because there may be character and integer values depending on how the
    # data was written; pivot_longer() below needs the *_popN columns of a
    # given variable to share one type
    dplyr::mutate(
      dplyr::across(starts_with("IntroductionYear"), as.character),
      dplyr::across(starts_with("NeYear"), as.character),
      dplyr::across(starts_with("NcYear"), as.character),
      dplyr::across(starts_with("NcRangeDetails"), as.character)
    ) %>%
    
    ## select relevant columns 
    dplyr::select(
      # taxon and assessment info
      country_assessment, taxonomic_group, taxon, scientific_authority,
      genus, year_assesment, name_assessor, email_assessor, kobo_tabular,
      
      # method to define populations
      defined_populations,
      
      # indicator 1 data
      time_populations, Name_pop1:Comments_pop25,
      
      # kobo validation status
      X_validation_status,
      
      # uuid, a unique id generated by kobo for each record. It is used to 
      # differentiate records even if the same species is evaluated twice
      X_uuid,
      
      # to know if the taxon was assessed multiple times or only a single time
      multiassessment
    ) %>%
    
    ### Get population data as single variables
    # "<variable>_pop<N>" columns become one `<variable>` column plus a
    # `population` column holding "pop<N>"
    tidyr::pivot_longer(
      cols = matches("_pop[0-9]"),
      names_to = c(".value", "population"),
      names_sep = "_",
      values_drop_na = TRUE
    ) %>%
    
    # omit populations without data (empty because they were not filled, this is ok).
    # Origin is a mandatory question, so it should be answered; if not, the pop doesn't exist
    dplyr::filter(Origin != "") %>%
    
    # change all "" (empty) cells to NA. Only character columns are touched:
    # recent dplyr versions cast the second argument of na_if() to the type
    # of the column, so applying na_if(., "") to numeric or logical columns
    # raises an error
    dplyr::mutate(
      dplyr::across(where(is.character), ~ dplyr::na_if(.x, ""))
    ) %>%
    
    # change -999 to NA
    dplyr::mutate(
      Ne = dplyr::na_if(Ne, -999),
      NeLower = dplyr::na_if(NeLower, -999),
      NeUpper = dplyr::na_if(NeUpper, -999),
      NcPoint = dplyr::na_if(NcPoint, -999),
      NcLower = dplyr::na_if(NcLower, -999),
      NcUpper = dplyr::na_if(NcUpper, -999)
    ) %>%
    
    # change missing kobo_tabular to "kobo" (missing means that the question was not 
    # answered because the taxon had fewer populations than the min to trigger tabular)
    dplyr::mutate(
      kobo_tabular = ifelse(is.na(kobo_tabular), "kobo", kobo_tabular)
    )
  
  # return the reshaped data explicitly (ending on an assignment would 
  # return it invisibly)
  indicator1_data
  
  # End of function
}