---
title: "DwC-A Writer"
author: "Eduardo Klein and Enrique Montes (enrique.montes@noaa.gov)"
date: "10/7/2020"
output:
  html_document:
    number_sections: yes
    toc: yes
    toc_float: yes
  pdf_document:
    toc: yes
  word_document:
    toc: yes
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE)
```
This script transforms the long-format data tables used in rocky intertidal surveys of the Marine Biodiversity Observation Network Pole to Pole of the Americas ([MBON Pole to Pole](https://marinebon.org/p2p/){target="_blank"}) into Darwin Core Archive (DwC-A) files for publishing data in the Ocean Biodiversity Information System ([OBIS](https://obis.org/){target="_blank"}), following the instructions in this [manual](https://diodon.github.io/P2P_documents/PublishData/docs/PublishingDataIPT.html){target="_blank"}. It also generates an integrated file ready for data analysis.
To test this script, you will need to:
- Install R software.
- Create three folders in your working directory: Analysis, Data, IPT.
- Set your working directory to the location of these three folders (e.g. `setwd("~/your_directory")`).
- Save the [DataSheet_longformat_TEST](https://github.com/diodon/P2P-templates/blob/main/DataSheet_longformat_TEST_v2.xlsx){target="_blank"} file in the "Data" folder. This is the data table that you will substitute with your own data.
Now, just copy the code chunks below and paste them into your R console.
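If you prefer, the three folders can also be created from within R; a minimal sketch (the folder names match those used throughout this script):

```{r createFolders, eval=FALSE}
# create the three working folders if they do not exist yet
for (d in c("Data", "IPT", "Analysis")) {
  if (!dir.exists(d)) dir.create(d)
}
```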
# Basic Setup
You need a few packages to run this code. Check that the "Data", "IPT" and "Analysis" folders are created under your working directory. The [DataSheet_longformat_TEST](https://github.com/diodon/P2P-templates/blob/main/DataSheet_longformat_TEST_v2.xlsx){target="_blank"} file should be available in the "Data" folder.
```{r setupPackages}
library(lutz)
library(countrycode)
library(readxl)
library(reshape2)
library(lubridate)
library(dplyr)
library(ggplot2)
library(kableExtra)
options(dplyr.summarise.inform = FALSE)
# Create a "Data", IPT" and "Analysis" folder under your selected directory
baseDataDir = "Data"
baseIPT = "IPT"
baseAnalysis = "Analysis"
```
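If any of these packages are missing on your machine, install them once before running the rest of the script:

```{r installPackages, eval=FALSE}
# one-time installation of the packages loaded above
install.packages(c("lutz", "countrycode", "readxl", "reshape2",
                   "lubridate", "dplyr", "ggplot2", "kableExtra"))
```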
# Load your data table
Set `fileName` to the name of the long-format data table from your survey.
```{r, echo=T, warning=FALSE, message=FALSE}
# Select your data table
fileName = "DataSheet_longformat_TEST_v2.xlsx"
```
# Read file sheets
```{r, echo=T, warning=FALSE, message=FALSE}
## Extract information about your sampling site from the SiteInfo tab of your data table file
DF.sites = read_xlsx(file.path(baseDataDir, fileName),sheet = "SiteInfo")
DF.sites = DF.sites[!is.na(DF.sites$COUNTRY),]
```
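If reading fails because a sheet name in your own workbook differs, `readxl::excel_sheets()` shows what is actually available:

```{r, eval=FALSE}
# list the sheet names in the workbook before reading
readxl::excel_sheets(file.path(baseDataDir, fileName))
```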
## Convert time to UTC
```{r, echo=T, warning=FALSE, message=FALSE}
## get number of seconds from midnight
secsSTART = (1 - abs(as.numeric(julian(DF.sites$TIME_START)) - as.integer(julian(DF.sites$TIME_START)))) * (60*60*24)
secsEND = (1 - abs(as.numeric(julian(DF.sites$TIME_END)) - as.integer(julian(DF.sites$TIME_END)))) * (60*60*24)
dateChar = paste(DF.sites$YEAR, DF.sites$MONTH, DF.sites$DAY, sep="-")
## get timezone and timezone offset
timeZone = tz_lookup_coords(mean(DF.sites$LATITUDE, na.rm=T), mean(DF.sites$LONGITUDE, na.rm=T), method="accurate")
dateOffset = tz_offset(dateChar, timeZone)$utc_offset_h
## create date and time in UTC
DF.sites$eventDate = as.POSIXct(dateChar, tz="UTC")
DF.sites$TIME_START = DF.sites$eventDate + seconds(secsSTART) + hours(dateOffset)
DF.sites$TIME_END = DF.sites$eventDate + seconds(secsEND) + hours(dateOffset)
DF.sites$eventTime = paste(format(DF.sites$TIME_START, "%H:%M:%SZ"), format(DF.sites$TIME_END, "%H:%M:%SZ"), sep="/")
print(timeZone)
kable(DF.sites[1:5, c(3:5, 13:14)]) %>% kable_styling("striped")
```
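For reference, this is how the two `lutz` helpers behave on a single coordinate (the coordinate and date below are made up for illustration):

```{r, eval=FALSE}
library(lutz)
# hypothetical coordinate near Valparaiso, Chile, for illustration only
tz = tz_lookup_coords(-33.0, -71.6, method = "accurate")
print(tz)                                  # e.g. "America/Santiago"
tz_offset("2020-10-07", tz)$utc_offset_h   # offset in hours relative to UTC
```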
## Extract other fields
```{r, echo=T, warning=FALSE, message=FALSE}
# Country code
DF.sites$datasetName = paste0("MBON-P2P-biodiversity-",unique(DF.sites$countryCode))
DF.sites$countryCodeISO = countrycode(DF.sites$COUNTRY, "country.name","iso3c")
# Sampling protocol
DF.sites$samplingProtocol = "MBON-P2P_bestpractices-rockyshores"
# Sampling size value
DF.sites$samplingSizeValue = 0.25
# Sampling unit
DF.sites$samplingSizeUnit = "square meter"
print(DF.sites$countryCodeISO[1])
```
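`countrycode()` simply maps the free-text country name to its ISO 3166-1 alpha-3 code, for example:

```{r, eval=FALSE}
library(countrycode)
# free-text country names -> ISO 3166-1 alpha-3 codes
countrycode(c("Argentina", "Brazil", "United States"), "country.name", "iso3c")
## [1] "ARG" "BRA" "USA"
```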
## Extract data, taxa list and codes
```{r, echo=T, warning=FALSE, message=FALSE}
## data
DF.data = read_xlsx(file.path(baseDataDir, fileName),sheet = "DATA")
DF.data = DF.data[!is.na(DF.data$LOCALITY),]
## spp list
DF.spp = read_xlsx(file.path(baseDataDir, fileName),sheet = "sppList")
## codes
DF.countryCodes = read_xlsx(file.path(baseDataDir, fileName),sheet = "Countries")
DF.localityCodes = read_xlsx(file.path(baseDataDir, fileName),sheet = "Locality")
DF.siteCodes = read_xlsx(file.path(baseDataDir, fileName),sheet = "Sites")
DF.habitatCodes = read_xlsx(file.path(baseDataDir, fileName),sheet = "Habitat")
kable(DF.data[1:5, 1:7]) %>% kable_styling("striped")
kable(DF.spp[1:5,]) %>% kable_styling("striped")
```
# Generate IDs
```{r, echo=T, warning=FALSE, message=FALSE}
## add codes: SITES
DF.sites = left_join(DF.sites, DF.countryCodes, by = "COUNTRY")
DF.sites = left_join(DF.sites, DF.localityCodes, by = "LOCALITY")
DF.sites = left_join(DF.sites, DF.siteCodes, by = "SITE")
DF.sites = left_join(DF.sites, DF.habitatCodes, by = "HABITAT")
DF.sites$PARENT_UNIT_ID = paste(DF.sites$countryCode, DF.sites$localityCode, DF.sites$siteCode, DF.sites$habitatCode,
paste0(DF.sites$YEAR, DF.sites$MONTH, DF.sites$DAY), sep="_")
DF.sites$UNIT_ID = paste(DF.sites$PARENT_UNIT_ID, DF.sites$STRATA, sep="_")
print(DF.sites$UNIT_ID[1:6])
```
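If any of the lookup tables is missing an entry, the corresponding code joins as `NA` and `paste()` writes a literal "NA" into the ID. A rough sanity check (note it may also flag legitimate codes that happen to contain "NA"):

```{r, eval=FALSE}
# flag IDs containing a literal "NA" left by a failed join
stopifnot(!grepl("NA", DF.sites$UNIT_ID, fixed = T))
```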
# Assign codes to DATA
```{r, echo=T, warning=FALSE, message=FALSE}
## Add Aphia ID and taxa rank
DF.data = left_join(DF.data, DF.spp[,c("scientificName", "AphiaID", "Rank")])
DF.data = left_join(DF.data, DF.sites[,c("UNIT_ID", "LOCALITY", "SITE", "STRATA")])
DF.data = DF.data %>% group_by(LOCALITY, SITE, STRATA, SAMPLE) %>%
mutate(sampleOrganismID = 1:n(), scientificName, AphiaID, Rank, Variable, Value)
DF.data$occurrenceID = paste(DF.data$UNIT_ID, DF.data$SAMPLE, sprintf("%03d", DF.data$sampleOrganismID), sep="_")
print(DF.data$occurrenceID[1:20])
```
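OBIS requires `occurrenceID` to be unique within a dataset, so it is worth checking before going further:

```{r, eval=FALSE}
# occurrenceID must be unique within the dataset
sum(duplicated(DF.data$occurrenceID))  # should be 0
```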
## Convert abundance values
```{r, echo=T, warning=FALSE, message=FALSE}
## counts were made in 0.25 square-meter quadrats, so multiply by 4 to express them per square meter
DF.data$Value[DF.data$Variable=="ABUNDANCE"] = DF.data$Value[DF.data$Variable=="ABUNDANCE"] * 4
print(DF.data$Value[DF.data$Variable=="ABUNDANCE"][1:10])
```
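For example, 7 individuals counted in one 0.25 square-meter quadrat become 7 × 4 = 28 individuals per square meter.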
## Assign other IPT fields
```{r, echo=T, warning=FALSE, message=FALSE}
DF.data$basisOfRecord = "HumanObservation"
DF.data$occurrenceStatus = "present"
DF.data$scientificNameID = paste0("urn:lsid:marinespecies.org:taxname:", DF.data$AphiaID)
## fields for the eMoF
DF.data$measurementTypeID = ifelse(DF.data$Variable=="COVER",
"http://vocab.nerc.ac.uk/collection/P01/current/SDBIOL10/", ##Coverage (in assayed sample) of biological entity
"http://vocab.nerc.ac.uk/collection/P06/current/UPMS/") ## number per square meter
DF.data$measurementUnit = ifelse(DF.data$Variable=="COVER", "percent", "count")
DF.data$measurementUnitID = ifelse(DF.data$Variable=="COVER",
"http://vocab.nerc.ac.uk/collection/P06/current/UPCT/", ## percentage
"http://vocab.nerc.ac.uk/collection/P06/current/UPMS/") ## number per square meter
DF.data = DF.data %>% arrange(occurrenceID, scientificName)
kable(DF.data[1:5, ]) %>% kable_styling("striped")
```
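For example, an AphiaID of 140480 (the [WoRMS](https://www.marinespecies.org/){target="_blank"} record for *Mytilus edulis*) produces the identifier `urn:lsid:marinespecies.org:taxname:140480`.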
## Generate the IPT files (Event, Occurrence, eMoF)
```{r, echo=T, warning=FALSE, message=FALSE}
## EventCore file
IPT.event = DF.sites %>%
select(datasetName,
parentEventID=PARENT_UNIT_ID,
eventID = UNIT_ID,
samplingProtocol,
samplingSizeValue,
samplingSizeUnit,
eventDate,
eventTime,
year = YEAR,
month = MONTH,
day = DAY,
habitat = HABITAT,
eventRemarks = REMARKS,
country = COUNTRY,
countryCode = countryCodeISO,
locality = LOCALITY,
decimalLatitude = LATITUDE.x,
decimalLongitude = LONGITUDE.x,
coordinateUncertaintyInMeters = GPS_ERROR,
geodeticDatum = DATUM,
strata=STRATA)
## Remove substrate type records
DF.data.noSubstrate = DF.data %>%
filter(! grepl("substrate", scientificName, fixed = T))
## OccurrenceCore file
IPT.occurrence = DF.data.noSubstrate %>% ungroup() %>%
select(eventID = UNIT_ID,
basisOfRecord,
occurrenceID,
scientificNameID,
scientificName,
taxonRank = Rank)
## Event Measurement or Fact (eMOF) file
IPT.mof = data.frame(eventID = DF.data.noSubstrate$UNIT_ID,
occurrenceID = DF.data.noSubstrate$occurrenceID,
measurementType = tolower(DF.data.noSubstrate$Variable),
measurementTypeID = DF.data.noSubstrate$measurementTypeID,
measurementValue = DF.data.noSubstrate$Value,
measurementUnit = DF.data.noSubstrate$measurementUnit,
measurementUnitID = DF.data.noSubstrate$measurementUnitID
)
print("EventCore")
kable(IPT.event[1:3, ]) %>% kable_styling("striped")
print("OccurrenceCore")
kable(IPT.occurrence[1:3, ]) %>% kable_styling("striped")
print("MoF")
kable(IPT.mof[1:3, ]) %>% kable_styling("striped")
```
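A quick consistency check before saving: every eMoF record should reference an occurrence that exists in the occurrence table:

```{r, eval=FALSE}
# every eMoF record must point to an occurrence in the occurrence file
all(IPT.mof$occurrenceID %in% IPT.occurrence$occurrenceID)  # should be TRUE
```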
## Generate data analysis files
```{r, echo=T, warning=FALSE, message=FALSE}
## reformat to wide
DF.dataWide = dcast(DF.data, occurrenceID+LOCALITY+SITE+STRATA+SAMPLE+scientificName+AphiaID+Rank~Variable,
                    fun.aggregate = sum, value.var = "Value")
```
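Each level of `Variable` (ABUNDANCE, COVER) now has its own column, with one row per organism record:

```{r, eval=FALSE}
# quick look at the wide-format analysis table
head(DF.dataWide)
```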
## Save files
```{r, echo=T, warning=FALSE, message=FALSE}
rootFileName = paste(unique(DF.sites$countryCodeISO), unique(DF.sites$HABITAT),
gsub("-","", min(DF.sites$eventDate)), sep="_")
## IPT files
readr::write_csv(IPT.event, file = file.path(baseIPT, paste0(rootFileName, "_IPT-event.csv")))
readr::write_csv(IPT.occurrence, file = file.path(baseIPT, paste0(rootFileName, "_IPT-occurrence.csv")))
readr::write_csv(IPT.mof, file = file.path(baseIPT, paste0(rootFileName, "_IPT-mof.csv")))
## Analysis files
readr::write_csv(DF.dataWide, file = file.path(baseAnalysis, paste0(rootFileName, "_analysis.csv")))
readr::write_csv(DF.sites, file = file.path(baseAnalysis, paste0(rootFileName, "_site.csv")))
```
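You can confirm that the five CSV files were written:

```{r, eval=FALSE}
# three DwC-A files plus two analysis files
list.files(baseIPT)
list.files(baseAnalysis)
```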
**You are done! You can now upload your data (DwC-A files) to [OBIS](https://obis.org/){target="_blank"} following this How-To [manual](https://diodon.github.io/P2P_documents/PublishData/docs/PublishingDataIPT.html){target="_blank"}.**