---
title: "DwC-A Writer"
author: "Eduardo Klein and Enrique Montes (enrique.montes@noaa.gov)"
date: "10/7/2020"
output:
  html_document:
    number_sections: yes
    toc: yes
    toc_float: yes
  pdf_document:
    toc: yes
  word_document:
    toc: yes
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE)
```
This script transforms the long-format data tables used in rocky intertidal surveys of the Marine Biodiversity Observation Network Pole to Pole of the Americas ([MBON Pole to Pole](https://marinebon.org/p2p/){target="_blank"}) into Darwin Core Archive (DwC-A) files for publishing data in the Ocean Biodiversity Information System ([OBIS](https://obis.org/){target="_blank"}), following the instructions in this [manual](https://diodon.github.io/P2P_documents/PublishData/docs/PublishingDataIPT.html){target="_blank"}. It also generates an integrated file ready for data analysis.
To test this script, you will need to:
- Install R software.
- Create three folders in your working directory: Analysis, Data, IPT.
- Set your working directory to the location of these three folders (e.g. `setwd("~/your_directory")`).
- Save the [DataSheet_longformat_TEST](https://github.com/diodon/P2P-templates/blob/main/DataSheet_longformat_TEST_v2.xlsx){target="_blank"} file in the "Data" folder. This is the data table that you will substitute with your own data.
Now, just copy the code chunks below and paste them into your R console.
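If you prefer, the three folders can also be created from within R; a minimal sketch (the folder names match those used throughout this script):

```{r createFolders, eval=FALSE}
# create the three working folders if they do not exist yet
for (d in c("Data", "IPT", "Analysis")) {
  if (!dir.exists(d)) dir.create(d)
}
```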
# Basic Setup
You need a few packages to run this code. Check that the "Data", "IPT" and "Analysis" folders are created under your working directory. The [DataSheet_longformat_TEST](https://github.com/diodon/P2P-templates/blob/main/DataSheet_longformat_TEST_v2.xlsx){target="_blank"} file should be available in the "Data" folder.
```{r setupPackages}
library(lutz)
library(countrycode)
library(readxl)
library(reshape2)
library(lubridate)
library(dplyr)
library(ggplot2)
library(kableExtra)
options(dplyr.summarise.inform = FALSE)
# Create a "Data", IPT" and "Analysis" folder under your selected directory
baseDataDir = "Data"
baseIPT = "IPT"
baseAnalysis = "Analysis"
```
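If any of these packages are missing on your machine, install them once before running the rest of the script:

```{r installPackages, eval=FALSE}
# one-time installation of the packages loaded above
install.packages(c("lutz", "countrycode", "readxl", "reshape2",
                   "lubridate", "dplyr", "ggplot2", "kableExtra"))
```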
# Load your data table
Set `fileName` to the name of the long-format data table from your survey.
```{r, echo=T, warning=FALSE, message=FALSE}
# Select your data table
fileName = "DataSheet_longformat_TEST_v2.xlsx"
```
# Read file sheets
```{r, echo=T, warning=FALSE, message=FALSE}
## Extract information about your sampling site from the SiteInfo tab of your data table file
DF.sites = read_xlsx(file.path(baseDataDir, fileName),sheet = "SiteInfo")
DF.sites = DF.sites[!is.na(DF.sites$COUNTRY),]
```
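If reading fails because a sheet name in your own workbook differs, `readxl::excel_sheets()` shows what is actually available:

```{r, eval=FALSE}
# list the sheet names in the workbook before reading
readxl::excel_sheets(file.path(baseDataDir, fileName))
```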
## Convert time to UTC
```{r, echo=T, warning=FALSE, message=FALSE}
## get number of seconds from midnight
secsSTART = (1 - abs(as.numeric(julian(DF.sites$TIME_START)) - as.integer(julian(DF.sites$TIME_START)))) * (60*60*24)
secsEND = (1 - abs(as.numeric(julian(DF.sites$TIME_END)) - as.integer(julian(DF.sites$TIME_END)))) * (60*60*24)
dateChar = paste(DF.sites$YEAR, DF.sites$MONTH, DF.sites$DAY, sep="-")
## get timezone and timezone offset
timeZone = tz_lookup_coords(mean(DF.sites$LATITUDE, na.rm=T), mean(DF.sites$LONGITUDE, na.rm=T), method="accurate")
dateOffset = tz_offset(dateChar, timeZone)$utc_offset_h
## create date and time in UTC
DF.sites$eventDate = as.POSIXct(dateChar, tz="UTC")
DF.sites$TIME_START = DF.sites$eventDate + seconds(secsSTART) + hours(dateOffset)
DF.sites$TIME_END = DF.sites$eventDate + seconds(secsEND) + hours(dateOffset)
DF.sites$eventTime = paste(format(DF.sites$TIME_START, "%H:%M:%SZ"), format(DF.sites$TIME_END, "%H:%M:%SZ"), sep="/")
print(timeZone)
kable(DF.sites[1:5, c(3:5, 13:14)]) %>% kable_styling("striped")
```
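For reference, this is how the two `lutz` helpers behave on a single coordinate (the coordinate and date below are made up for illustration):

```{r, eval=FALSE}
library(lutz)
# hypothetical coordinate near Valparaiso, Chile, for illustration only
tz = tz_lookup_coords(-33.0, -71.6, method = "accurate")
print(tz)                                  # e.g. "America/Santiago"
tz_offset("2020-10-07", tz)$utc_offset_h   # offset in hours relative to UTC
```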
## Extract other fields
```{r, echo=T, warning=FALSE, message=FALSE}
# Country code
DF.sites$datasetName = paste0("MBON-P2P-biodiversity-",unique(DF.sites$countryCode))
DF.sites$countryCodeISO = countrycode(DF.sites$COUNTRY, "country.name","iso3c")
# Sampling protocol
DF.sites$samplingProtocol = "MBON-P2P_bestpractices-rockyshores"
# Sampling size value
DF.sites$samplingSizeValue = 0.25
# Sampling unit
DF.sites$samplingSizeUnit = "square meter"
print(DF.sites$countryCodeISO[1])
```
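`countrycode()` simply maps the free-text country name to its ISO 3166-1 alpha-3 code, for example:

```{r, eval=FALSE}
library(countrycode)
# free-text country names -> ISO 3166-1 alpha-3 codes
countrycode(c("Argentina", "Brazil", "United States"), "country.name", "iso3c")
## [1] "ARG" "BRA" "USA"
```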
## Extract data, taxa list and codes
```{r, echo=T, warning=FALSE, message=FALSE}
## data
DF.data = read_xlsx(file.path(baseDataDir, fileName),sheet = "DATA")
DF.data = DF.data[!is.na(DF.data$LOCALITY),]
## spp list
DF.spp = read_xlsx(file.path(baseDataDir, fileName),sheet = "sppList")
## codes
DF.countryCodes = read_xlsx(file.path(baseDataDir, fileName),sheet = "Countries")
DF.localityCodes = read_xlsx(file.path(baseDataDir, fileName),sheet = "Locality")
DF.siteCodes = read_xlsx(file.path(baseDataDir, fileName),sheet = "Sites")
DF.habitatCodes = read_xlsx(file.path(baseDataDir, fileName),sheet = "Habitat")
kable(DF.data[1:5, 1:7]) %>% kable_styling("striped")
kable(DF.spp[1:5,]) %>% kable_styling("striped")
```
# Generate IDs
```{r, echo=T, warning=FALSE, message=FALSE}
## add codes: SITES
DF.sites = left_join(DF.sites, DF.countryCodes, by = "COUNTRY")
DF.sites = left_join(DF.sites, DF.localityCodes, by = "LOCALITY")
DF.sites = left_join(DF.sites, DF.siteCodes, by = "SITE")
DF.sites = left_join(DF.sites, DF.habitatCodes, by = "HABITAT")
DF.sites$PARENT_UNIT_ID = paste(DF.sites$countryCode, DF.sites$localityCode, DF.sites$siteCode, DF.sites$habitatCode,
paste0(DF.sites$YEAR, DF.sites$MONTH, DF.sites$DAY), sep="_")
DF.sites$UNIT_ID = paste(DF.sites$PARENT_UNIT_ID, DF.sites$STRATA, sep="_")
print(DF.sites$UNIT_ID[1:6])
```
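If any of the lookup tables is missing an entry, the corresponding code joins as `NA` and `paste()` writes a literal "NA" into the ID. A rough sanity check (note it may also flag legitimate codes that happen to contain "NA"):

```{r, eval=FALSE}
# flag IDs containing a literal "NA" left by a failed join
stopifnot(!grepl("NA", DF.sites$UNIT_ID, fixed = T))
```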
# Assign codes to DATA
```{r, echo=T, warning=FALSE, message=FALSE}
## Add Aphia ID and taxa rank
DF.data = left_join(DF.data, DF.spp[,c("scientificName", "AphiaID", "Rank")])
DF.data = left_join(DF.data, DF.sites[,c("UNIT_ID", "LOCALITY", "SITE", "STRATA")])
DF.data = DF.data %>% group_by(LOCALITY, SITE, STRATA, SAMPLE) %>%
mutate(sampleOrganismID = 1:n(), scientificName, AphiaID, Rank, Variable, Value)
DF.data$occurrenceID = paste(DF.data$UNIT_ID, DF.data$SAMPLE, sprintf("%03d", DF.data$sampleOrganismID), sep="_")
print(DF.data$occurrenceID[1:20])
```
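OBIS requires `occurrenceID` to be unique within a dataset, so it is worth checking before going further:

```{r, eval=FALSE}
# occurrenceID must be unique within the dataset
sum(duplicated(DF.data$occurrenceID))  # should be 0
```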
## Convert abundance values
```{r, echo=T, warning=FALSE, message=FALSE}
## counts were made in 0.25 square-meter quadrats, so multiply by 4 to express them per square meter
DF.data$Value[DF.data$Variable=="ABUNDANCE"] = DF.data$Value[DF.data$Variable=="ABUNDANCE"] * 4
print(DF.data$Value[DF.data$Variable=="ABUNDANCE"][1:10])
```
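For example, 7 individuals counted in one 0.25 square-meter quadrat become 7 × 4 = 28 individuals per square meter.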
## Assign other IPT fields
```{r, echo=T, warning=FALSE, message=FALSE}
DF.data$basisOfRecord = "HumanObservation"
DF.data$occurrenceStatus = "present"
DF.data$scientificNameID = paste0("urn:lsid:marinespecies.org:taxname:", DF.data$AphiaID)
## fields for the eMoF
DF.data$measurementTypeID = ifelse(DF.data$Variable=="COVER",
"http://vocab.nerc.ac.uk/collection/P01/current/SDBIOL10/", ##Coverage (in assayed sample) of biological entity
"http://vocab.nerc.ac.uk/collection/P06/current/UPMS/") ## number per square meter
DF.data$measurementUnit = ifelse(DF.data$Variable=="COVER", "percent", "count")
DF.data$measurementUnitID = ifelse(DF.data$Variable=="COVER",
"http://vocab.nerc.ac.uk/collection/P06/current/UPCT/", ## percentage
"http://vocab.nerc.ac.uk/collection/P06/current/UPMS/") ## number per square meter
DF.data = DF.data %>% arrange(occurrenceID, scientificName)
kable(DF.data[1:5, ]) %>% kable_styling("striped")
```
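For example, an AphiaID of 140480 (the [WoRMS](https://www.marinespecies.org/){target="_blank"} record for *Mytilus edulis*) produces the identifier `urn:lsid:marinespecies.org:taxname:140480`.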
## Generate the IPT files (Event, Occurrence, eMoF)
```{r, echo=T, warning=FALSE, message=FALSE}
## EventCore file
IPT.event = DF.sites %>%
select(datasetName,
parentEventID=PARENT_UNIT_ID,
eventID = UNIT_ID,
samplingProtocol,
samplingSizeValue,
samplingSizeUnit,
eventDate,
eventTime,
year = YEAR,
month = MONTH,
day = DAY,
habitat = HABITAT,
eventRemarks = REMARKS,
country = COUNTRY,
countryCode = countryCodeISO,
locality = LOCALITY,
decimalLatitude = LATITUDE.x,
decimalLongitude = LONGITUDE.x,
coordinateUncertaintyInMeters = GPS_ERROR,
geodeticDatum = DATUM,
strata=STRATA)
## Remove substrate type records
DF.data.noSubstrate = DF.data %>%
filter(! grepl("substrate", scientificName, fixed = T))
## OccurrenceCore file
IPT.occurrence = DF.data.noSubstrate %>% ungroup() %>%
select(eventID = UNIT_ID,
basisOfRecord,
occurrenceID,
scientificNameID,
scientificName,
taxonRank = Rank)
## Event Measurement or Fact (eMOF) file
IPT.mof = data.frame(eventID = DF.data.noSubstrate$UNIT_ID,
occurrenceID = DF.data.noSubstrate$occurrenceID,
measurementType = tolower(DF.data.noSubstrate$Variable),
measurementTypeID = DF.data.noSubstrate$measurementTypeID,
measurementValue = DF.data.noSubstrate$Value,
measurementUnit = DF.data.noSubstrate$measurementUnit,
measurementUnitID = DF.data.noSubstrate$measurementUnitID
)
print("EventCore")
kable(IPT.event[1:3, ]) %>% kable_styling("striped")
print("OccurrenceCore")
kable(IPT.occurrence[1:3, ]) %>% kable_styling("striped")
print("MoF")
kable(IPT.mof[1:3, ]) %>% kable_styling("striped")
```
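A quick consistency check before saving: every eMoF record should reference an occurrence that exists in the occurrence table:

```{r, eval=FALSE}
# every eMoF record must point to an occurrence in the occurrence file
all(IPT.mof$occurrenceID %in% IPT.occurrence$occurrenceID)  # should be TRUE
```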
## Generate data analysis files
```{r, echo=T, warning=FALSE, message=FALSE}
## reformat to wide
DF.dataWide = dcast(DF.data, occurrenceID+LOCALITY+SITE+STRATA+SAMPLE+scientificName+AphiaID+Rank~Variable,
                    fun.aggregate = sum, value.var = "Value")
```
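Each level of `Variable` (ABUNDANCE, COVER) now has its own column, with one row per organism record:

```{r, eval=FALSE}
# quick look at the wide-format analysis table
head(DF.dataWide)
```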
## Save files
```{r, echo=T, warning=FALSE, message=FALSE}
rootFileName = paste(unique(DF.sites$countryCodeISO), unique(DF.sites$HABITAT),
gsub("-","", min(DF.sites$eventDate)), sep="_")
## IPT files
readr::write_csv(IPT.event, file = file.path(baseIPT, paste0(rootFileName, "_IPT-event.csv")))
readr::write_csv(IPT.occurrence, file = file.path(baseIPT, paste0(rootFileName, "_IPT-occurrence.csv")))
readr::write_csv(IPT.mof, file = file.path(baseIPT, paste0(rootFileName, "_IPT-mof.csv")))
## Analysis files
readr::write_csv(DF.dataWide, file = file.path(baseAnalysis, paste0(rootFileName, "_analysis.csv")))
readr::write_csv(DF.sites, file = file.path(baseAnalysis, paste0(rootFileName, "_site.csv")))
```
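You can confirm that the five CSV files were written:

```{r, eval=FALSE}
# three DwC-A files plus two analysis files
list.files(baseIPT)
list.files(baseAnalysis)
```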
**You are done! You can now upload your data (DwC-A files) to [OBIS](https://obis.org/){target="_blank"} following this How-To [manual](https://diodon.github.io/P2P_documents/PublishData/docs/PublishingDataIPT.html){target="_blank"}.**