From 338c6d63cabe08eafa313b3a9afae30062917caf Mon Sep 17 00:00:00 2001 From: Dasun Pubudumal Date: Tue, 27 Feb 2024 11:35:56 +0000 Subject: [PATCH 1/3] Adding the file where the seeds are fetched from for countries list --- data/ena_sample_checklists/.gitignore | 1 + data/ena_sample_checklists/ERC000011.xml | 1213 ++++++++++++++++++++++ 2 files changed, 1214 insertions(+) create mode 100644 data/ena_sample_checklists/ERC000011.xml diff --git a/data/ena_sample_checklists/.gitignore b/data/ena_sample_checklists/.gitignore index b8818a7869..8a147fd1c5 100644 --- a/data/ena_sample_checklists/.gitignore +++ b/data/ena_sample_checklists/.gitignore @@ -1,2 +1,3 @@ # We might be able to store these in the repo, but need to double check licenses *.xml +!ERC000011.xml diff --git a/data/ena_sample_checklists/ERC000011.xml b/data/ena_sample_checklists/ERC000011.xml new file mode 100644 index 0000000000..9739dd8816 --- /dev/null +++ b/data/ena_sample_checklists/ERC000011.xml @@ -0,0 +1,1213 @@ + + + + + ERC000011 + + + + ENA default sample checklist + Minimum information required for the sample + ENA + + Part and developmental stage of organism + Anatomical and developmental descriptions of the sample site or source material + + + cell_type + cell type from which the sample was obtained + + + + optional + multiple + + + + dev_stage + if the sample was obtained from an organism in a specific developmental stage, it is specified with this qualifier + + + + optional + multiple + + + + germline + the sample described presented in the entry has not undergone somatic genomic rearrangement as part of an adaptive immune response; it is the unrearranged molecule that was inherited from the parental germline + + + + optional + multiple + + + + tissue_lib + tissue library from which sample was obtained + + + + optional + multiple + + + + tissue_type + tissue type from which the sample was obtained + + + + optional + multiple + + + + Collection event information + + + isolation_source + describes the physical, environmental and/or local geographical source of the biological sample from which the sample was derived + + + + optional + multiple + + + + lat_lon + geographical coordinates of the location where the specimen was collected + + + + optional + multiple + + + + collected_by + name of persons or institute who collected the specimen + + + + optional + multiple + + + + Event Date/Time + collection_date + collection date + The date the sample was collected with the intention of sequencing, either as an instance (single point in time) or interval. In case no exact time is available, the date/time can be right truncated i.e. all of these are valid ISO8601 compliant times: 2008-01-23T19:23:10+00:00; 2008-01-23T19:23:10; 2008-01-23; 2008-01; 2008. + + + (^[12][0-9]{3}(-(0[1-9]|1[0-2])(-(0[1-9]|[12][0-9]|3[01])(T[0-9]{2}:[0-9]{2}(:[0-9]{2})?Z?([+-][0-9]{1,2})?)?)?)?(/[0-9]{4}(-[0-9]{2}(-[0-9]{2}(T[0-9]{2}:[0-9]{2}(:[0-9]{2})?Z?([+-][0-9]{1,2})?)?)?)?)?$)|(^not collected$)|(^not provided$)|(^restricted access$)|(^missing: control sample$)|(^missing: sample group$)|(^missing: synthetic construct$)|(^missing: lab stock$)|(^missing: third party data$)|(^missing: data agreement established pre-2023$)|(^missing: endangered species$)|(^missing: human-identifiable$) + + + mandatory + multiple + + + + geographic location (country and/or sea) + The location the sample was collected from with the intention of sequencing, as defined by the country or sea. Country or sea names should be chosen from the INSDC country list (http://insdc.org/country.html). + + + + Afghanistan + + + Albania + + + Algeria + + + American Samoa + + + Andorra + + + Angola + + + Anguilla + + + Antarctica + + + Antigua and Barbuda + + + Arctic Ocean + + + Argentina + + + Armenia + + + Aruba + + + Ashmore and Cartier Islands + + + Atlantic Ocean + + + Australia + + + Austria + + + Azerbaijan + + + Bahamas + + + Bahrain + + + Baker Island + + + Baltic Sea + + + Bangladesh + + + Barbados + + + Bassas da India + + + Belarus + + + Belgium + + + Belize + + + Benin + + + Bermuda + + + Bhutan + + + Bolivia + + + Borneo + + + Bosnia and Herzegovina + + + Botswana + + + Bouvet Island + + + Brazil + + + British Virgin Islands + + + Brunei + + + Bulgaria + + + Burkina Faso + + + Burundi + + + Cambodia + + + Cameroon + + + Canada + + + Cape Verde + + + Cayman Islands + + + Central African Republic + + + Chad + + + Chile + + + China + + + Christmas Island + + + Clipperton Island + + + Cocos Islands + + + Colombia + + + Comoros + + + Cook Islands + + + Coral Sea Islands + + + Costa Rica + + + Cote d'Ivoire + + + Croatia + + + Cuba + + + Curacao + + + Cyprus + + + Czech Republic + + + Democratic Republic of the Congo + + + Denmark + + + Djibouti + + + Dominica + + + Dominican Republic + + + East Timor + + + Ecuador + + + Egypt + + + El Salvador + + + Equatorial Guinea + + + Eritrea + + + Estonia + + + Ethiopia + + + Europa Island + + + Falkland Islands (Islas Malvinas) + + + Faroe Islands + + + Fiji + + + Finland + + + France + + + French Guiana + + + French Polynesia + + + French Southern and Antarctic Lands + + + Gabon + + + Gambia + + + Gaza Strip + + + Georgia + + + Germany + + + Ghana + + + Gibraltar + + + Glorioso Islands + + + Greece + + + Greenland + + + Grenada + + + Guadeloupe + + + Guam + + + Guatemala + + + Guernsey + + + Guinea + + + Guinea-Bissau + + + Guyana + + + Haiti + + + Heard Island and McDonald Islands + + + Honduras + + + Hong Kong + + + Howland Island + + + Hungary + + + Iceland + + + India + + + Indian Ocean + + + Indonesia + + + Iran + + + Iraq + + + Ireland + + + Isle of Man + + + Israel + + + Italy + + + Jamaica + + + Jan Mayen + + + Japan + + + Jarvis Island + + + Jersey + + + Johnston Atoll + + + Jordan + + + Juan de Nova Island + + + Kazakhstan + + + Kenya + + + Kerguelen Archipelago + + + Kingman Reef + + + Kiribati + + + Kosovo + + + Kuwait + + + Kyrgyzstan + + + Laos + + + Latvia + + + Lebanon + + + Lesotho + + + Liberia + + + Libya + + + Liechtenstein + + + Lithuania + + + Luxembourg + + + Macau + + + Macedonia + + + Madagascar + + + Malawi + + + Malaysia + + + Maldives + + + Mali + + + Malta + + + Marshall Islands + + + Martinique + + + Mauritania + + + Mauritius + + + Mayotte + + + Mediterranean Sea + + + Mexico + + + Micronesia + + + Midway Islands + + + Moldova + + + Monaco + + + Mongolia + + + Montenegro + + + Montserrat + + + Morocco + + + Mozambique + + + Myanmar + + + Namibia + + + Nauru + + + Navassa Island + + + Nepal + + + Netherlands + + + New Caledonia + + + New Zealand + + + Nicaragua + + + Niger + + + Nigeria + + + Niue + + + Norfolk Island + + + North Korea + + + North Sea + + + Northern Mariana Islands + + + Norway + + + Oman + + + Pacific Ocean + + + Pakistan + + + Palau + + + Palmyra Atoll + + + Panama + + + Papua New Guinea + + + Paracel Islands + + + Paraguay + + + Peru + + + Philippines + + + Pitcairn Islands + + + Poland + + + Portugal + + + Puerto Rico + + + Qatar + + + Republic of the Congo + + + Reunion + + + Romania + + + Ross Sea + + + Russia + + + Rwanda + + + Saint Helena + + + Saint Kitts and Nevis + + + Saint Lucia + + + Saint Pierre and Miquelon + + + Saint Vincent and the Grenadines + + + Samoa + + + San Marino + + + Sao Tome and Principe + + + Saudi Arabia + + + Senegal + + + Serbia + + + Seychelles + + + Sierra Leone + + + Singapore + + + Sint Maarten + + + Slovakia + + + Slovenia + + + Solomon Islands + + + Somalia + + + South Africa + + + South Georgia and the South Sandwich Islands + + + South Korea + + + Southern Ocean + + + Spain + + + Spratly Islands + + + Sri Lanka + + + Sudan + + + Suriname + + + Svalbard + + + Swaziland + + + Sweden + + + Switzerland + + + Syria + + + Taiwan + + + Tajikistan + + + Tanzania + + + Tasman Sea + + + Thailand + + + Togo + + + Tokelau + + + Tonga + + + Trinidad and Tobago + + + Tromelin Island + + + Tunisia + + + Turkey + + + Turkmenistan + + + Turks and Caicos Islands + + + Tuvalu + + + USA + + + Uganda + + + Ukraine + + + United Arab Emirates + + + United Kingdom + + + Uruguay + + + Uzbekistan + + + Vanuatu + + + Venezuela + + + Viet Nam + + + Virgin Islands + + + Wake Island + + + Wallis and Futuna + + + West Bank + + + Western Sahara + + + Yemen + + + Zambia + + + Zimbabwe + + + missing: control sample + + + missing: data agreement established pre-2023 + + + missing: endangered species + + + missing: human-identifiable + + + missing: lab stock + + + missing: sample group + + + missing: synthetic construct + + + missing: third party data + + + not applicable + + + not collected + + + not provided + + + restricted access + + + + mandatory + multiple + + + + geographic location (region and locality) + The geographical origin of the sample as defined by the specific region name followed by the locality name. + + + + optional + multiple + + + + identified_by + name of the expert who identified the specimen taxonomically + + + + optional + multiple + + + + sample collection + + + environmental_sample + identifies sequences derived by direct molecular isolation from a bulk environmental DNA sample (by PCR with or without subsequent cloning of the product, DGGE, or other anonymous methods) with no reliable identification of the source organism + + + + No + + + Yes + + + + optional + multiple + + + + Organism characteristics + Characteristics of the source organism + + + mating_type + mating type of the organism from which the sequence was obtained; mating type is used for prokaryotes, and for eukaryotes that undergo meiosis without sexually dimorphic gametes + + + + optional + multiple + + + + sex + sex of the organism from which the sample was obtained + + + + optional + multiple + + + + host description + + + lab_host + scientific name of the laboratory host used to propagate the source organism from which the sample was obtained + + + + optional + multiple + + + + specific host + host scientific name + Scientific name of the natural (as opposed to laboratory) host to the organism from which sample was obtained. + + + + optional + multiple + + + + Pointer to physical material + References to sample or sample source material in physical resources + + + bio_material + Unique identifier that references the biological material from which the sample was obtained and that ideally exists in a curated collection (e.g. stock centres, seed banks, DNA banks). The ID should have the following structure: name of the institution (institution code) followed by the collection code (if available) and the voucher id (institution_code:collection_code:voucher_id). Please note institution codes and collection codes are taken from a controlled vocabulary maintained by the INSDC: https://ftp.ncbi.nih.gov/pub/taxonomy/biocollections/ + + + + optional + multiple + + + + culture_collection + Unique identifier that references the culture (e.g. live microbial and viral cultures and cell lines) from which the sample has been obtained and that have been deposited in curated culture collections. The ID needs to provide an institution code and the culture id, with optional collection code, in the following structure: (-institution_code:(collection_code):voucher_id. Please note institution codes (and optional collection codes) are taken from a controlled vocabulary maintained by the INSDC: https://ftp.ncbi.nih.gov/pub/taxonomy/biocollections/ + + + + optional + multiple + + + + specimen_voucher + Unique identifier that references the physical specimen that remains after the sequence has been obtained and that ideally exists in a curated collection. The ID should have the following structure: name of the institution (institution code) followed by the collection code (if available) and the voucher id (institution_code:collection_code:voucher_id). Please note institution codes and collection codes are taken from a controlled vocabulary maintained by the INSDC: https://ftp.ncbi.nih.gov/pub/taxonomy/biocollections/ + + + + optional + multiple + + + + Infraspecies information + Formal and informal infraspecies taxonomic information + + + cultivar + cultivar (cultivated variety) of plant from which sample was obtained + + + + optional + multiple + + + + ecotype + a population within a given species displaying genetically based, phenotypic traits that reflect adaptation to a local habitat. + + + + optional + multiple + + + + isolate + individual isolate from which the sample was obtained + + + + optional + multiple + + + + sub_species + name of sub-species of organism from which sample was obtained + + + + optional + multiple + + + + variety + variety (= varietas, a formal Linnaean rank) of organism from which sample was derived. + + + + optional + multiple + + + + sub_strain + name or identifier of a genetically or otherwise modified strain from which sample was obtained, derived from a parental strain (which should be annotated in the strain field; sub_strain from which sample was obtained + + + + optional + multiple + + + + cell_line + cell line from which the sample was obtained + + + + optional + multiple + + + + serotype + serological variety of a species characterized by its antigenic properties + + + + optional + multiple + + + + serovar + serological variety of a species (usually a prokaryote) characterized by its antigenic properties + + + + optional + multiple + + + + strain + Name of the strain from which the sample was obtained. + + + + optional + multiple + + + + + From 103104e34c760746cb2d193f2bf187b06cf757a1 Mon Sep 17 00:00:00 2001 From: Dasun Pubudumal Date: Wed, 28 Feb 2024 10:17:25 +0000 Subject: [PATCH 2/3] Conditioning download based on environment for minimum impact --- lib/tasks/insdc/import_countries.rake | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/tasks/insdc/import_countries.rake b/lib/tasks/insdc/import_countries.rake index 8cb4917ee3..66e0f846aa 100644 --- a/lib/tasks/insdc/import_countries.rake +++ b/lib/tasks/insdc/import_countries.rake @@ -21,8 +21,10 @@ namespace :insdc do desc 'Download the sample sheet with the accession number specified by [sample_checklist] ' \ "(#{INSDC_COUNTRIES_DEFAULTS[:sample_checklist]} by default)" task :download, %i[sample_checklist ean_root] => :environment do |_t, args| - args.with_defaults(INSDC_COUNTRIES_DEFAULTS) - Insdc::ImportCountries.new(**args.to_h, priorities: INSDC_COUNTRIES_PRIORITIES).download + unless Rails.env.development? || Rails.env.test? + args.with_defaults(INSDC_COUNTRIES_DEFAULTS) + Insdc::ImportCountries.new(**args.to_h, priorities: INSDC_COUNTRIES_PRIORITIES).download + end end desc 'Download and import countries from the sample sheet with the accession number specified by ' \ From 86f631e59e9a8da05bdfe37fab2b15828c6e1c58 Mon Sep 17 00:00:00 2001 From: Dasun Pubudumal Date: Wed, 28 Feb 2024 11:15:27 +0000 Subject: [PATCH 3/3] Updating the import rake job to not include the download job as it can be executed manually if required --- lib/tasks/insdc/import_countries.rake | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/lib/tasks/insdc/import_countries.rake b/lib/tasks/insdc/import_countries.rake index 66e0f846aa..ea5a119aab 100644 --- a/lib/tasks/insdc/import_countries.rake +++ b/lib/tasks/insdc/import_countries.rake @@ -21,15 +21,13 @@ namespace :insdc do desc 'Download the sample sheet with the accession number specified by [sample_checklist] ' \ "(#{INSDC_COUNTRIES_DEFAULTS[:sample_checklist]} by default)" task :download, %i[sample_checklist ean_root] => :environment do |_t, args| - unless Rails.env.development? || Rails.env.test? - args.with_defaults(INSDC_COUNTRIES_DEFAULTS) - Insdc::ImportCountries.new(**args.to_h, priorities: INSDC_COUNTRIES_PRIORITIES).download - end + args.with_defaults(INSDC_COUNTRIES_DEFAULTS) + Insdc::ImportCountries.new(**args.to_h, priorities: INSDC_COUNTRIES_PRIORITIES).download end desc 'Download and import countries from the sample sheet with the accession number specified by ' \ "[sample_checklist] (#{INSDC_COUNTRIES_DEFAULTS[:sample_checklist]} by default)" - task :import, %i[sample_checklist ean_root] => :download do |_t, args| + task :import, %i[sample_checklist ean_root] => :environment do |_t, args| args.with_defaults(INSDC_COUNTRIES_DEFAULTS) Insdc::ImportCountries.new(**args.to_h, priorities: INSDC_COUNTRIES_PRIORITIES).import end