Skip to content

Commit

Permalink
issue #2: first draft of Dublin Core record creation
Browse files Browse the repository at this point in the history
I've created a Mapper class and am feeding an input dict to each Mapper
instance to generate a Dublin Core ElementTree containing the required
elements with the appropriate values.

TODO: need to get stakeholder approval of this mapping before moving
forward
TODO: fold existing SAF creation code base into this one
  • Loading branch information
Tyler Danstrom committed Jul 14, 2017
1 parent 109321a commit b1b1283
Show file tree
Hide file tree
Showing 2 changed files with 140 additions and 43 deletions.
156 changes: 123 additions & 33 deletions bin/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,10 @@
import csv
import json
from json.decoder import JSONDecodeError
from os.path import basename
from os.path import basename, join
from os import _exit, scandir
from xml.etree.ElementTree import tostring
from xml.dom import minidom

from mamlukimport.parser import Parser
from mamlukimport.mapper import Mapper
Expand Down Expand Up @@ -35,49 +37,137 @@ def main():
for n in a_generator:
try:
data = json.load(open(n, encoding='utf-8'))[0]
output = namedtuple("data", "creator title rights keywords subject createdate filename")
creator = data["Creator"] if not isinstance(data["Creator"], list) else ', '.join(data["Creator"])
title = data["Title"]
rights = data["Rights"]
print(rights.split(' ')[0])
keywords = data["Keywords"] if not isinstance(data["Keywords"], list) else ', '.join([re.sub(';', '', x) for x in data["Keywords"]])
subject = data["Subject"] if not isinstance(data["Subject"], list) else ', '.join([re.sub(';', '', x) for x in data["Subject"]])
createdate = data["CreateDate"]
filename = data["FileName"]
volume = filename.split('_')[2]
publisher = {1: "University of Chicago"}
if not isinstance(data["Creator"], list):
creator = {1: data["Creator"]}
else:
crtr_count = 1
creator_dict = {}
for n_creator in data["Creator"]:
creator_dict[crtr_count] = n_creator
crtr_count += 1
creator = creator_dict
title = {1: data["Title"]}
rights = {1: data["Rights"]}

if not isinstance(data["Keywords"], list):
kw_count = 1
kw_dict = {}
for n_keyw in data["Keywords"].split(';'):
n_keyw = n_keyw.lstrip().strip()
if n_keyw != "":
kw_dict[kw_count] = n_keyw
kw_count += 1
keywords = kw_dict
else:
kw_count = 1
kw_dict = {}
for n_keyw in data["Keywords"][0].split(';'):
n_keyw = n_keyw.lstrip().strip()
if n_keyw != "":
kw_dict[kw_count] = n_keyw
kw_count += 1
keywords = kw_dict

if not isinstance(data["Subject"], list):
subj_count = 1
subj_dict = {}
for n_subj in data["Subject"].split(';'):
n_subj = n_keyw.lstrip().strip()
if n_subj != "":
subj_dict[subj_count] = n_subj
subj_count += 1
subject = subj_dict

else:
subj_count = 1
subj_dict = {}
for n_subj in data["Subject"][0].split(';'):
n_subj = n_keyw.lstrip().strip()
if n_subj != "":
subj_dict[subj_count] = n_subj
subj_count += 1
subject = subj_dict

createdate = {1: data["CreateDate"]}
filename = {1: data["FileName"]}
volume = filename[1].split('_')[2]
temp = volume.split('-')

if len(temp) == 2:
if '.pdf' in temp[1]:
volume = temp[0]
else:
volume = volume
publisher = "University of Chicago"
if len(temp) >= 2:
head = [temp[0]]
tail = temp[1:]
tail = [x for x in tail if re.compile('\d{1,}$').match(x)]
copyrightdate = head + tail
else:
copyrightdate = [re.sub(r'[a-z]', '', re.sub(r'\.', '', x))
for x in temp]
copyrightdate = {1: '-'.join(copyrightdate)}

msr_pattern = re.compile('MSR').search(title[1])
vol_pattern = re.compile('Vol.').search(title[1])
volume_option = re.compile('(\(MSR .*\))').search(title[1])
volume_option2 = re.compile('(Vol. .*)').search(title[1])
if msr_pattern:
volume = title[1][title[1].index('MSR')+3:].lstrip().strip()
title = {1: title[1][0:title[1].index('MSR')]}
print(title)
elif vol_pattern:
volume = title[1][title[1].index('Vol.')+4:].lstrip().strip()
title = {1: title[1][0:title[1].index('Vol.')]}
else:
volume = "none"
title = {1: title[1].lstrip().strip()}

first_check = title[1][-1]
if first_check == '(':
title[1] = title[1][0:-1].strip().lstrip()
second_check = title[1][-1]
if second_check == ":":
title[1] = title[1][0:-1].strip().lstrip()
try:
webstatement = data["WebStatement"]
except KeyError:
webstatement = ""
output.creator = creator
output.title = title
output.rights = rights
output.keywords = keywords
output.subject = subject
output.createdate = createdate
output.filename = filename
webstatement = {1: webstatement}
str_filen = filename[1]
output = {'creator': creator,
'title': title,
'rights': rights,
'keyword': keywords,
'subject': subject,
'createdate': copyrightdate,
'filename': filename,
'webstatement': webstatement,
'publisher': publisher,
}
if volume:
volume = re.sub(r'\)', '', re.sub(r'\(', '', volume))
volume = {1: volume}
if 'MamlukStudiesReview' in filename[1]:
output["formatof"] = volume
else:
output["part"] = volume
outputs.append(output)
row = [filename, creator, title, re.sub(r'\n', ' ', rights), webstatement, subject, keywords, createdate, publisher]
rows.append(row)
except JSONDecodeError:
pass
with open(args.output_file, "w", encoding="utf-8") as csv_file:
csvfieldnames = ["filename", "creator", "title", "rights", "webstatement", "subject", "keywords", "publisher", "createdate"]
writer = csv.writer(csv_file, quoting=csv.QUOTE_ALL, quotechar="\"")
writer.writerow(csvfieldnames)
for n_row in rows:
total_files += 1
writer.writerow(n_row)
# with open(args.output_file, "w", encoding="utf-8") as csv_file:
# csvfieldnames = ["filename", "creator", "title", "rights", "webstatement", "subject", "keywords", "publisher", "createdate"]
# writer = csv.writer(csv_file, quoting=csv.QUOTE_ALL, quotechar="\"")
# writer.writerow(csvfieldnames)
# for n_row in rows:
# total_files += 1
# writer.writerow(n_row)
for n_record in outputs:
filename = n_record["filename"]
del n_record["filename"]
new_mapper = Mapper(n_record)

new_filename = re.sub(r'.pdf', '.xml', filename[1])
xml_string = tostring(new_mapper.out)
xml_string = minidom.parseString(xml_string).toprettyxml()
with open(join('./out', new_filename), "w", encoding="utf-8") as write_file:
write_file.write(xml_string)
return 0
except KeyboardInterrupt:
return 131
Expand Down
27 changes: 17 additions & 10 deletions mamlukimport/mapper.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,21 @@

from xml.etree.ElementTree import Element, SubElement
from xml.etree.ElementTree import Element, ElementTree, SubElement
from sys import stderr

class Mapper(object):
def __init__(self, input):
self._in = input
self._lookup = {'title': {'element':'title', 'qualifier':'none'},
'createdate': {'element': 'date', 'qualifier':'copyright'},
'creator': {'element':'contributor', 'qualifiier':'author'},
'creator': {'element':'contributor', 'qualifier':'author'},
'rights': {'element': 'rights', 'qualifier': 'statement'},
'webstatement': {'element': 'rights', 'qualifier': 'url'},
'subject': {'element': 'subject', 'qualifier': 'none'},
'keyword': {'element': 'subject', 'qualifier': 'keyword'},
'source': {'element': 'source', 'qualifier': 'none'},
'isPartOf': {'element': 'relation', 'qualifier' :'isPartOf'},
'isFormatOf': {'element': 'relation', 'qualifier': 'isPartOf'},
'part': {'element': 'relation', 'qualifier' :'isPartOf'},
'formatof': {'element': 'relation', 'qualifier' :'isFormatOf'},
'publisher': {'element': 'publisher', 'qualifier': 'none'},
}
self.out = self._transform()

Expand All @@ -23,13 +24,19 @@ def _transform(self):
for n_key in self._in:
try:
instructions = self._lookup.get(n_key)
new_element = SubElement(root, "dc_value")
new_element.set("element", instructions["element"])
new_element.set("qualifier", instructions["qualifier"])
new_element.text = self._in[n_key]
except KeyError:
stderr.write("{} is an invalid field for this mapping.".format(n_key))
self.out = root
instructions = None
stderr.write("{} is an invalid field for this mapping.\n".format(n_key))
if instructions:
for n_value in self._in[n_key]:
new_element = SubElement(root, "dc_value")
new_element.set("element", instructions["element"])
new_element.set("qualifier", instructions["qualifier"])
if isinstance(self._in[n_key], str):
new_element.text = self._in[n_key]
else:
new_element.text = str(self._in[n_key][n_value])
return root

def get_output(self):
    """Return the object stored in ``self.out``."""
    return self.out

0 comments on commit b1b1283

Please sign in to comment.