Issue #2: first draft of Dublin Core record creation

I've created a Mapper class and am feeding an input dict to each mapper to generate a Dublin Core ElementTree containing the required elements with the appropriate values. TODO: get stakeholder approval of this mapping before moving forward. TODO: fold the existing SAF creation code base into this one.
uchicago-library · Jul 14, 2017 · b1b1283 · b1b1283
1 parent 109321a
commit b1b1283
Show file tree

Hide file tree

Showing 2 changed files with 140 additions and 43 deletions.
diff --git a/bin/extractor.py b/bin/extractor.py
@@ -6,8 +6,10 @@
 import csv
 import json
 from json.decoder import JSONDecodeError
-from os.path import basename
+from os.path import basename, join
 from os import _exit, scandir
+from xml.etree.ElementTree import tostring
+from xml.dom import minidom
 
 from mamlukimport.parser import Parser
 from mamlukimport.mapper import Mapper
@@ -35,49 +37,137 @@ def main():
         for n in a_generator:
             try:
                 data = json.load(open(n, encoding='utf-8'))[0]
-                output = namedtuple("data", "creator title rights keywords subject createdate filename")
-                creator = data["Creator"] if not isinstance(data["Creator"], list) else ', '.join(data["Creator"])
-                title = data["Title"]
-                rights = data["Rights"]
-                print(rights.split(' ')[0])
-                keywords = data["Keywords"] if not isinstance(data["Keywords"], list) else ', '.join([re.sub(';', '', x) for x in data["Keywords"]])
-                subject = data["Subject"] if not isinstance(data["Subject"], list) else ', '.join([re.sub(';', '', x) for x in data["Subject"]])
-                createdate = data["CreateDate"]
-                filename = data["FileName"]
-                volume = filename.split('_')[2]
+                publisher = {1: "University of Chicago"}
+                if not isinstance(data["Creator"], list):
+                    creator = {1: data["Creator"]}
+                else:
+                    crtr_count = 1
+                    creator_dict = {}
+                    for n_creator in data["Creator"]:
+                        creator_dict[crtr_count] = n_creator
+                        crtr_count += 1
+                    creator = creator_dict
+                title = {1: data["Title"]}
+                rights = {1: data["Rights"]}
+
+                if not isinstance(data["Keywords"], list):
+                   kw_count = 1
+                   kw_dict = {}
+                   for n_keyw in data["Keywords"].split(';'):
+                        n_keyw = n_keyw.lstrip().strip()
+                        if n_keyw != "":
+                            kw_dict[kw_count] = n_keyw
+                            kw_count += 1
+                   keywords = kw_dict
+                else:
+                    kw_count = 1
+                    kw_dict = {}
+                    for n_keyw in data["Keywords"][0].split(';'):
+                        n_keyw = n_keyw.lstrip().strip()
+                        if n_keyw != "":
+                            kw_dict[kw_count] = n_keyw
+                            kw_count += 1
+                    keywords = kw_dict
+
+                if not isinstance(data["Subject"], list):
+                    subj_count = 1
+                    subj_dict = {}
+                    for n_subj in data["Subject"].split(';'):
+                        n_subj = n_keyw.lstrip().strip()
+                        if n_subj != "":
+                            subj_dict[subj_count] = n_subj
+                            subj_count += 1
+                    subject = subj_dict
+
+                else:
+                    subj_count = 1
+                    subj_dict = {}
+                    for n_subj in data["Subject"][0].split(';'):
+                        n_subj = n_keyw.lstrip().strip()
+                        if n_subj != "":
+                            subj_dict[subj_count] = n_subj
+                            subj_count += 1
+                    subject = subj_dict
+
+                createdate = {1: data["CreateDate"]}
+                filename = {1: data["FileName"]}
+                volume = filename[1].split('_')[2]
                 temp = volume.split('-')
 
-                if len(temp) == 2:
-                    if '.pdf' in temp[1]:
-                        volume = temp[0]
-                    else:
-                        volume = volume
-                publisher = "University of Chicago"
+                if len(temp) >= 2:
+                    head = [temp[0]]
+                    tail = temp[1:]
+                    tail = [x for x in tail if re.compile('\d{1,}$').match(x)]
+                    copyrightdate = head + tail
+                else:
+                    copyrightdate = [re.sub(r'[a-z]', '', re.sub(r'\.', '', x))
+                                     for x in temp]
+                copyrightdate = {1: '-'.join(copyrightdate)}
+
+                msr_pattern = re.compile('MSR').search(title[1])
+                vol_pattern = re.compile('Vol.').search(title[1])
+                volume_option = re.compile('(\(MSR .*\))').search(title[1])
+                volume_option2 = re.compile('(Vol. .*)').search(title[1])
+                if msr_pattern:
+                    volume = title[1][title[1].index('MSR')+3:].lstrip().strip()
+                    title = {1: title[1][0:title[1].index('MSR')]}
+                    print(title)
+                elif vol_pattern:
+                    volume = title[1][title[1].index('Vol.')+4:].lstrip().strip()
+                    title = {1: title[1][0:title[1].index('Vol.')]}
+                else:
+                    volume = "none"
+                title = {1: title[1].lstrip().strip()}
+
+                first_check = title[1][-1]
+                if first_check == '(':
+                    title[1] = title[1][0:-1].strip().lstrip()
+                second_check = title[1][-1]
+                if second_check == ":":
+                    title[1] = title[1][0:-1].strip().lstrip()
                 try:
                     webstatement = data["WebStatement"]
                 except KeyError:
                     webstatement = ""
-                output.creator = creator
-                output.title = title
-                output.rights = rights
-                output.keywords = keywords
-                output.subject = subject
-                output.createdate = createdate
-                output.filename = filename
+                webstatement = {1: webstatement}
+                str_filen = filename[1]
+                output = {'creator': creator,
+                          'title': title,
+                          'rights': rights,
+                          'keyword': keywords,
+                          'subject': subject,
+                          'createdate': copyrightdate,
+                          'filename': filename,
+                          'webstatement': webstatement,
+                          'publisher': publisher,
+                         }
+                if volume:
+                    volume = re.sub(r'\)', '', re.sub(r'\(', '', volume))
+                    volume = {1: volume}
+                    if 'MamlukStudiesReview' in filename[1]:
+                        output["formatof"] = volume
+                    else:
+                        output["part"] = volume
                 outputs.append(output)
-                row = [filename, creator, title, re.sub(r'\n', ' ', rights), webstatement, subject, keywords, createdate, publisher]
-                rows.append(row)
             except JSONDecodeError:
                 pass
-        with open(args.output_file, "w", encoding="utf-8") as csv_file:
-            csvfieldnames = ["filename", "creator", "title", "rights", "webstatement", "subject", "keywords", "publisher",  "createdate"]
-            writer = csv.writer(csv_file, quoting=csv.QUOTE_ALL, quotechar="\"")
-            writer.writerow(csvfieldnames)
-            for n_row in rows:
-                total_files += 1
-                writer.writerow(n_row)
+        # with open(args.output_file, "w", encoding="utf-8") as csv_file:
+        #     csvfieldnames = ["filename", "creator", "title", "rights", "webstatement", "subject", "keywords", "publisher",  "createdate"]
+        #     writer = csv.writer(csv_file, quoting=csv.QUOTE_ALL, quotechar="\"")
+        #     writer.writerow(csvfieldnames)
+        #     for n_row in rows:
+        #         total_files += 1
+        #         writer.writerow(n_row)
         for n_record in outputs:
+            filename = n_record["filename"]
+            del n_record["filename"]
             new_mapper = Mapper(n_record)
+
+            new_filename = re.sub(r'.pdf', '.xml', filename[1])
+            xml_string = tostring(new_mapper.out)
+            xml_string = minidom.parseString(xml_string).toprettyxml()
+            with open(join('./out', new_filename), "w", encoding="utf-8") as write_file:
+                write_file.write(xml_string)
         return 0
     except KeyboardInterrupt:
         return 131

diff --git a/mamlukimport/mapper.py b/mamlukimport/mapper.py
@@ -1,20 +1,21 @@
 
-from xml.etree.ElementTree import Element, SubElement
+from xml.etree.ElementTree import Element, ElementTree, SubElement
 from sys import stderr
 
 class Mapper(object):
     def __init__(self, input):
         self._in = input
         self._lookup = {'title': {'element':'title', 'qualifier':'none'},
                        'createdate': {'element': 'date', 'qualifier':'copyright'},
-                       'creator': {'element':'contributor', 'qualifiier':'author'},
+                       'creator': {'element':'contributor', 'qualifier':'author'},
                        'rights': {'element': 'rights', 'qualifier': 'statement'},
                        'webstatement': {'element': 'rights', 'qualifier': 'url'},
                        'subject': {'element': 'subject', 'qualifier': 'none'},
                        'keyword': {'element': 'subject', 'qualifier': 'keyword'},
                        'source': {'element': 'source', 'qualifier': 'none'},
-                       'isPartOf': {'element': 'relation', 'qualifier' :'isPartOf'},
-                       'isFormatOf': {'element': 'relation', 'qualifier': 'isPartOf'},
+                       'part': {'element': 'relation', 'qualifier' :'isPartOf'},
+                       'formatof': {'element': 'relation', 'qualifier' :'isFormatOf'},
+                       'publisher': {'element': 'publisher', 'qualifier': 'none'},
                       }
         self.out = self._transform()
 
@@ -23,13 +24,19 @@ def _transform(self):
         for n_key in self._in:
             try:
                 instructions = self._lookup.get(n_key)
-                new_element = SubElement(root, "dc_value")
-                new_element.set("element", instructions["element"])
-                new_element.set("qualifier", instructions["qualifier"])
-                new_element.text = self._in[n_key]
             except KeyError:
-                stderr.write("{} is an invalid field for this mapping.".format(n_key))
-        self.out = root
+                instructions = None
+                stderr.write("{} is an invalid field for this mapping.\n".format(n_key))
+            if instructions:
+                for n_value in self._in[n_key]:
+                    new_element = SubElement(root, "dc_value")
+                    new_element.set("element", instructions["element"])
+                    new_element.set("qualifier", instructions["qualifier"])
+                    if isinstance(self._in[n_key], str):
+                        new_element.text = self._in[n_key]
+                    else:
+                        new_element.text = str(self._in[n_key][n_value])
+        return root
 
     def get_output(self):
         return self.out