# scripts/create_ontology.py

import csv
from pathlib import Path
from typing import Tuple
from rdflib import Graph, URIRef, Literal, Namespace
from rdflib.namespace import RDF, OWL, RDFS, SKOS

'''
Creates the dfgfo.ttl ontology (and its RDF/XML counterpart dfgfo.owl) by parsing
the DFG classification system in csv/2024-2028/Fachsystematik_2024-2028.csv

Each Subject is an owl:Class with:
* the DFG Subject number as URI
* labels in EN and DE
* an rdfs:subClassOf relation according to the DFG classification hierarchy
'''
# Repository root: this script sits one directory below it.
_repo_root = Path(__file__).parent.parent

# Input and output files, all located relative to the repository root.
dfg_onto_metadata_fn = _repo_root / 'metadata.ttl'
dfg_onto_fn = _repo_root / 'dfgfo.ttl'
dfg_onto_fn_owl = _repo_root / 'dfgfo.owl'
# Previous classification period:
#dfg_csv_en = Path(__file__).parent.parent / 'csv' / '2020-2024' / 'Fachsystematik_2020-2024.csv'
dfg_csv_en = _repo_root.joinpath('csv', '2024-2028', 'Fachsystematik_2024-2028.csv')
print(dfg_csv_en)

# Graph holding whatever metadata.ttl declares.
g_metadata = Graph()
g_metadata.parse(str(dfg_onto_metadata_fn.absolute()))

# Base URI under which every generated class is minted.
ns_str = 'https://w3id.org/dfgfo/2024/'
namespace = Namespace(ns_str)

# Graph receiving the classes generated from the CSV.
g_classes = Graph()
for prefix, uri in (
    ('owl', 'http://www.w3.org/2002/07/owl#'),
    ('dfgfo', ns_str),
):
    g_classes.namespace_manager.bind(prefix, uri, override=False)


def split_id_label(id_n_label: str) -> Tuple[str, str]:
    """Split a two-line CSV cell into its ID and its label.

    The cell is expected to hold the ID on the first line and the label on
    the second line. Windows line endings and leading/trailing whitespace
    (including a trailing line break) are tolerated.

    Args:
        id_n_label: Cell content, ID line followed by label line.

    Returns:
        ``(id, label)`` where all whitespace has been removed from the ID
        and surrounding whitespace has been stripped from the label.

    Raises:
        ValueError: If the cell does not consist of exactly two lines.
    """
    # The CSV is read with newline='', so "\r\n" inside a quoted cell
    # reaches this function unchanged and must be normalised here.
    lines = id_n_label.replace('\r\n', '\n').strip().split('\n')
    if len(lines) != 2:
        raise ValueError(
            f'expected an ID line followed by a label line, got {id_n_label!r}'
        )
    raw_id, label = lines
    # IDs become part of a URI, so no whitespace may survive in them.
    return ''.join(raw_id.split()), label.strip()


def create_class(graph, ns, node_name, labels, parent):
    """Add one classification entry to ``graph`` as an ``owl:Class``.

    Args:
        graph: Graph the triples are added to.
        ns: Base namespace (string or rdflib ``Namespace``) that the URIs of
            the class and of its parent are built from.
        node_name: Identifier of the entry; appended to ``ns`` to form the
            class URI and to the GBV terminology base URI for the mapping.
        labels: Sequence whose first item is the English label and whose
            second item is the German label.
        parent: Identifier of the parent entry, or ``None`` for a top-level
            entry, which is then made a subclass of ``owl:Thing``.
    """
    # Build URIs from the namespace handed in by the caller rather than
    # from a module-level global.
    base = str(ns)
    uri_str = f'{base}{node_name}'
    node = URIRef(uri_str)
    print(f'Class: {uri_str} labels: {labels}')

    # type
    graph.add((node, RDF.type, OWL.Class))

    # position in the hierarchy
    parent_node = OWL.Thing if parent is None else URIRef(f'{base}{parent}')
    graph.add((node, RDFS.subClassOf, parent_node))

    # labels: English as rdfs:label, German as skos:altLabel
    graph.add((node, RDFS.label, Literal(f'{labels[0]}', lang='en')))
    graph.add((node, SKOS.altLabel, Literal(f'{labels[1]}', lang='de')))

    # mapping to the entry with the same identifier in the GBV terminology
    graph.add((node, SKOS.closeMatch, URIRef(f'http://uri.gbv.de/terminology/dfg2024/{node_name}')))


# English CSV column headers of the classification levels, ordered from the
# top of the hierarchy to the bottom.
tree_hierarchy = ['Scientific Discipline', 'Subject Area', 'Review Board', 'Subject']
# English column header -> German column header of the same level.
header_en_de_mapping = {
    'Scientific Discipline': 'Wissenschaftsbereich',
    'Subject Area': 'Fachgebiet',
    'Review Board': 'Fachkollegium',
    'Subject': 'Fach',
}

# DFG tree hierarchy (top to bottom):
# * Scientific Discipline
#   * Subject Area
#     * Review Board
#       * Subject (its ID comes from the separate 'Subject Number' column)

# Every CSV row describes one Subject together with all of its ancestors.
# Each row therefore yields one class per hierarchy level; ancestors shared
# by several rows are simply added again with the same triples.
with open(dfg_csv_en, newline='', encoding="utf-8") as csvfile:
    for row in csv.DictReader(csvfile, delimiter=','):
        # ID of the level directly above the current one; None at the top.
        parent_id = None
        for index, column in enumerate(tree_hierarchy):
            cell = row[column]
            cell_de = row[header_en_de_mapping[column]]
            print(f'\nSECTION: {index} {column}')
            print(f'INDEX: {index} COL:{column} CELL: {cell}')

            if column == 'Subject':
                # Subjects keep ID and label in separate columns. Whitespace
                # is removed from the ID, as for the other levels, because
                # the ID becomes part of the class URI.
                cell_id = ''.join(row['Subject Number'].split())
                cell_label = cell
                cell_label_de = cell_de
            else:
                # All other levels hold ID and label together in one cell.
                cell_id, cell_label = split_id_label(id_n_label=cell)
                _, cell_label_de = split_id_label(id_n_label=cell_de)

            current = f'{cell_id} - {cell_label}'
            print(f'CELL ID: <<<<{cell_id}>>>')
            print(f'CURRENT: {current}')
            print(f'PARENT: <<<{parent_id}>>>')

            create_class(graph=g_classes,
                         ns=namespace,
                         node_name=cell_id,
                         labels=[cell_label, cell_label_de],
                         parent=parent_id)

            # The level just processed is the parent of the next one.
            parent_id = cell_id


# Join the metadata graph and the generated classes into one graph; "+"
# returns a new graph with the triples of both operands.
g_joint = g_metadata + g_classes

print('\n\nSERIALIZE\n\n')
# The format is passed explicitly: rdflib's default serialization format
# differs between major versions, and dfgfo.ttl must be Turtle.
print(g_joint.serialize(format="turtle"))
g_joint.serialize(destination=dfg_onto_fn, encoding="utf-8", format="turtle")
g_joint.serialize(destination=dfg_onto_fn_owl, encoding="utf-8", format="xml")