From cda83a06d25655214120df57fe3aba363b8c8631 Mon Sep 17 00:00:00 2001 From: Ivica Kostric Date: Fri, 2 Feb 2024 07:59:39 +0100 Subject: [PATCH] Feature/82-NLtoAPI-Implement-basic-entity-linker * Simplify Triple (Only use Concept instead of Union[str, Concept]) Fixes #87 * [NLtoAPI] Implement basic entity linker Fixes #82 * Update tests * Update tests to use Concept * simplify checks in get_preference * simplify check for existence of object in triple * Merge branch 'feature/87-Simplify-Triple-Only-use-Concept-instead-of-Unionstr-Concept' of https://github.com/iai-group/pkg-api into feature/82-NLtoAPI-Implement-basic-entity-linker * Add tests * Add TripleElement class for SPO * Merge branch 'feature/87-Simplify-Triple-Only-use-Concept-instead-of-Unionstr-Concept' of https://github.com/iai-group/pkg-api into feature/82-NLtoAPI-Implement-basic-entity-linker * Migrate to new dataclass structure * Add linking for predicate and object * Address review comments * Merge branch 'main' of https://github.com/iai-group/pkg-api into feature/87-Simplify-Triple-Only-use-Concept-instead-of-Unionstr-Concept * fix imports * Fix pkg inserts with new dataclass structure * [NLtoAPI] Implement basic entity linker Fixes #82 * Merge branch 'main' of https://github.com/iai-group/pkg-api into feature/82-NLtoAPI-Implement-basic-entity-linker * Swap value and reference * Merge branch 'main' of https://github.com/iai-group/pkg-api into feature/82-NLtoAPI-Implement-basic-entity-linker * Address review comments * Address review comments * format docstrings * Add file docstrings * Address review comments --- config/entity_linking/dbpedia_spotlight.yaml | 7 ++ .../nl_to_pkg/entity_linking/entity_linker.py | 2 +- .../entity_linking/spotlight_entity_linker.py | 93 ++++++++++++++++ pkg_api/nl_to_pkg/nl_to_pkg.py | 2 +- pkg_api/util/load_config.py | 31 ++++++ requirements.txt | 5 +- tests/nl_to_pkg/test_nl_to_pkg.py | 2 +- .../nl_to_pkg/test_spotlight_entity_linker.py | 104 ++++++++++++++++++ 8 files 
changed, 242 insertions(+), 4 deletions(-) create mode 100644 config/entity_linking/dbpedia_spotlight.yaml create mode 100644 pkg_api/nl_to_pkg/entity_linking/spotlight_entity_linker.py create mode 100644 pkg_api/util/load_config.py create mode 100644 tests/nl_to_pkg/test_spotlight_entity_linker.py diff --git a/config/entity_linking/dbpedia_spotlight.yaml b/config/entity_linking/dbpedia_spotlight.yaml new file mode 100644 index 0000000..0193818 --- /dev/null +++ b/config/entity_linking/dbpedia_spotlight.yaml @@ -0,0 +1,7 @@ + url: "https://api.dbpedia-spotlight.org/en/annotate" + headers: + accept: "application/json" + params: + confidence: 0.5 + support: 50 + types: null diff --git a/pkg_api/nl_to_pkg/entity_linking/entity_linker.py b/pkg_api/nl_to_pkg/entity_linking/entity_linker.py index cd2b8ed..f3fb9aa 100644 --- a/pkg_api/nl_to_pkg/entity_linking/entity_linker.py +++ b/pkg_api/nl_to_pkg/entity_linking/entity_linker.py @@ -10,7 +10,7 @@ class EntityLinker(ABC): """Entity linker for linking entities to the PKG or available KGs.""" @abstractmethod - def link_annotation_entities(self, pkg_data: PKGData) -> PKGData: + def link_entities(self, pkg_data: PKGData) -> PKGData: """Resolves the pkg data annotations if possible. 
Args: diff --git a/pkg_api/nl_to_pkg/entity_linking/spotlight_entity_linker.py b/pkg_api/nl_to_pkg/entity_linking/spotlight_entity_linker.py new file mode 100644 index 0000000..0549f92 --- /dev/null +++ b/pkg_api/nl_to_pkg/entity_linking/spotlight_entity_linker.py @@ -0,0 +1,93 @@ +"""Contains the DBpedia Spotlight entity linker.""" + +from typing import Any, Dict, Union + +import requests + +from pkg_api.core.annotation import Concept, PKGData, TripleElement +from pkg_api.core.pkg_types import URI +from pkg_api.nl_to_pkg.entity_linking.entity_linker import EntityLinker +from pkg_api.util.load_config import load_yaml_config + +_DEFAULT_CONFIG_PATH = "config/entity_linking/dbpedia_spotlight.yaml" + + +class SpotlightEntityLinker(EntityLinker): + def __init__(self, path: str = _DEFAULT_CONFIG_PATH) -> None: + """Initializes the DBpedia Spotlight entity linker. + + Args: + path: The path to the config file. Defaults to _DEFAULT_CONFIG_PATH. + """ + self._config = load_yaml_config(path) + + def link_entities(self, pkg_data: PKGData) -> PKGData: + """Returns the PKG data with linked entities. + + Only the predicate and object of the triple are linked to a public KG, + as the subject should be retrieved from the PKG. + + Args: + pkg_data: The PKG data to be linked. + + Returns: + The PKG data with linked entities. + """ + if pkg_data.triple is None: + return pkg_data + + for attr in ["predicate", "object"]: + triple_element: TripleElement = getattr(pkg_data.triple, attr) + if triple_element is not None: + triple_element.value = self._get_linked_text( + triple_element.reference + ) + + return pkg_data + + def _get_linked_text(self, reference: str) -> Union[URI, Concept, str]: + """Returns the linked object as URI, Concept or literal. + + Args: + reference: The reference text to be linked. + + Returns: + The linked object. + """ + # Return Concept as default as we cannot distinguish between Concept + # and literal. 
+ linked_entities = self._get_linker_response(reference) + if linked_entities is None or "Resources" not in linked_entities: + return Concept(reference) + + # If the entire value is a single entity, return the URI. + if ( + len(linked_entities["Resources"]) == 1 + and linked_entities["Resources"][0]["@surfaceForm"] == reference + ): + return URI(linked_entities["Resources"][0]["@URI"]) + + # Otherwise, return a concept with the linked entities. + value = Concept(reference) + for entity in linked_entities["Resources"]: + value.related_entities.append(entity["@URI"]) + + return value + + def _get_linker_response(self, text: str) -> Dict[str, Any]: + """Returns the response from the DBpedia Spotlight API. + + Args: + text: The text to be annotated. + + Returns: + The response from the DBpedia Spotlight API. + """ + params = {**self._config["params"], "text": text} + response = requests.get( + self._config["url"], headers=self._config["headers"], params=params + ) + if response.status_code == 200: + return response.json() + else: + return {"error": response.text} diff --git a/pkg_api/nl_to_pkg/nl_to_pkg.py b/pkg_api/nl_to_pkg/nl_to_pkg.py index 9b8aab1..c01005a 100644 --- a/pkg_api/nl_to_pkg/nl_to_pkg.py +++ b/pkg_api/nl_to_pkg/nl_to_pkg.py @@ -30,6 +30,6 @@ def annotate(self, statement: str) -> Tuple[Intent, PKGData]: A tuple of the intent and the annotated and linked statement. 
""" intent, pkg_data = self._annotator.get_annotations(statement) - linked_pkg_data = self._entity_linker.link_annotation_entities(pkg_data) + linked_pkg_data = self._entity_linker.link_entities(pkg_data) return intent, linked_pkg_data diff --git a/pkg_api/util/load_config.py b/pkg_api/util/load_config.py new file mode 100644 index 0000000..1672520 --- /dev/null +++ b/pkg_api/util/load_config.py @@ -0,0 +1,31 @@ +"""Utility function for loading configuration data from YAML files.""" + +import os +from typing import Any, Dict + +import yaml + + +def load_yaml_config(path: str) -> Dict[str, Any]: + """Loads configuration from a YAML file at the given path. + + Args: + path: The file path to the YAML configuration file. + + Raises: + FileNotFoundError: If the specified file does not exist. + ValueError: If the path is not a file. + + Returns: + A dictionary containing the configuration data. + """ + if not os.path.exists(path): + raise FileNotFoundError(f"The file at {path} does not exist.") + + if not os.path.isfile(path): + raise ValueError(f"The path {path} is not a file.") + + with open(path, "r") as file: + config = yaml.safe_load(file) + + return config diff --git a/requirements.txt b/requirements.txt index 5188482..6abe224 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,6 +13,9 @@ pytest-cov Flask>=2.3.3 Flask-RESTful>=0.3.10 Flask-SQLAlchemy>=3.1.1 +requests +types-requests +pyyaml +types-pyyaml ollama -types-PyYAML rfc3987 diff --git a/tests/nl_to_pkg/test_nl_to_pkg.py b/tests/nl_to_pkg/test_nl_to_pkg.py index 07b14a3..370d8f1 100644 --- a/tests/nl_to_pkg/test_nl_to_pkg.py +++ b/tests/nl_to_pkg/test_nl_to_pkg.py @@ -52,7 +52,7 @@ def link_annotation_side_effect(*args, **kwargs): pkg_data.triple.object.value = "Linked Object" return pkg_data - mock.link_annotation_entities.side_effect = link_annotation_side_effect + mock.link_entities.side_effect = link_annotation_side_effect return mock diff --git a/tests/nl_to_pkg/test_spotlight_entity_linker.py 
b/tests/nl_to_pkg/test_spotlight_entity_linker.py new file mode 100644 index 0000000..b41f9a6 --- /dev/null +++ b/tests/nl_to_pkg/test_spotlight_entity_linker.py @@ -0,0 +1,104 @@ +"""Tests the Spotlight entity linker class.""" +from unittest.mock import Mock, patch + +import pytest + +from pkg_api.core.annotation import Concept, PKGData, Triple, TripleElement +from pkg_api.core.pkg_types import URI +from pkg_api.nl_to_pkg.entity_linking.spotlight_entity_linker import ( + SpotlightEntityLinker, +) + + +@pytest.fixture +def sample_pkg_data() -> PKGData: + """Returns a test PKG data.""" + return PKGData( + "Test statement", + Triple( + TripleElement("Test Subject"), + TripleElement("Test Predicate"), + TripleElement("Test Object"), + ), + ) + + +@pytest.fixture +def linker() -> SpotlightEntityLinker: + """Returns a SpotlightEntityLinker instance.""" + return SpotlightEntityLinker() + + +def test_spotlight_entity_linker_initialization( + linker: SpotlightEntityLinker, +) -> None: + """Test the initialization of the SpotlightEntityLinker.""" + assert "url" in linker._config + assert "params" in linker._config + assert "headers" in linker._config + + +@patch("pkg_api.nl_to_pkg.entity_linking.spotlight_entity_linker.requests.get") +def test_link_annotation_uri( + mock_get: Mock, sample_pkg_data: PKGData, linker: SpotlightEntityLinker +) -> None: + """Test the link_entities method.""" + mock_response = Mock() + mock_response.status_code = 200 + mock_response.json.return_value = { + "Resources": [ + { + "@surfaceForm": "Test Object", + "@URI": "http://dbpedia.org/resource/Test_Object", + } + ] + } + mock_get.return_value = mock_response + annotated_pkg_data = linker.link_entities(sample_pkg_data) + + assert annotated_pkg_data == sample_pkg_data + assert isinstance(annotated_pkg_data.triple, Triple) + assert isinstance(annotated_pkg_data.triple.object, TripleElement) + assert isinstance(annotated_pkg_data.triple.object.value, URI) + assert ( + 
annotated_pkg_data.triple.object.value + == "http://dbpedia.org/resource/Test_Object" + ) + + +@patch("pkg_api.nl_to_pkg.entity_linking.spotlight_entity_linker.requests.get") +def test_link_annotation_concept( + mock_get: Mock, sample_pkg_data: PKGData, linker: SpotlightEntityLinker +) -> None: + """Test the link_entities method.""" + mock_response = Mock() + mock_response.status_code = 200 + mock_response.json.return_value = { + "Resources": [ + { + "@surfaceForm": "Object", + "@URI": "http://dbpedia.org/resource/Object", + } + ] + } + mock_get.return_value = mock_response + annotated_pkg_data = linker.link_entities(sample_pkg_data) + + assert isinstance(annotated_pkg_data.triple, Triple) + assert isinstance(annotated_pkg_data.triple.object, TripleElement) + assert isinstance(annotated_pkg_data.triple.object.value, Concept) + assert len(annotated_pkg_data.triple.object.value.related_entities) == 1 + assert ( + annotated_pkg_data.triple.object.value.related_entities[0] + == "http://dbpedia.org/resource/Object" + ) + + +def test_link_entities_no_change( + sample_pkg_data: PKGData, linker: SpotlightEntityLinker +) -> None: + """Test the link_entities method when no entities are linked.""" + original_pkg_data = sample_pkg_data + annotated_pkg_data = linker.link_entities(original_pkg_data) + + assert annotated_pkg_data == original_pkg_data