Add all additional information for wikidata title and label

5ebf4eb2 · Marc Feger · 5962eeef · 5ebf4eb2 · 5ebf4eb2 · 5ebf4eb2
Commit 5ebf4eb2 authored 6 years ago by Marc Feger
--- a/app.py
+++ b/app.py
-from flask import Flask, Response
+from flask import Flask
 from flask.json import jsonify

-from src.main import assemble_wikidata_groundtruth, assemble_dbpedia_groundtruth, assemble_wikidata_triples, \
-    assemble_dbpedia_triples
+from src.main import assemble_wikidata_groundtruth_english, assemble_dbpedia_groundtruth_english, \
+    assemble_wikidata_triples, \
+    assemble_dbpedia_triples, collect_wikidata_results_with_title

 app = Flask(__name__)


+@app.route('/wikidata/collect/title')
+def wikidata_collect_title():
+    return jsonify(collect_wikidata_results_with_title())
+
+
 @app.route('/dbpedia/n3')
 def dbpedia_n3():
    return jsonify(assemble_dbpedia_triples())
@@ -17,14 +23,14 @@ def wikidata_n3():
    return jsonify(assemble_wikidata_triples())


-@app.route('/wikidata/groundtruth')
+@app.route('/wikidata/groundtruth/english')
 def wikidata_groundtruth():
-    return jsonify(assemble_wikidata_groundtruth())
+    return jsonify(assemble_wikidata_groundtruth_english())


-@app.route('/dbpedia/groundtruth')
+@app.route('/dbpedia/groundtruth/english')
 def dbpedia_groundtruth():
-    return jsonify(assemble_dbpedia_groundtruth())
+    return jsonify(assemble_dbpedia_groundtruth_english())


 @app.route('/')

--- a/src/lib/engine.py
+++ b/src/lib/engine.py
@@ -33,7 +33,7 @@ class SPARQLEngine(object):
        self.results = self.engine.query().convert()
        return self.results

-    def get_json_test(self) -> Dict:
+    def get_json_with_query(self) -> Dict:
        """
        This method returns the results as JSON.


--- a/src/lib/writer.py
+++ b/src/lib/writer.py
@@ -23,6 +23,15 @@ class FileWriter(object):
        with open(self.destination, 'w+') as outfile:
            json.dump(self.data["results"]["bindings"], outfile)

+    def as_filtered_json(self) -> None:
+        """
+        This method writes the complete data as JSON to the destination, without extracting ["results"]["bindings"] first.
+
+        :return: None
+        """
+        with open(self.destination, 'w+') as outfile:
+            json.dump(self.data, outfile)
+
    def as_string(self) -> None:
        """
        This method writes the data as a string linewise to the destination.

--- a/src/main.py
+++ b/src/main.py
+import json
 from tokenize import String
 from typing import Dict, List

@@ -11,7 +12,7 @@ from src.wikidata.keys import ResultKeys
 from src.dbpedia.keys import ResultKeys


-def assemble_wikidata_groundtruth() -> Dict:
+def assemble_wikidata_groundtruth_english() -> Dict:
    """
    This method assembles the groundtruth for the data in wikidata.
    It also writes them to /static/data/wikidata_groundtruth.txt
@@ -25,7 +26,7 @@ def assemble_wikidata_groundtruth() -> Dict:
    return data


-def assemble_dbpedia_groundtruth() -> Dict:
+def assemble_dbpedia_groundtruth_english() -> Dict:
    """
    This method assembles the groundtruth for the data in dbpedia.
    It also writes them to /static/data/dbpedia_groundtruth.txt
@@ -74,6 +75,7 @@ def assemble_wikidata_triples() -> List:
                    triples += [NTriple(subject=movie, predicate=wdt + 'P161', value=cast).as_string()]

        if ResultKeys.published.value in result.keys():
+            # TODO: Handle complete dates. A film may have several release dates; one published value per date seems okay.
            for published in Decapper(result[ResultKeys.published.value]).unpack().split(
                    ResultKeys.line_separator.value):
                if published:
@@ -141,6 +143,7 @@ def assemble_dbpedia_triples() -> List:
                    triples += [NTriple(subject=movie, predicate=dbo + 'starring', value=cast).as_string()]

        if ResultKeys.published.value in result.keys():
+            # TODO: Handle complete dates. A film may have several release dates; one published value per date seems okay.
            for published in Decapper(result[ResultKeys.published.value]).unpack().split(
                    ResultKeys.line_separator.value):
                if published:
@@ -181,3 +184,55 @@ def assemble_dbpedia_triples() -> List:
                    triples += [NTriple(subject=movie, predicate=dbp + 'productionCompanies',
                                        value=production_companies).as_string()]
    return triples
+
+
+def collect_wikidata_results_with_title() -> Dict:
+    """
+    This method collects the labels and titles, in different languages, of every film in the wikidata groundtruth.
+    For this purpose rdfs:label and wdt:P1476 (title) are queried for each film.
+
+    :warning: Takes a long time.
+    :return: A dict that maps each film id to its 'Labels' and 'Titles' result bindings.
+    """
+
+    data = FileReader(source='static/wikidata_groundtruth.txt').as_json()
+    titles = {}
+    progress = 0
+
+    print('[GET] Collect Wikidata film labels and titles in different languages: ')
+    print(str('\t') + str(progress) + ' of ' + str(len(data)))
+
+    for result in data:
+        movie_id = Decapper(result[ResultKeys.movie.value]).unpack()
+
+        query_label = FileReader(source='static/wikidata_title.sparql') \
+            .as_string() \
+            .replace('subject', '<' + movie_id + '>') \
+            .replace('predicate', 'rdfs:label') \
+            .replace('value', 'Label')
+        engine_label = SPARQLEngine(entrypoint=WIKIDATA_ENTRYPOINT, query=query_label)
+
+        query_title = FileReader(source='static/wikidata_title.sparql') \
+            .as_string() \
+            .replace('subject', '<' + movie_id + '>') \
+            .replace('predicate', 'wdt:P1476') \
+            .replace('value', 'Title')
+        engine_title = SPARQLEngine(entrypoint=WIKIDATA_ENTRYPOINT, query=query_title)
+
+        titles[movie_id] = {
+            'Labels': engine_label.get_json_with_query()["results"]["bindings"],
+            'Titles': engine_title.get_json_with_query()["results"]["bindings"]
+        }
+
+        progress += 1
+        if progress % 100 == 0:
+            print(str('\t') + str(progress) + ' of ' + str(len(data)))
+
+    print('Done')
+
+    print('[WRITE] Data to static/wikidata_titles_and_labels.txt')
+    writer = FileWriter(destination='static/wikidata_titles_and_labels.txt', data=titles)
+    writer.as_filtered_json()
+    print('Done')
+
+    return titles
--- a/src/wikidata/keys.py
+++ b/src/wikidata/keys.py
@@ -13,3 +13,8 @@ class ResultKeys(Enum):
    description = 'Description'
    production_company = 'ProductionCompany'
    line_separator = '|'
+
+
+class TitleKey(Enum):
+    title = 'Title'
+    label = 'Label'
--- a/static/wikidata_groundtruth.txt
+++ b/static/wikidata_groundtruth.txt
--- a/static/wikidata_title.sparql
+++ b/static/wikidata_title.sparql
+PREFIX wd: <http://www.wikidata.org/entity/>
+PREFIX wdt: <http://www.wikidata.org/prop/direct/>
+PREFIX schema: <http://schema.org/>
+
+SELECT DISTINCT ?value
+WHERE{
+	  OPTIONAL {subject predicate ?value.}
+}
\ No newline at end of file
--- a/static/wikidata_titles_and_labels.txt
+++ b/static/wikidata_titles_and_labels.txt