From 484acdc56de108dd81558942cc21b49af8d6ed6d Mon Sep 17 00:00:00 2001
From: feger <marc.feger@hhu.de>
Date: Mon, 13 May 2019 21:28:10 +0200
Subject: [PATCH] Add statistics for DBPedia; Add Documentation for it

---
 app.py                                     |   7 +-
 dbpedia_analyser.py                        | 267 +++++++++++++++++++++
 doc/swagger-ui/index.html                  |   2 +-
 src/lib/keys.py                            |   5 -
 static/swagger/{api_doc.yaml => docs.yaml} |  16 ++
 5 files changed, 290 insertions(+), 7 deletions(-)
 create mode 100644 dbpedia_analyser.py
 rename static/swagger/{api_doc.yaml => docs.yaml} (88%)

diff --git a/app.py b/app.py
index b8b4349..622e9bf 100644
--- a/app.py
+++ b/app.py
@@ -1,14 +1,19 @@
 from flask import Flask, render_template
 from flask.json import jsonify
 
+from dbpedia_analyser import get_all_statistics
 from src.main import assemble_wikidata_groundtruth_english, assemble_dbpedia_groundtruth_english, \
     assemble_wikidata_triples, \
     assemble_dbpedia_triples, test_if_data_is_enlargeable_for_dbpedia, assemble_similarity_triples
 
-# app = Flask(__name__)
 app = Flask(__name__, template_folder='.')
 
 
+@app.route('/dbpedia/statistics')
+def statistics():
+    return jsonify(get_all_statistics())  # JSON-encode the dict built by get_all_statistics
+
+
 @app.route('/coreferences')
 def coreferences():
     return jsonify(assemble_similarity_triples())
diff --git a/dbpedia_analyser.py b/dbpedia_analyser.py
new file mode 100644
index 0000000..a30c1e5
--- /dev/null
+++ b/dbpedia_analyser.py
@@ -0,0 +1,267 @@
+from tokenize import String
+from typing import Callable, Dict
+
+from src import DBPEDIA_ENTRYPOINT
+from src.lib.decapper import Decapper
+from src.lib.engine import SPARQLEngine
+from src.lib.keys import ResultKeys
+
+
+def __get_all_films_with_offset(offset: int = 0) -> str:
+    """
+    Build the SPARQL query that lists every resource typed as dbo:Film.
+    :param offset: Number of result rows to skip; it is appended after OFFSET.
+    :return: The query text
+    """
+    return """
+            PREFIX dbo: <http://dbpedia.org/ontology/>
+            PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+            PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+
+            SELECT DISTINCT (?movie AS ?Movie)
+            WHERE {
+                ?movie rdf:type dbo:Film
+            } 
+            ORDER BY DESC(?movie)
+            OFFSET 
+        """ + str(offset)
+
+
+def __get_all_comedy_films_with_offset(offset: int = 0) -> str:
+    """
+    Build the SPARQL query that lists films whose dct:subject matches "comedy" (case-insensitive).
+    :param offset: Number of result rows to skip; it is appended after OFFSET.
+    :return: The query text
+    """
+    return """
+            PREFIX dbo: <http://dbpedia.org/ontology/>
+            PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+            PREFIX dct: <http://purl.org/dc/terms/>
+
+            SELECT DISTINCT (?movie AS ?Movie)
+            WHERE {
+                ?movie a <http://dbpedia.org/ontology/Film> .
+                ?movie dct:subject ?subject.
+                FILTER( regex(str(?subject), "comedy","i" ))
+            } 
+            ORDER BY DESC(?movie)
+            OFFSET 
+        """ + str(offset)
+
+
+def __get_all_comedy_films_with_directors_birthday_offset(offset: int = 0) -> str:
+    """
+    Build the SPARQL query that lists comedy films having a director with a dbo:birthDate.
+    :param offset: Number of result rows to skip; it is appended after OFFSET.
+    :return: The query text
+    """
+    return """
+            PREFIX dbo: <http://dbpedia.org/ontology/>
+            PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+            PREFIX dct: <http://purl.org/dc/terms/>
+
+            SELECT DISTINCT (?movie AS ?Movie)
+            WHERE {
+                ?movie a <http://dbpedia.org/ontology/Film> .
+                ?movie dct:subject ?subject.
+                FILTER( regex(str(?subject), "comedy","i" ))
+                ?movie dbo:director [dbo:birthDate ?dob].
+            } 
+            ORDER BY DESC(?movie)
+            OFFSET 
+        """ + str(offset)
+
+
+def __get_all_comedy_films_with_directors_birthday_before_1970_offset(offset: int = 0) -> str:
+    """
+    Build the SPARQL query that lists comedy films having a director whose birth year is below 1970.
+    :param offset: Number of result rows to skip; it is appended after OFFSET.
+    :return: The query text
+    """
+    return """
+            PREFIX dbo: <http://dbpedia.org/ontology/>
+            PREFIX dct: <http://purl.org/dc/terms/>
+            PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
+
+            SELECT DISTINCT (?movie AS ?Movie)
+            WHERE {
+                ?movie a <http://dbpedia.org/ontology/Film> .
+                ?movie dct:subject ?subject.
+                FILTER( regex(str(?subject), "comedy","i" ))
+                ?movie dbo:director [dbo:birthDate ?dob].
+            	FILTER (xsd:integer(substr(xsd:string(?dob), 0, 4)) < 1970).
+            }  
+            ORDER BY DESC(?movie)
+            OFFSET 
+        """ + str(offset)
+
+
+def __get_all_comedy_films_with_directors_birthday_after_or_on_1970_offset(offset: int = 0) -> str:
+    """
+    Build the SPARQL query that lists comedy films having a director whose birth year is 1970 or later.
+    :param offset: Number of result rows to skip; it is appended after OFFSET.
+    :return: The query text
+    """
+    return """
+            PREFIX dbo: <http://dbpedia.org/ontology/>
+            PREFIX dct: <http://purl.org/dc/terms/>
+            PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
+
+            SELECT DISTINCT (?movie AS ?Movie)
+            WHERE {
+                ?movie a <http://dbpedia.org/ontology/Film> .
+                ?movie dct:subject ?subject.
+                FILTER( regex(str(?subject), "comedy","i" ))
+                ?movie dbo:director [dbo:birthDate ?dob].
+            	FILTER (xsd:integer(substr(xsd:string(?dob), 0, 4)) >= 1970).
+            }  
+            ORDER BY DESC(?movie)
+            OFFSET 
+        """ + str(offset)
+
+
+def __get_interlinks_from_dbpedia_to_wikidata() -> str:
+    """
+    Build the SPARQL query that counts the selected films, their directors, names and wikidata.org links.
+    :return: The query text; it selects Movie, MovieLink, Title, Director, DirectorLink and DirectorName.
+    """
+    return """
+    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+    PREFIX owl: <http://www.w3.org/2002/07/owl#>
+    PREFIX dbc: <http://dbpedia.org/resource/Category:>
+    PREFIX dbo: <http://dbpedia.org/ontology/>
+    PREFIX dct: <http://purl.org/dc/terms/>
+    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
+    PREFIX foaf: <http://xmlns.com/foaf/0.1/>
+    PREFIX dbp:	<http://dbpedia.org/property/>
+
+    SELECT
+    (COUNT(DISTINCT ?movie) AS ?Movie)
+    (COUNT(DISTINCT ?same) AS ?MovieLink)
+    (COUNT(DISTINCT ?name) AS ?Title)
+    (COUNT(DISTINCT ?director) AS ?Director)
+    (COUNT(DISTINCT ?sameDirector) AS ?DirectorLink)
+    (COUNT(DISTINCT ?directorName) AS ?DirectorName)
+    WHERE {
+       {
+        SELECT DISTINCT ?movie
+        WHERE {
+            SELECT DISTINCT ?movie ?min_year ?max_year
+            WHERE {
+                SELECT DISTINCT ?movie ?director (MIN(?year) AS ?min_year) (MAX(?year) AS ?max_year)
+                WHERE{
+                        # X type Film
+                        {?movie rdf:type dbo:Film.}
+
+                        # Imagine a inner join
+
+                        # X subject Comedy
+                        {?movie dct:subject dbc:Comedy}
+                        UNION
+                        # X subject Y; Y like "Comedy".
+                        {?movie dct:subject ?y FILTER CONTAINS(lcase(str(?y)), "comedy").}
+                        UNION
+                        # X genre Y; Y like "Comedy".
+                        {?movie dbo:genre ?y FILTER CONTAINS (lcase(str(?y)), "comedy").}
+                        # X director  Person; Person birth date year
+                        {?movie dbo:director ?director.
+                        ?director dbo:birthDate ?dob.
+                        BIND(xsd:integer(substr(xsd:string(?dob), 0, 4)) as ?year) .
+                        }
+                } GROUP BY ?movie ?director
+            }GROUP BY ?movie HAVING (?min_year = ?max_year)
+        }GROUP BY ?movie HAVING (?min_year >= 1970)
+     }
+    OPTIONAL{?movie foaf:name ?name FILTER(LANG(?name)="en").}
+    OPTIONAL{
+        ?movie dbo:director ?director.
+        OPTIONAL{
+            ?director owl:sameAs ?sameDirector FILTER CONTAINS (LCASE(STR(?sameDirector)), 'wikidata.org').
+        }
+    }
+    OPTIONAL{
+        {?movie dbo:director ?director.
+        ?director rdfs:label ?directorName FILTER(LANG(?directorName)="en").}
+    }
+    OPTIONAL{?movie owl:sameAs ?same FILTER CONTAINS (LCASE(STR(?same)), 'wikidata.org').}
+}
+    """
+
+
+def get_statistics(entrypoint: str, query: Callable[[int], str], info: str) -> Dict:
+    """
+    Count the distinct movies returned by a query that is fetched page by page.
+
+    :param info: Label echoed back in the result under the key 'info'.
+    :param entrypoint: Entrypoint the queries are sent to.
+    :param query: Callable that builds the query text for a given offset.
+    :return: Dict with the keys 'info', 'entrypoint' and 'independent' (the distinct count).
+    """
+    # NOTE(review): the queries carry no LIMIT, so this presumably relies on the endpoint
+    # returning at most this many rows per request - confirm.
+    page_size = 10000
+    offset = 0
+    entities = set()  # a set removes duplicates within and across pages
+    while True:
+        engine = SPARQLEngine(entrypoint=entrypoint, query=query(offset))
+        results = engine.get_json_with_query()['results']['bindings']
+        if not results:
+            # An empty page means the offset has moved past the last row.
+            break
+
+        entities.update(Decapper(result[ResultKeys.movie.value]).unpack() for result in results)
+        offset += page_size
+
+    return {
+        'info': info,
+        'entrypoint': entrypoint,
+        'independent': len(entities)
+    }
+
+
+def get_interlinks(entrypoint: str, query: str, info: str) -> Dict:
+    """
+    Run the interlink query once and return its counters as integers.
+
+    :param info: Label echoed back in the result under the key 'info'.
+    :param entrypoint: Entrypoint the query is sent to.
+    :param query: The complete query text.
+    :return: Dict with 'info', 'entrypoint' and one integer per counter of the first result row.
+    """
+    response = SPARQLEngine(entrypoint=entrypoint, query=query).get_json_with_query()
+    row = response['results']['bindings'][0]  # look the first row up once instead of six times
+
+    columns = {'movies': ResultKeys.movie.value, 'moviesLinked': ResultKeys.movie_link.value,
+               'moviesNames': ResultKeys.title.value, 'directors': ResultKeys.director.value,
+               'directorsLinked': ResultKeys.director_link.value, 'directorsNames': 'DirectorName'}
+    result = {'info': info, 'entrypoint': entrypoint}
+    for name, column in columns.items():
+        # Each binding is unpacked with Decapper and converted to int.
+        result[name] = int(Decapper(row[column]).unpack())
+    return result
+
+
+def get_all_statistics() -> Dict:
+    """
+    Collect every DBPedia statistic under its own label.
+
+    :return: Dict mapping each label to the result of its query.
+    """
+    # Label and query builder of every paginated count, in output order.
+    counted = (
+        ('all_films_in_dbpedia', __get_all_films_with_offset),
+        ('all_comedy_films_in_dbpedia', __get_all_comedy_films_with_offset),
+        ('all_comedy_films_in_dbpedia_with_a_director_who_has_a_birthday',
+         __get_all_comedy_films_with_directors_birthday_offset),
+        ('all_comedy_films_in_dbpedia_with_a_director_who_has_a_birthday_before_1970',
+         __get_all_comedy_films_with_directors_birthday_before_1970_offset),
+        ('all_comedy_films_in_dbpedia_with_a_director_who_has_a_birthday_after_or_on_1970',
+         __get_all_comedy_films_with_directors_birthday_after_or_on_1970_offset),
+    )
+    collected = {label: get_statistics(DBPEDIA_ENTRYPOINT, builder, label) for label, builder in counted}
+    # The interlink query takes no offset, so it is built once and handed over as text.
+    interlinks = 'interlinks_from_dbpedia_to_wikidata'
+    collected[interlinks] = get_interlinks(DBPEDIA_ENTRYPOINT, __get_interlinks_from_dbpedia_to_wikidata(),
+                                           interlinks)
+    return collected
diff --git a/doc/swagger-ui/index.html b/doc/swagger-ui/index.html
index dfbd08a..fc747b8 100755
--- a/doc/swagger-ui/index.html
+++ b/doc/swagger-ui/index.html
@@ -39,7 +39,7 @@
     window.onload = function() {
       // Begin Swagger UI call region
       const ui = SwaggerUIBundle({
-        url: "../static/swagger/api_doc.yaml",
+        url: "../static/swagger/docs.yaml",
         dom_id: '#swagger-ui',
         deepLinking: true,
         presets: [
diff --git a/src/lib/keys.py b/src/lib/keys.py
index fcb1f65..c22460e 100644
--- a/src/lib/keys.py
+++ b/src/lib/keys.py
@@ -18,8 +18,3 @@ class ResultKeys(Enum):
     production_company = 'ProductionCompany'
     production_companies = 'ProductionCompanies'
     line_separator = '|'
-
-
-class TitleKey(Enum):
-    title = 'Title'
-    label = 'Label'
diff --git a/static/swagger/api_doc.yaml b/static/swagger/docs.yaml
similarity index 88%
rename from static/swagger/api_doc.yaml
rename to static/swagger/docs.yaml
index 93f21ab..364cab3 100644
--- a/static/swagger/api_doc.yaml
+++ b/static/swagger/docs.yaml
@@ -96,6 +96,22 @@ paths:
         In the first instance the owl:sameAs is used.
         In the second instance the name of the film and the directors are used.
         If two films share the same name and at least one director they are the same.
+      tags:
+        - Coreferences
+      responses:
+        200:
+          description: OK
+          content:
+            application/json:
+              schema:
+                properties:
+                  type: array
+                  items:
+                    type: string
+
+  /dbpedia/statistics:
+    get:
+      summary: This route returns all DBPedia statistics, namely the film counts and the interlink counts from DBPedia to Wikidata.
       tags:
         - Coreferences
       responses:
-- 
GitLab