From ee1e613061be5c666138b06010177c957c57730c Mon Sep 17 00:00:00 2001 From: feger <marc.feger@hhu.de> Date: Fri, 17 May 2019 13:36:47 +0200 Subject: [PATCH] Add check for valid published data; move melter --- src/main.py | 2 +- melter.py => src/melter.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) rename melter.py => src/melter.py (95%) diff --git a/src/main.py b/src/main.py index db7a9ab..8855ca7 100644 --- a/src/main.py +++ b/src/main.py @@ -1,7 +1,7 @@ from tokenize import String from typing import Dict, List -from melter import melt_wikidata_and_dbpedia, clean_wikidata +from src.melter import melt_wikidata_and_dbpedia, clean_wikidata from src import WIKIDATA_ENTRYPOINT, DBPEDIA_ENTRYPOINT from src.lib.composer import Composer from src.lib.decapper import Decapper diff --git a/melter.py b/src/melter.py similarity index 95% rename from melter.py rename to src/melter.py index 19a40d5..621a024 100644 --- a/melter.py +++ b/src/melter.py @@ -149,6 +149,7 @@ def clean_wikidata() -> List: for data in melted_data: movie = Decapper(data[ResultKeys.movie.value]).unpack() duration = Decapper(data[ResultKeys.duration.value]).unpack() + published = Decapper(data[ResultKeys.published.value]).unpack() if movie in bad_films: continue if duration: @@ -156,6 +157,11 @@ def clean_wikidata() -> List: continue if not all([(int(i) >= 0) for i in duration.split(ResultKeys.line_separator.value)]): continue + if published: + if not all( + [(i[10] == 'T' and i[19] == 'Z' and i.count(':') == 2 and i.count('-') == 2) + for i in published.split(ResultKeys.line_separator.value)]): + continue cleaned += [data] FileWriter(destination='./static/wikidata/cleaned.txt', data=cleaned).as_json_of_list() return cleaned -- GitLab