diff --git a/src/main.py b/src/main.py index db7a9ab737e5484aa0391ee0a8a40d9e05000025..8855ca71d9b5d849bd381fe67904bb8c3fa8d1bb 100644 --- a/src/main.py +++ b/src/main.py @@ -1,7 +1,7 @@ from tokenize import String from typing import Dict, List -from melter import melt_wikidata_and_dbpedia, clean_wikidata +from src.melter import melt_wikidata_and_dbpedia, clean_wikidata from src import WIKIDATA_ENTRYPOINT, DBPEDIA_ENTRYPOINT from src.lib.composer import Composer from src.lib.decapper import Decapper diff --git a/melter.py b/src/melter.py similarity index 95% rename from melter.py rename to src/melter.py index 19a40d51a244e60d0e131ff6a08ba67555a1662e..621a024a7d6abd7e3f3e7339cf0d807cec309c09 100644 --- a/melter.py +++ b/src/melter.py @@ -149,6 +149,7 @@ def clean_wikidata() -> List: for data in melted_data: movie = Decapper(data[ResultKeys.movie.value]).unpack() duration = Decapper(data[ResultKeys.duration.value]).unpack() + published = Decapper(data[ResultKeys.published.value]).unpack() if movie in bad_films: continue if duration: @@ -156,6 +157,11 @@ def clean_wikidata() -> List: continue if not all([(int(i) >= 0) for i in duration.split(ResultKeys.line_separator.value)]): continue + if published: + if not all( + [(i[10] == 'T' and i[19] == 'Z' and i.count(':') == 2 and i.count('-') == 2) + for i in published.split(ResultKeys.line_separator.value)]): + continue cleaned += [data] FileWriter(destination='./static/wikidata/cleaned.txt', data=cleaned).as_json_of_list() return cleaned