diff --git a/data/unified_datasets/reddit/preprocess.py b/data/unified_datasets/reddit/preprocess.py index 4aa1f03fe584590645bf2b0d7b9548581baedff4..35824e8a023b7bbaead5593487c6cf427df43a48 100644 --- a/data/unified_datasets/reddit/preprocess.py +++ b/data/unified_datasets/reddit/preprocess.py @@ -40,6 +40,8 @@ def preprocess(): if len(utterance) > 256: # remove dialogs that contain too long utterances return None + if 'http://' in utterance or 'https://' in utterance: + return None speaker = 'system' if i % 2 == 1 else 'user' turn = { 'speaker': speaker,