From 46df55a34966d25ab1c495d28a15c8956e12f381 Mon Sep 17 00:00:00 2001 From: zqwerty <zhuq96@hotmail.com> Date: Fri, 15 Jul 2022 10:41:52 +0800 Subject: [PATCH] rm reddit dial that have url --- data/unified_datasets/reddit/preprocess.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/data/unified_datasets/reddit/preprocess.py b/data/unified_datasets/reddit/preprocess.py index 4aa1f03f..35824e8a 100644 --- a/data/unified_datasets/reddit/preprocess.py +++ b/data/unified_datasets/reddit/preprocess.py @@ -40,6 +40,8 @@ def preprocess(): if len(utterance) > 256: # remove dialogs that contain too long utterances return None + if 'http://' in utterance or 'https://' in utterance: + return None speaker = 'system' if i % 2 == 1 else 'user' turn = { 'speaker': speaker, -- GitLab