From 46df55a34966d25ab1c495d28a15c8956e12f381 Mon Sep 17 00:00:00 2001
From: zqwerty <zhuq96@hotmail.com>
Date: Fri, 15 Jul 2022 10:41:52 +0800
Subject: [PATCH] Remove Reddit dialogs that contain URLs

---
 data/unified_datasets/reddit/preprocess.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/data/unified_datasets/reddit/preprocess.py b/data/unified_datasets/reddit/preprocess.py
index 4aa1f03f..35824e8a 100644
--- a/data/unified_datasets/reddit/preprocess.py
+++ b/data/unified_datasets/reddit/preprocess.py
@@ -40,6 +40,8 @@ def preprocess():
             if len(utterance) > 256:
                 # remove dialogs that contain too long utterances
                 return None
+            if 'http://' in utterance or 'https://' in utterance:
+                return None
             speaker = 'system' if i % 2 == 1 else 'user'
             turn = {
                 'speaker': speaker,
-- 
GitLab