Skip to content
Snippets Groups Projects
Commit d8d08fbb authored by Jan Lukas Steimann's avatar Jan Lukas Steimann
Browse files

Initial commit

parents
No related branches found
No related tags found
No related merge requests found
Pipeline #64531 failed
*.csv
__pycache__/
venv
NewYorkTimesComments/
.idea/
\ No newline at end of file
# GitLab CI configuration: one job that installs the dependencies and runs
# the comment-embedding script.
# NOTE(review): nesting indentation appears to be missing in this view;
# `script:` is presumably meant to be nested under `python_run:` — confirm
# against the repository file.
image: "python:3.7"
python_run:
script:
# Install the third-party packages the script imports.
- pip install flair
- pip install torch
- pip install pandas
# Entry point: computes the embeddings and writes embedded_comments.pt.
- python3 ComputeCommentEmbeddings.py
\ No newline at end of file
import torch
from flair.data import Sentence
from flair.embeddings import TransformerDocumentEmbeddings
from tqdm import tqdm
from NewYorkTimesDataset import get_argument_list
def compute_embedding(comments: dict) -> dict:
    """Attach a document embedding to every comment, in place.

    Args:
        comments: Mapping of article ID to a list of comment dicts. Each
            comment dict must contain a ``"commentBody"`` entry, which is
            passed to ``flair.data.Sentence`` (presumably a non-empty
            string; verify against the dataset loader).

    Returns:
        The same ``comments`` mapping that was passed in. Every comment dict
        has gained an ``"embedding"`` entry holding the embedding that the
        model stored on its ``Sentence``.
    """
    if not comments:
        # Nothing to embed, so do not pay for constructing the model.
        return comments

    # Build the embedding model once and reuse it for every article; the
    # model does not depend on the article, so constructing it inside the
    # loop only repeats the (expensive) setup.
    embedding_model = TransformerDocumentEmbeddings('bert-large-uncased', fine_tune=False)

    for article_comments in tqdm(comments.values()):
        for comment in article_comments:
            sentence = Sentence(comment["commentBody"])
            embedding_model.embed(sentence)
            comment["embedding"] = sentence.embedding
    return comments
if __name__ == "__main__":
    # Script entry point: load the DEV comment set grouped by article,
    # embed every comment, and persist the result with torch.save.
    dev_comments = get_argument_list(
        path="",
        filename="NewYorkTimesComments/CommentsApril2017_DEV.csv",
    )
    embedded_comments = compute_embedding(dev_comments)
    torch.save(embedded_comments, "embedded_comments.pt")
from typing import List
import pandas as pd
def read_article_from_dataset(path: str, filename: str) -> pd.DataFrame:
    """Load the article columns of a CSV file.

    Args:
        path: Prefix prepended verbatim to ``filename`` (no separator is
            inserted).
        filename: Name of the CSV file to read.

    Returns:
        A DataFrame restricted to the columns ``articleID``, ``headline``
        and ``keywords``, in that order.
    """
    article_columns = ["articleID", "headline", "keywords"]
    full_frame = pd.read_csv(path + filename)
    return full_frame[article_columns]
def get_article_list(path: str, filename: str) -> dict:
    """Index the articles of a CSV file by their article ID.

    Args:
        path: Prefix prepended verbatim to ``filename``.
        filename: Name of the articles CSV file.

    Returns:
        A dict mapping each ``articleID`` to the row tuple produced by
        ``DataFrame.itertuples`` for that article. If an ID occurs more than
        once, the last row with that ID wins.
    """
    dataset = read_article_from_dataset(path, filename)
    # Iterate the rows lazily; there is no need to materialize them in a
    # list just to walk over them once.
    return {article.articleID: article for article in dataset.itertuples()}
def read_arguments_from_dataset(path: str, filename: str) -> pd.DataFrame:
    """Load the comment columns of a CSV file.

    Args:
        path: Prefix prepended verbatim to ``filename`` (no separator is
            inserted).
        filename: Name of the CSV file to read.

    Returns:
        A DataFrame restricted to the columns ``commentID``,
        ``commentBody``, ``commentSequence``, ``articleID`` and
        ``parentID``, in that order.
    """
    comment_columns = [
        "commentID",
        "commentBody",
        "commentSequence",
        "articleID",
        "parentID",
    ]
    full_frame = pd.read_csv(path + filename)
    return full_frame[comment_columns]
def get_argument_list(path: str, filename: str) -> dict:
    """Group the comments of a CSV file by the article they belong to.

    Args:
        path: Prefix prepended verbatim to ``filename``.
        filename: Name of the comments CSV file.

    Returns:
        A plain dict mapping each ``articleID`` to the list of its comments
        in file order. Every comment is a dict with the keys ``commentID``,
        ``commentBody``, ``commentSequence`` and ``parentID``.
    """
    dataset = read_arguments_from_dataset(path, filename)
    article_argument_dict = {}
    for argument_tuple in dataset.itertuples():
        argument = {
            "commentID": argument_tuple.commentID,
            "commentBody": argument_tuple.commentBody,
            "commentSequence": argument_tuple.commentSequence,
            "parentID": argument_tuple.parentID,
        }
        # setdefault creates the per-article list the first time an article
        # is seen, replacing the manual membership test and else-branch.
        article_argument_dict.setdefault(argument_tuple.articleID, []).append(argument)
    return article_argument_dict
def get_discussion_for_article(arguments: List) -> dict:
    """Group the comments of one article by the comment they reply to.

    Args:
        arguments: Comment dicts, each providing the keys ``parentID``,
            ``commentID`` and ``commentBody``.

    Returns:
        A dict mapping each ``parentID`` to a list of
        ``[commentID, commentBody]`` pairs, in the order the comments appear
        in ``arguments``. An empty input yields an empty dict.
    """
    discussion = {}
    for argument in arguments:
        reply = [argument["commentID"], argument["commentBody"]]
        # setdefault creates the reply list on the first reply to a parent.
        discussion.setdefault(argument["parentID"], []).append(reply)
    return discussion
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment