Skip to content
Snippets Groups Projects
Commit d8d08fbb authored by Jan Lukas Steimann's avatar Jan Lukas Steimann
Browse files

Initial commit

parents
Branches
Tags
No related merge requests found
Pipeline #64531 failed
*.csv
__pycache__/
venv
NewYorkTimesComments/
.idea/
\ No newline at end of file
# CI configuration: runs the comment-embedding script in a Python 3.7 image.
image: "python:3.7"
# Single job: install the dependencies with pip, then run the script.
python_run:
script:
- pip install flair
- pip install torch
- pip install pandas
# NOTE(review): the script reads NewYorkTimesComments/CommentsApril2017_DEV.csv,
# which matches the ignore patterns "*.csv" and "NewYorkTimesComments/" listed
# in this commit - confirm the data file is actually available to the job.
- python3 ComputeCommentEmbeddings.py
\ No newline at end of file
import torch
from flair.data import Sentence
from flair.embeddings import TransformerDocumentEmbeddings
from tqdm import tqdm
from NewYorkTimesDataset import get_argument_list
def compute_embedding(comments: dict) -> dict:
    """Attach a document embedding to every comment, in place.

    Args:
        comments: Mapping of article id to a list of comment dicts. Each
            comment dict must provide the text to embed under the key
            "commentBody".

    Returns:
        The same ``comments`` mapping that was passed in. Every comment dict
        gains an "embedding" entry holding the embedding computed by the
        'bert-large-uncased' document-embedding model for its body.
    """
    if not comments:
        # Nothing to embed, so avoid loading the pretrained model at all.
        return comments

    # The model does not depend on the article: build it once instead of
    # re-instantiating it on every iteration of the outer loop.
    embedding_model = TransformerDocumentEmbeddings('bert-large-uncased', fine_tune=False)
    for article_comments in tqdm(comments.values()):
        for comment in article_comments:
            sentence = Sentence(comment["commentBody"])
            embedding_model.embed(sentence)
            comment["embedding"] = sentence.embedding
    return comments
if __name__ == '__main__':
    # Load the comments of the development split, grouped per article.
    grouped_comments = get_argument_list(
        path="",
        filename="NewYorkTimesComments/CommentsApril2017_DEV.csv",
    )
    # Embed every comment and persist the enriched mapping to disk.
    embedded_comments = compute_embedding(grouped_comments)
    torch.save(embedded_comments, "embedded_comments.pt")
from typing import List
import pandas as pd
def read_article_from_dataset(path: str, filename: str) -> pd.DataFrame:
    """Load an article CSV file and keep only the article columns.

    The file location is the plain concatenation ``path + filename``. The
    returned frame holds the columns "articleID", "headline" and "keywords",
    in that order.
    """
    article_columns = ["articleID", "headline", "keywords"]
    full_frame = pd.read_csv(path + filename)
    return full_frame[article_columns]
def get_article_list(path: str, filename: str) -> dict:
    """Index the articles of a dataset file by their article id.

    Returns a dict mapping each row's ``articleID`` to the row tuple produced
    by ``DataFrame.itertuples()``. If an id occurs more than once, the last
    row with that id is the one kept.
    """
    articles_by_id = {}
    for row in read_article_from_dataset(path, filename).itertuples():
        articles_by_id[row.articleID] = row
    return articles_by_id
def read_arguments_from_dataset(path: str, filename: str) -> pd.DataFrame:
    """Load a comment CSV file and keep only the comment columns.

    The file location is the plain concatenation ``path + filename``. The
    returned frame holds the columns "commentID", "commentBody",
    "commentSequence", "articleID" and "parentID", in that order.
    """
    comment_columns = [
        "commentID",
        "commentBody",
        "commentSequence",
        "articleID",
        "parentID",
    ]
    full_frame = pd.read_csv(path + filename)
    return full_frame[comment_columns]
def get_argument_list(path: str, filename: str) -> dict:
    """Group the comments of a dataset file by the article they belong to.

    Returns a dict mapping each ``articleID`` to a list of comment dicts with
    the keys "commentID", "commentBody", "commentSequence" and "parentID".
    Within each list the comments keep the row order of the file.
    """
    comments_by_article = {}
    for row in read_arguments_from_dataset(path, filename).itertuples():
        record = {
            "commentID": row.commentID,
            "commentBody": row.commentBody,
            "commentSequence": row.commentSequence,
            "parentID": row.parentID,
        }
        # setdefault creates the per-article list on first sight of the id.
        comments_by_article.setdefault(row.articleID, []).append(record)
    return comments_by_article
def get_discussion_for_article(arguments: List) -> dict:
    """Group the comments of one article by the comment they reply to.

    Args:
        arguments: Comment dicts providing "parentID", "commentID" and
            "commentBody".

    Returns:
        A dict mapping each parent id to a list of ``[commentID, commentBody]``
        pairs, in the order the comments appear in ``arguments``.
    """
    replies_by_parent = {}
    for entry in arguments:
        reply = [entry["commentID"], entry["commentBody"]]
        replies_by_parent.setdefault(entry["parentID"], []).append(reply)
    return replies_by_parent
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment