-
Notifications
You must be signed in to change notification settings - Fork 0
/
process.py
98 lines (77 loc) · 3.26 KB
/
process.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import numpy as np
import pandas as pd
# Load the chat export, using the first CSV column as the index, and discard
# the 'import' column.
csv_path = "chat_pp_sample.csv"
df = pd.read_csv(csv_path, index_col=0).drop(columns=['import'])
from textblob import TextBlob
# Words and phrases to look for in the chat messages.
token = ['love', 'I miss you', 'baby', 'babe', 'honey', 'I love you', 'darling', 'soulmate']

# Token find: map each token to the index labels of the messages containing it.
# regex=False matches the tokens as literal text rather than as patterns;
# na=False treats missing messages as non-matches so the boolean mask never
# contains NaN (which would make the row selection fail).
token_index = {
    tok: df[df['message_text'].str.contains(tok, regex=False, na=False)].index.tolist()
    for tok in token
}
# Score every message individually with TextBlob and store the results as two
# new columns. The columns are built in one pass and assigned whole, instead of
# writing single cells with `df.at` while iterating the frame: that is
# positional (so duplicate index labels cannot cross-contaminate rows) and it
# creates the columns even when the frame is empty.
message_sentiments = [TextBlob(text).sentiment for text in df['message_text']]
df['polarity'] = [sentiment.polarity for sentiment in message_sentiments]
df['subjectivity'] = [sentiment.subjectivity for sentiment in message_sentiments]

# Five highest-scoring messages for each measure.
polarity_df = df.sort_values(by=['polarity'], ascending=False).head(5)
subjectivyty_df = df.sort_values(by=['subjectivity'], ascending=False).head(5)
### Conversation segmentation
# Assign every message a numeric session id, stored in the 'Session' column.
time_gap_threshold = pd.Timedelta(minutes=30)
df['Timestamp'] = pd.to_datetime(df['date'] + ' ' + df['time'], format='mixed')
session_id = 0
session_start_time = df.iloc[0]['Timestamp']
df['Session'] = None
for index, row in df.iterrows():
    # NOTE(review): the elapsed time is measured from the first message of the
    # current session, not from the previous message, so a continuous exchange
    # longer than the threshold is split into several sessions. Confirm this
    # is intended before relying on the session boundaries.
    if row['Timestamp'] - session_start_time > time_gap_threshold:
        session_id += 1
        session_start_time = row['Timestamp']
    # Single write per row; it covers both the "new session" and the
    # "same session" case.
    df.at[index, 'Session'] = session_id
# Score each session as a whole: sentiment of the concatenated session text
# plus an "appreciation" count of how many distinct tokens occur in it.
session_polarity = {}
session_subjectivity = {}
session_appreciation = {}
for session_id, session_group in df.groupby('Session'):
    corpus = ' '.join(session_group['message_text'])
    sentiment = TextBlob(corpus).sentiment
    session_polarity[session_id] = sentiment.polarity
    session_subjectivity[session_id] = sentiment.subjectivity
    # Each token counts at most once per session (substring test on the text).
    session_appreciation[session_id] = sum(tok in corpus for tok in token)

# Write each session's scores onto the rows of that session. Previously the
# scores were written to `df.at[index, ...]`, where `index` was a leftover
# loop variable, so every session overwrote the same single row and
# 'appreciation' stayed NaN everywhere else.
df['polarity'] = df['Session'].map(session_polarity)
df['subjectivity'] = df['Session'].map(session_subjectivity)
df['appreciation'] = df['Session'].map(session_appreciation)

# NOTE(review): all rows of a session now share the same scores, so the five
# top rows below can all come from one session.
polarity_df = df.sort_values(by=['polarity'], ascending=False).head(5)
subjectivyty_df = df.sort_values(by=['subjectivity'], ascending=False).head(5)
appreciation_df = df.sort_values(by=['appreciation'], ascending=False).head(5)
# For every top-ranked row, collect the surrounding messages of its session.
conversation_dfs = []  # One DataFrame of context messages per top-ranked row
context_size = 4
for ranked_df in (subjectivyty_df, appreciation_df):
    for loc_index in ranked_df.index:
        session_id = ranked_df.at[loc_index, 'Session']
        filtered_df = df[df['Session'] == session_id]
        # `loc_index` is an index label of `df`, whereas `.iloc` needs a
        # position inside the session frame; using the label directly selected
        # the wrong (often empty) window. Translate label -> position first.
        position = int(np.flatnonzero(filtered_df.index == loc_index)[0])
        window_start = max(0, position - context_size)
        # The slice end is exclusive: up to `context_size` messages before the
        # row, the row itself, and up to `context_size - 1` messages after it.
        window_end = min(len(filtered_df), position + context_size)
        context_df = filtered_df.iloc[window_start:window_end]
        context_df = context_df.filter(items=['date', 'time', 'name', 'message_text'])
        context_df = context_df.reset_index(drop=True)
        conversation_dfs.append(context_df)
# Serialise every non-empty conversation window to JSON, collect the strings
# under response["data"], and echo them to stdout.
response = {
    "data": [],
    "plots": []
}
for frame in conversation_dfs:
    if frame.shape[0] == 0:
        # Skip empty windows; the previous `pass` here was a no-op, so empty
        # frames were still serialised and appended.
        continue
    conv_json1 = frame.to_json(orient="index")
    response["data"].append(conv_json1)
    print(conv_json1)
    print("new conv\n\n")