Skip to content

Commit

Permalink
Merge pull request #534 from datactive/dev_scraping_abc
Browse files Browse the repository at this point in the history
Idea on abstraction of scraping classes
  • Loading branch information
Christovis authored Jan 5, 2022
2 parents 2c41b5c + 9ef9816 commit 78f8e1b
Show file tree
Hide file tree
Showing 19 changed files with 2,229 additions and 1,419 deletions.
2 changes: 2 additions & 0 deletions bigbang/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
from . import ingress
from . import analysis
from . import visualisation
from . import bigbang_io
115 changes: 66 additions & 49 deletions bigbang/analysis/listserv.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,7 @@

from config.config import CONFIG

from bigbang.bigbang_io import (
ListservMessageIO,
ListservListIO,
ListservArchiveIO,
)
import bigbang.bigbang_io as bio
from bigbang.utils import (
get_paths_to_files_in_directory,
get_paths_to_dirs_in_directory,
Expand Down Expand Up @@ -59,19 +55,19 @@
# =?utf-8?b?J++9llNSVkND?=


class ListservListWarning(BaseException):
"""Base class for Archive class specific exceptions"""
class ListservMailListWarning(BaseException):
"""Base class for ListservMailList class specific exceptions"""

pass


class ListservArchiveWarning(BaseException):
"""Base class for Archive class specific exceptions"""
class ListservMailListDomainWarning(BaseException):
"""Base class for ListservMailListDomain class specific exceptions"""

pass


class ListservList:
class ListservMailList:
"""
Note
----
Expand Down Expand Up @@ -129,8 +125,8 @@ def from_mbox(
name: str,
filepath: str,
include_body: bool = True,
) -> "ListservList":
df = ListservListIO.from_mbox_to_pandas_dataframe(filepath)
) -> "ListservMailList":
df = bio.mlist_from_mbox_to_pandas_dataframe(filepath)
return cls.from_pandas_dataframe(df, name, filepath)

@classmethod
Expand All @@ -139,7 +135,7 @@ def from_pandas_dataframe(
df: pd.DataFrame,
name: Optional[str] = None,
filepath: Optional[str] = None,
) -> "ListservList":
) -> "ListservMailList":
return cls(name, filepath, df)

@staticmethod
Expand All @@ -164,10 +160,15 @@ def contract(count: np.array, label: list, contract: float) -> dict:
@staticmethod
def get_name_localpart_domain(string: str) -> tuple:
"""
Split an address field which has a format as
'Heinrich von Kleist <Heinrich.vonKleist@SELBST.ORG>' into name,
Split an address field which (ideally) has the format
'Heinrich von Kleist <Heinrich.vonKleist@SELBST.org>' into name,
local-part, and domain.
All strings are returned in lower case to avoid duplicates.
Note
----
Test whether the incorporation of email.utils.parseaddr() can improve
this function.
"""
# test if name is in string or if address is duplicated
test = string.split(" ")
Expand All @@ -176,19 +177,27 @@ def get_name_localpart_domain(string: str) -> tuple:
and all("@" in st for st in test)
and (test[0] == test[1])
):
# identifies addresses of the form:
# localpart@domain localpart@domain
addr = test[0]
localpart, domain = addr.split("@")
return None, localpart.lower(), domain.lower()
elif (len(test) > 2) and "@" in test[-1]:
elif (len(test) > 2) and ("@" in test[-1]):
# identifies addresses of the form:
# abc def ghi localpart@domain
name = (" ").join(test[:-1])
name = name.replace('"', "").replace('"', "")
addr = test[-1]
addr = addr.replace("<", "").replace(">", "")
localpart, domain = addr.split("@")
return name.lower(), localpart.lower(), domain.lower()
else:
name, addr = email.utils.parseaddr(string)
addr = addr.split(" ")[-1]
# identifies addresses of the form:
# abc localpart@domain
name = string.split(" ")[0]
name = name.replace('"', "")
addr = string.split(" ")[-1]
addr = addr.replace("<", "").replace(">", "")
if "@" in addr:
localpart, domain = addr.split("@")
return name.lower(), localpart.lower(), domain.lower()
Expand All @@ -203,12 +212,16 @@ def get_name_localpart_domain(string: str) -> tuple:
def iterator_name_localpart_domain(li: list) -> tuple:
"""Generator for the self.get_name_localpart_domain() function."""
for string in li:
if ("<" in string) and (">" in string):
# identify whether there are multiple addresses in one header field
if (string.count("<") > 1) and (string.count(">") > 1):
addresses = re.findall(r"(?<=\<).+?(?=\>)", string)
for addr in addresses:
yield ListservList.get_name_localpart_domain(addr)
if "@" in addr:
yield ListservMailList.get_name_localpart_domain(addr)
else:
continue
else:
yield ListservList.get_name_localpart_domain(string)
yield ListservMailList.get_name_localpart_domain(string)

def period_of_activity(self) -> list:
"""
Expand All @@ -222,7 +235,7 @@ def period_of_activity(self) -> list:
]
return period_of_activity

def crop_by_year(self, yrs: Union[int, list]) -> "ListservList":
def crop_by_year(self, yrs: Union[int, list]) -> "ListservMailList":
"""
Filter `self.df` DataFrame by year in message date.
Expand All @@ -233,7 +246,7 @@ def crop_by_year(self, yrs: Union[int, list]) -> "ListservList":
Returns
-------
`ListservList` object cropped to specification.
`ListservMailList` object cropped to specification.
"""
index = get_index_of_msgs_with_datetime(self.df)
_df = self.df.loc[index]
Expand All @@ -244,7 +257,7 @@ def crop_by_year(self, yrs: Union[int, list]) -> "ListservList":
(dt.year >= min(yrs)) & (dt.year < max([yrs]))
for dt in _df["date"].values
]
return ListservList.from_pandas_dataframe(
return ListservMailList.from_pandas_dataframe(
df=_df.loc[mask],
name=self.name,
filepath=self.filepath,
Expand All @@ -254,7 +267,7 @@ def crop_by_address(
self,
header_field: str,
per_address_field: Dict[str, List[str]],
) -> "ListservList":
) -> "ListservMailList":
"""
Parameters
----------
Expand All @@ -267,13 +280,13 @@ def crop_by_address(
Returns
-------
`ListservList` object cropped to specification.
`ListservMailList` object cropped to specification.
"""
mlist = self.df.dropna(subset=["from"])
if "domain" in list(per_address_field.keys()):
_addr = pd.Series(
[
ListservList.get_name_localpart_domain(addr)[-1]
ListservMailList.get_name_localpart_domain(addr)[-1]
for addr in mlist["from"].values
],
index=mlist.index.values,
Expand All @@ -283,20 +296,20 @@ def crop_by_address(
if "localpart" in list(per_address_field.keys()):
_addr = pd.Series(
[
ListservList.get_name_localpart_domain(addr)[1]
ListservMailList.get_name_localpart_domain(addr)[1]
for addr in mlist["from"].values
],
index=mlist.index.values,
).dropna()
mlist = mlist.loc[_addr.index]
mlist = mlist[_addr.isin(per_address_field["localpart"])]
return ListservList.from_pandas_dataframe(
return ListservMailList.from_pandas_dataframe(
df=mlist,
name=self.name,
filepath=self.filepath,
)

def crop_by_subject(self, match=str, place: int = 2) -> "ListservList":
def crop_by_subject(self, match=str, place: int = 2) -> "ListservMailList":
"""
Parameters
----------
Expand All @@ -308,7 +321,7 @@ def crop_by_subject(self, match=str, place: int = 2) -> "ListservList":
Returns
-------
`ListservList` object cropped to message subject.
`ListservMailList` object cropped to message subject.
"""
index = get_index_of_msgs_with_subject(self.df)
_df = self.df.loc[index]
Expand All @@ -333,7 +346,7 @@ def crop_by_subject(self, match=str, place: int = 2) -> "ListservList":
else False
)
mask = _df["subject"].apply(func).values # returns bool-type array
return ListservList.from_pandas_dataframe(
return ListservMailList.from_pandas_dataframe(
df=_df.loc[mask],
name=self.name,
filepath=self.filepath,
Expand Down Expand Up @@ -365,7 +378,7 @@ def get_domains(
df = self.df
domains = {}
for header_field in header_fields:
generator = ListservList.iterator_name_localpart_domain(
generator = ListservMailList.iterator_name_localpart_domain(
df[header_field].dropna().values
)
# collect domain labels
Expand Down Expand Up @@ -443,7 +456,7 @@ def get_localparts(
if per_domain:
# TODO: Needs two for loop. Find way to reduce it.
localparts[header_field] = {}
generator = ListservList.iterator_name_localpart_domain(
generator = ListservMailList.iterator_name_localpart_domain(
df[header_field].dropna().values
)
_domains = list(
Expand All @@ -456,7 +469,7 @@ def get_localparts(
)
)
_localparts = {d: [] for d in _domains}
generator = ListservList.iterator_name_localpart_domain(
generator = ListservMailList.iterator_name_localpart_domain(
df[header_field].dropna().values
)
for _, localpart, domain in generator:
Expand All @@ -469,7 +482,7 @@ def get_localparts(
_localparts[_domain] = _dolpsu
localparts[header_field] = _localparts
else:
generator = ListservList.iterator_name_localpart_domain(
generator = ListservMailList.iterator_name_localpart_domain(
df[header_field].dropna().values
)
_localparts = [
Expand Down Expand Up @@ -653,7 +666,7 @@ def get_threadsroot(
}
# sort into address field
if per_address_field:
generator = ListservList.iterator_name_localpart_domain(
generator = ListservMailList.iterator_name_localpart_domain(
_df["from"].values
)
if "domain" in per_address_field:
Expand Down Expand Up @@ -839,13 +852,15 @@ def get_sender_receiver_dict(
dic = {lp: {} for lp in localparts}
# loop through messages
for index, row in df.iterrows():
_, f_localpart, f_domain = ListservList.get_name_localpart_domain(
row["from"]
)
(
_,
f_localpart,
f_domain,
) = ListservMailList.get_name_localpart_domain(row["from"])
if f_domain is None:
continue

ct_generator = ListservList.iterator_name_localpart_domain(
ct_generator = ListservMailList.iterator_name_localpart_domain(
[df.loc[index]["comments-to"]]
)
for _, ct_localpart, ct_domain in ct_generator:
Expand Down Expand Up @@ -910,7 +925,7 @@ def create_sender_receiver_digraph(
):
"""
Create directed graph from messaging network created with
ListservList.get_sender_receiver_dict().
ListservMailList.get_sender_receiver_dict().
Parameters
----------
Expand Down Expand Up @@ -1012,16 +1027,16 @@ def get_graph_prop_per_domain_per_year(
return dic_evol


class ListservArchive:
class ListservMailListDomain:
"""
Parameters
----------
name
The of whom the archive is (e.g. 3GPP, IEEE, ...)
The entity to whom the mail list domain belongs (e.g. 3GPP, IEEE, ...)
filedsc
The file description of the archive
The file description of the mail list domain
lists
A list containing the mailing lists as `ListservList` types
A list containing the mailing lists as `ListservMailList` types
Methods
-------
Expand All @@ -1039,13 +1054,15 @@ def from_mbox(
name: str,
directorypath: str,
filedsc: str = "*.mbox",
) -> "ListservArchive":
) -> "ListservMailListDomain":
filepaths = get_paths_to_files_in_directory(directorypath, filedsc)
if len(filepaths) > 0:
ListservArchiveWarning("No files found fitting the description")
ListservMailListDomainWarning(
"No files found fitting the description"
)
for count, filepath in enumerate(filepaths):
name = filepath.split("/")[-1].split(".")[0]
mlist = ListservList.from_mbox(name, filepath).df
mlist = ListservMailList.from_mbox(name, filepath).df
mlist["mailing-list"] = name
if count == 0:
mlists = mlist
Expand Down
Loading

0 comments on commit 78f8e1b

Please sign in to comment.