Skip to content

Commit

Permalink
refactored
Browse files Browse the repository at this point in the history
  • Loading branch information
joeyism committed Oct 26, 2019
1 parent ca15855 commit 865a05a
Show file tree
Hide file tree
Showing 4 changed files with 54 additions and 67 deletions.
47 changes: 18 additions & 29 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,24 +13,24 @@ To get a company's latest 5 10-Ks, run
``` python
from edgar import Company
company = Company("Oracle Corp", "0001341439")
tree = company.getAllFilings(filingType = "10-K")
docs = edgar.getDocuments(tree, noOfDocuments=5)
tree = company.get_all_filings(filing_type = "10-K")
docs = edgar.get_documents(tree, no_of_documents=5)
```
or
```python
from edgar import Company, TXTML

company = Company("INTERNATIONAL BUSINESS MACHINES CORP", "0000051143")
doc = company.get10K()
text = TXTML.parseFull10K(doc)
doc = company.get_10K()
text = TXTML.parse_full_10K(doc)
```

To get all companies and find a specific one, run

``` python
from edgar import Edgar
edgar = Edgar()
possible_companies = edgar.findCompanyName("Cisco System")
possible_companies = edgar.find_company_name("Cisco System")
```

## API
Expand All @@ -41,49 +41,38 @@ The **Company** class has two fields:
* name (company name)
* cik (company CIK number)

##### getFilingsUrl
##### get_filings_url
Returns a url to fetch filings data
* **Input**
* filingType: The type of document you want. i.e. 10-K, S-8, 8-K. If not specified, it'll return all documents
* priorTo: Time prior which documents are to be retrieved. If not specified, it'll return all documents
* filing_type: The type of document you want, e.g. 10-K, S-8, 8-K. If not specified, all document types are returned.
* prior_to: Date prior to which documents are to be retrieved. If not specified, all documents are returned.
* ownership: defaults to include. Options are include, exclude, only.
* noOfEntries: defaults to 100. Returns the number of entries to be returned. Maximum is 100.
* no_of_entries: The number of entries to return. Defaults to 100, which is also the maximum.

##### getAllFilings
##### get_all_filings
Returns the HTML in the form of [lxml.html](http://lxml.de/lxmlhtml.html)
* **Input**
* filingType: The type of document you want. i.e. 10-K, S-8, 8-K. If not specified, it'll return all documents
* priorTo: Time prior which documents are to be retrieved. If not specified, it'll return all documents
* filing_type: The type of document you want, e.g. 10-K, S-8, 8-K. If not specified, all document types are returned.
* prior_to: Date prior to which documents are to be retrieved. If not specified, all documents are returned.
* ownership: defaults to include. Options are include, exclude, only.
* noOfEntries: defaults to 100. Returns the number of entries to be returned. Maximum is 100.
* no_of_entries: The number of entries to return. Defaults to 100, which is also the maximum.

### Edgar
Gets all companies from EDGAR
##### getCikByCompanyName
##### get_cik_by_company_name
* **Input**
* name: name of the company

##### getCompanyNameByCik
##### get_company_name_by_cik
* **Input**
* cik: cik of the company

##### findCompanyName
##### find_company_name
* **Input**
* words: input words to search the company

### getDocuments
### get_documents
Returns a list of strings, each containing the body text of one of the documents linked from the input tree
* **Input**
* tree: The lxml.html tree returned by Company.get_all_filings
* noOfDocuments: number of document returned. If it is 1, the returned result is just one string, instead of a list of strings. Defaults to 1.


## Release Notes
**0.3.0**
* Added findCompanyName to Edgar

**0.2.0**
* Added Edgar

**0.1.0**
* First release
* no_of_documents: The number of documents to return. If it is 1, the result is a single string instead of a list of strings. Defaults to 1.
50 changes: 24 additions & 26 deletions edgar/company.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,59 +9,57 @@ def __init__(self, name, cik):
self.name = name
self.cik = cik

def _getFilingsUrl(self, filingType="", priorTo="", ownership="include", noOfEntries=100):
url = "https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=" + self.cik + "&type=" + filingType + "&dateb=" + priorTo + "&owner=" + ownership + "&count=" + str(noOfEntries)
def _get_filings_url(self, filing_type="", prior_to="", ownership="include", no_of_entries=100):
url = "https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=" + self.cik + "&type=" + filing_type + "&dateb=" + prior_to + "&owner=" + ownership + "&count=" + str(no_of_entries)
return url

def getAllFilings(self, filingType="", priorTo="", ownership="include", noOfEntries=100):
url = self._getFilingsUrl(filingType, priorTo, ownership, noOfEntries)
def get_all_filings(self, filing_type="", prior_to="", ownership="include", no_of_entries=100):
url = self._get_filings_url(filing_type, prior_to, ownership, no_of_entries)
page = requests.get(url)
return html.fromstring(page.content)

def get10Ks(self, noOfDocuments=1):
tree = self.getAllFilings(filingType="10-K")
elems = tree.xpath('//*[@id="documentsbutton"]')[:noOfDocuments]
def get_10Ks(self, no_of_documents=1):
tree = self.get_all_filings(filing_type="10-K")
elems = tree.xpath('//*[@id="documentsbutton"]')[:no_of_documents]
result = []
for elem in elems:
url = BASE_URL + elem.attrib["href"]
contentPage = getRequest(url)
table = contentPage.find_class("tableFile")[0]
lastRow = table.getchildren()[-1]
href = lastRow.getchildren()[2].getchildren()[0].attrib["href"]
content_page = get_request(url)
table = content_page.find_class("tableFile")[0]
last_row = table.getchildren()[-1]
href = last_row.getchildren()[2].getchildren()[0].attrib["href"]
href = BASE_URL + href
doc = getRequest(href)
doc = get_request(href)
result.append(doc)
return result

def get10K(self):
return self.get10Ks(noOfDocuments=1)[0]
def get_10K(self):
return self.get_10Ks(no_of_documents=1)[0]


def getRequest(href):
def get_request(href):
page = requests.get(href)
return html.fromstring(page.content)

def getDocuments(tree, noOfDocuments=1):
def get_documents(tree, no_of_documents=1):
BASE_URL = "https://www.sec.gov"
elems = tree.xpath('//*[@id="documentsbutton"]')[:noOfDocuments]
elems = tree.xpath('//*[@id="documentsbutton"]')[:no_of_documents]
result = []
for elem in elems:
url = BASE_URL + elem.attrib["href"]
contentPage = getRequest(url)
url = BASE_URL + contentPage.xpath('//*[@id="formDiv"]/div/table/tr[2]/td[3]/a')[0].attrib["href"]
filing = getRequest(url)
content_page = get_request(url)
url = BASE_URL + content_page.xpath('//*[@id="formDiv"]/div/table/tr[2]/td[3]/a')[0].attrib["href"]
filing = get_request(url)
result.append(filing.body.text_content())

if len(result) == 1:
return result[0]
return result

def getCIKFromCompany(companyName):
tree = getRequest("https://www.sec.gov/cgi-bin/browse-edgar?company=" + companyName)
def get_CIK_from_company(company_name):
tree = get_request("https://www.sec.gov/cgi-bin/browse-edgar?company=" + company_name)
CIKList = tree.xpath('//*[@id="seriesDiv"]/table/tr[*]/td[1]/a/text()')
namesList = []
names_list = []
for elem in tree.xpath('//*[@id="seriesDiv"]/table/tr[*]/td[2]'):
namesList.append(elem.text_content())
names_list.append(elem.text_content())
return list(zip(CIKList, names_list))


16 changes: 8 additions & 8 deletions edgar/edgar.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,21 +18,21 @@ def __init__(self):
self.all_companies_dict = dict(all_companies_array)
self.all_companies_dict_rev = dict(all_companies_array_rev)

def getCikByCompanyName(self, name):
def get_cik_by_company_name(self, name):
return self.all_companies_dict[name]

def getCompanyNameByCik(self, cik):
def get_company_name_by_cik(self, cik):
return self.all_companies_dict_rev[cik]

def findCompanyName(self, words):
possibleCompanies = []
def find_company_name(self, words):
possible_companies = []
words = words.lower()
for company in self.all_companies_dict:
if all(word in company.lower() for word in words.split(" ")):
possibleCompanies.append(company)
return possibleCompanies
possible_companies.append(company)
return possible_companies

def test():
com = Company("Oracle Corp", "0001341439")
tree = com.getAllFilings(filingType = "10-K")
return getDocuments(tree)
tree = com.get_all_filings(filing_type = "10-K")
return get_documents(tree)
8 changes: 4 additions & 4 deletions edgar/txtml.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@ def _clean_text_(cls, text):
return text.replace('\n', '')

@classmethod
def getDocumentType(cls, document):
def get_document_type(cls, document):
return document.getchildren()[0].text

@classmethod
def getHTMLFromDocument(cls, document):
def get_HTML_from_document(cls, document):
properties = {}

while document.tag != 'text':
Expand All @@ -19,12 +19,12 @@ def getHTMLFromDocument(cls, document):
return document, properties

@classmethod
def parseFull10K(cls, doc):
def parse_full_10K(cls, doc):
text = ""
for child in doc.getchildren():
if child.tag == 'sec-header':
continue
html, properties = TXTML.getHTMLFromDocument(child)
html, properties = TXTML.get_HTML_from_document(child)
if properties['type'] == '10-K':
text = text + html.text_content()
return text

0 comments on commit 865a05a

Please sign in to comment.