[QF] Building a Simple Financial Statements Python WebScraper Using YahooFinance
[Detailed Explanation Pending]
Final Product Class
Below is all of the code, compiled into a single class called YFinanceEquityFundamentalDataSource:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import urllib.request as ur
class YFinanceEquityFundamentalDataSource:
    """Scrape a ticker's financial statements from Yahoo Finance pages.

    Each statement property downloads the corresponding page, collects the
    text of its ``<div>`` elements and reshapes it into a ``DataFrame``
    indexed by line-item label (index name ``"Annual"``), with one column
    per reporting date and an extra ``"TTM"`` column when the scraped text
    contains a ``"ttm"`` entry.

    Note: the statements are not cached; every property access performs a
    fresh HTTP request.
    """

    def __init__(self, ticker: str, *args, **kwargs):
        """Store the ticker and build the three statement URLs.

        Args:
            ticker: Symbol inserted into the Yahoo Finance URLs.
            *args: Accepted for call compatibility; not used.
            **kwargs: Accepted for call compatibility; not used.
        """
        # URL link
        self._ticker = ticker
        base = f"https://finance.yahoo.com/quote/{ticker}"
        self.url_is = f"{base}/financials?p={ticker}"
        self.url_bs = f"{base}/balance-sheet?p={ticker}"
        self.url_cf = f"{base}/cash-flow?p={ticker}"

    #---| Properties
    @property
    def income_statement(self):
        """Income statement scraped from ``self.url_is`` as a DataFrame."""
        return self._extract_and_clean(self._read_div_ls(self.url_is))

    @property
    def balance_sheet(self):
        """Balance sheet scraped from ``self.url_bs`` as a DataFrame."""
        return self._extract_and_clean(self._read_div_ls(self.url_bs))

    @property
    def cash_flow_statement(self):
        """Cash-flow statement scraped from ``self.url_cf`` as a DataFrame."""
        return self._extract_and_clean(self._read_div_ls(self.url_cf))

    #---| Back-end
    def _read_div_ls(self, url):
        """Download ``url`` and return the truthy ``.string`` of each ``<div>``.

        Args:
            url: Page to fetch.

        Returns:
            List of the ``.string`` values of all ``<div>`` elements, in
            document order, with ``None`` and empty values removed.
        """
        # INSPIRATION:
        # https://towardsdatascience.com/web-scraping-for-accounting-analysis-using-python-part-1-b5fc016a1c9a
        # Use a context manager so the HTTP response is always closed.
        with ur.urlopen(url) as response:
            read_data = response.read()
        soup = BeautifulSoup(read_data, 'lxml')
        raw_ls = [div.string for div in soup.find_all('div')]
        # To exclude certain labels, filter them out of raw_ls here, e.g.
        # ('Operating Expenses', 'Non-recurring Events').
        # Drop falsy entries (None / empty strings).
        return [text for text in raw_ls if text]

    #---| Check if string can be converted to pd.Timestamp
    def _date_convertible(self, string: str) -> bool:
        """Return True if ``string`` contains "/" and parses as a Timestamp."""
        if "/" not in string:
            return False
        try:
            pd.Timestamp(string)
        except (ValueError, TypeError, OverflowError):
            # Unparsable text; anything else (e.g. KeyboardInterrupt) is
            # deliberately no longer swallowed.
            return False
        return True

    #---| Check if string can be converted to numeric value
    def _is_numeric(self, string: str) -> bool:
        """Return True for "-" or for text that ``float`` accepts.

        Thousands separators (",") are tolerated: the text is retried with
        every comma removed when the direct conversion fails.
        """
        if string == "-":
            # "-" is the placeholder for a missing value; it counts as a
            # numeric cell (and is later mapped to NaN).
            return True
        try:
            float(string)
            return True
        except (TypeError, ValueError, OverflowError):
            pass
        try:
            float(string.replace(",", ""))
        except (AttributeError, TypeError, ValueError, OverflowError):
            return False
        return True

    #----| Clean list of data (primarily for numeric checking)
    def _clean_data_piece(self, data_piece: list) -> list:
        """Normalise one row of cells.

        "-" becomes ``np.nan``, numeric text loses its commas, and any other
        value is kept as its ``str`` form.
        """
        cleaned = []
        for cell in data_piece:
            text = str(cell)
            if text == "-":
                cleaned.append(np.nan)
            elif self._is_numeric(text):
                cleaned.append(text.replace(",", ""))
            else:
                cleaned.append(text)
        return cleaned

    #---| ETL
    def _extract_and_clean(self, ls):
        """Turn the flat list of scraped strings into a statement DataFrame.

        The date-like entries of ``ls`` become the column headers.  Scanning
        starts right after the first occurrence of the last date header; every
        full-width window made of a non-numeric label followed only by
        numeric cells is taken as one table row.

        Args:
            ls: Flat list of strings as produced by ``_read_div_ls``.

        Returns:
            DataFrame indexed by label (index name ``"Annual"``) whose
            columns are ``"TTM"`` (only if ``'ttm'`` is in ``ls``) followed
            by the parsed dates; all columns are converted with
            ``pd.to_numeric``.

        Raises:
            IndexError: If ``ls`` contains no date-like entry.  The type is
                kept from the original failure mode so existing callers that
                catch it keep working.
        """
        date_labels = [i for i in ls if self._date_convertible(str(i))]
        if not date_labels:
            raise IndexError("no date column headers found in scraped data")
        start_index = ls.index(date_labels[-1]) + 1
        dates = [pd.Timestamp(str(i)).date() for i in date_labels]

        if 'ttm' in ls:
            columns = ['Annual', 'TTM'] + dates
        else:
            columns = ['Annual'] + dates
        row_width = len(columns)  # one label cell + one cell per value column

        data = []
        # Stop early enough that every window is full width; a truncated
        # window at the end of the list must never be mistaken for a row.
        for i in range(start_index, len(ls) - row_width + 1):
            window = ls[i:i + row_width]
            if self._is_numeric(window[0]):
                continue  # a row has to start with a text label
            if not all(self._is_numeric(value) for value in window[1:]):
                continue  # every cell after the label must be numeric
            data.append(tuple(self._clean_data_piece(window)))

        df = pd.DataFrame(data, columns=columns).set_index("Annual")
        for c in df.columns:
            df[c] = pd.to_numeric(df[c])
        return df