Parsing HTML using regular expressions

To collect structured data from the unstructured transcripts, we can use regular expressions in addition to BeautifulSoup.

They allow us to collect detailed information not only about the company and the timing of the earnings call, but also to capture who was present and to attribute statements to analysts and company representatives:

import re

from bs4 import BeautifulSoup

def parse_html(html):
    """Extract metadata, participants, and statements from a transcript page."""
    date_pattern = re.compile(r'(\d{2})-(\d{2})-(\d{2})')  # mm-dd-yy
    quarter_pattern = re.compile(r'(Q\d)')                 # e.g., Q3
    soup = BeautifulSoup(html, 'lxml')

    meta, participants, content = {}, [], []
    # company name and ticker from a headline like 'Company (TICKER) ...'
    h1 = soup.find('h1', itemprop='headline').text
    meta['company'] = h1[:h1.find('(')].strip()
    meta['symbol'] = h1[h1.find('(') + 1:h1.find(')')]

    title = soup.find('div', class_='title').text
    match = date_pattern.search(title)
    if match:
        m, d, y = match.groups()
        meta['month'] = int(m)
        meta['day'] = int(d)
        meta['year'] = int(y)

    match = quarter_pattern.search(title)
    if match:
        meta['quarter'] = match.group(0)

    # statements before the 'Question-and-Answer' header belong to the
    # presentation (qa=0); everything after belongs to the Q&A session (qa=1)
    qa = 0
    speaker_types = ['Executives', 'Analysts']
    for header in [p.parent for p in soup.find_all('strong')]:
        text = header.text.strip()
        if text.lower().startswith('copyright'):
            continue
        elif text.lower().startswith('question-and'):
            qa = 1
            continue
        elif any(speaker_type in text for speaker_type in speaker_types):
            # collect the names listed under 'Executives' or 'Analysts'
            for participant in header.find_next_siblings('p'):
                if participant.find('strong'):
                    break
                else:
                    participants.append([text, participant.text])
        else:
            # collect the paragraphs spoken by the current speaker
            p = []
            for participant in header.find_next_siblings('p'):
                if participant.find('strong'):
                    break
                else:
                    p.append(participant.text)
            content.append([header.text, qa, ' '.join(p)])
    return meta, participants, content
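
As a quick sanity check, we can feed the parser a minimal synthetic page. This is only an illustrative sketch: the markup mimics the headline and title structure the function expects, and the company, ticker, date, and quarter values are invented:

sample = """
<h1 itemprop="headline">Acme Corp (ACME) CEO on Q3 2018 Results</h1>
<div class="title">Acme Corp (ACME) Q3 2018 Results - Earnings Call Transcript 10-25-18</div>
"""
meta, participants, content = parse_html(sample)
print(meta)
# {'company': 'Acme Corp', 'symbol': 'ACME', 'month': 10, 'day': 25,
#  'year': 18, 'quarter': 'Q3'}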

We store the result in several .csv files for easy access when we use ML to process natural language:

from pathlib import Path

import pandas as pd

# base directory for the transcript data (assumed; adjust to your setup)
transcript_path = Path('transcripts')

def store_result(meta, participants, content):
    """Save the parsed results as one .csv file per data type."""
    path = transcript_path / 'parsed' / meta['symbol']
    path.mkdir(parents=True, exist_ok=True)
    pd.DataFrame(content,
                 columns=['speaker', 'q&a', 'content']).to_csv(
        path / 'content.csv', index=False)
    pd.DataFrame(participants,
                 columns=['type', 'name']).to_csv(
        path / 'participants.csv', index=False)
    pd.Series(meta).to_csv(path / 'earnings.csv')
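
To process an entire download, we can loop over the stored pages, parse each one, and persist the results. A minimal sketch, assuming the raw pages were saved as .html files under transcript_path / 'html' (this directory layout is an assumption, not prescribed by the functions above):

for html_file in (transcript_path / 'html').glob('*.html'):
    html = html_file.read_text(encoding='utf-8')
    meta, participants, content = parse_html(html)
    store_result(meta, participants, content)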

See the README in the GitHub repository for additional details and references to further resources for developing web-scraping applications.
