Parsing HTML using regular expressions

To collect structured data from the unstructured transcripts, we can use regular expressions in addition to BeautifulSoup.

They allow us to collect detailed information not only about the company and the timing of the earnings call, but also to capture who was present and to attribute statements to analysts and company representatives:

import re

from bs4 import BeautifulSoup

def parse_html(html):
    """Extract metadata, participants, and statements from a transcript page."""
    date_pattern = re.compile(r'(\d{2})-(\d{2})-(\d{2})')  # mm-dd-yy
    quarter_pattern = re.compile(r'(Q\d)')                 # e.g., Q3
    soup = BeautifulSoup(html, 'lxml')

    meta, participants, content = {}, [], []
    # company name and ticker from a headline like 'Company (TICKER) ...'
    h1 = soup.find('h1', itemprop='headline').text
    meta['company'] = h1[:h1.find('(')].strip()
    meta['symbol'] = h1[h1.find('(') + 1:h1.find(')')]

    title = soup.find('div', class_='title').text
    match = date_pattern.search(title)
    if match:
        m, d, y = match.groups()
        meta['month'] = int(m)
        meta['day'] = int(d)
        meta['year'] = int(y)

    match = quarter_pattern.search(title)
    if match:
        meta['quarter'] = match.group(0)

    # statements before the 'Question-and-Answer' header belong to the
    # presentation (qa=0); everything after belongs to the Q&A session (qa=1)
    qa = 0
    speaker_types = ['Executives', 'Analysts']
    for header in [p.parent for p in soup.find_all('strong')]:
        text = header.text.strip()
        if text.lower().startswith('copyright'):
            continue
        elif text.lower().startswith('question-and'):
            qa = 1
            continue
        elif any(speaker_type in text for speaker_type in speaker_types):
            # collect the names listed under 'Executives' or 'Analysts'
            for participant in header.find_next_siblings('p'):
                if participant.find('strong'):
                    break
                else:
                    participants.append([text, participant.text])
        else:
            # collect the paragraphs spoken by the current speaker
            p = []
            for participant in header.find_next_siblings('p'):
                if participant.find('strong'):
                    break
                else:
                    p.append(participant.text)
            content.append([header.text, qa, ' '.join(p)])
    return meta, participants, content
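
As a quick sanity check, we can feed the parser a minimal synthetic page. This is only an illustrative sketch: the markup mimics the headline and title structure the function expects, and the company, ticker, date, and quarter values are invented:

sample = """
<h1 itemprop="headline">Acme Corp (ACME) CEO on Q3 2018 Results</h1>
<div class="title">Acme Corp (ACME) Q3 2018 Results - Earnings Call Transcript 10-25-18</div>
"""
meta, participants, content = parse_html(sample)
print(meta)
# {'company': 'Acme Corp', 'symbol': 'ACME', 'month': 10, 'day': 25,
#  'year': 18, 'quarter': 'Q3'}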

We store the result in several .csv files for easy access when we use ML to process natural language:

from pathlib import Path

import pandas as pd

# base directory for the transcript data (assumed; adjust to your setup)
transcript_path = Path('transcripts')

def store_result(meta, participants, content):
    """Save the parsed results as one .csv file per data type."""
    path = transcript_path / 'parsed' / meta['symbol']
    path.mkdir(parents=True, exist_ok=True)
    pd.DataFrame(content,
                 columns=['speaker', 'q&a', 'content']).to_csv(
        path / 'content.csv', index=False)
    pd.DataFrame(participants,
                 columns=['type', 'name']).to_csv(
        path / 'participants.csv', index=False)
    pd.Series(meta).to_csv(path / 'earnings.csv')
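
To process an entire download, we can loop over the stored pages, parse each one, and persist the results. A minimal sketch, assuming the raw pages were saved as .html files under transcript_path / 'html' (this directory layout is an assumption, not prescribed by the functions above):

for html_file in (transcript_path / 'html').glob('*.html'):
    html = html_file.read_text(encoding='utf-8')
    meta, participants, content = parse_html(html)
    store_result(meta, participants, content)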

See the README in the GitHub repository for additional details and references to further resources for developing web-scraping applications.
