Scraping the Yahoo Finance Income Statement with Python

I am trying to scrape data from the income statements on Yahoo Finance using Python. Specifically, let's say I want the most recent Net Income figure for Apple.

The data is structured in a bunch of nested HTML tables. I use the `requests` module to access the page and get the HTML code.

I am using BeautifulSoup 4 to sift through the HTML structure, but I cannot figure out how to get the number.

Here is a screenshot of the analysis from Firefox.

My code is:

from bs4 import BeautifulSoup
import requests

# Annual income statement page for Apple (ticker AAPL).
myurl = "https://finance.yahoo.com/q/is?s=AAPL&annual"
html = requests.get(myurl).content
# Name the parser explicitly: without it BeautifulSoup emits a warning and
# picks whichever parser happens to be installed, so the resulting tree can
# differ from one machine to another.
soup = BeautifulSoup(html, "html.parser")

I tried to use

# Collect every <strong> tag on the page, in document order.
all_strong = soup.find_all(name="strong")

and then take the 17th element, which happens to be the one containing the figure I want, but this seems far from elegant. Something like this:

# Index 16 is the 17th <strong> tag; the figure is in the sibling that
# follows that tag's parent element.
seventeenth_strong = all_strong[16]
seventeenth_strong.parent.next_sibling
...

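Is there a more elegant way to do this with BeautifulSoup (for example, by searching for the "Net Income" label), rather than relying on a fixed position in the HTML?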

Ideally, the approach should also work for the other companies listed on Yahoo Finance.

/:

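@wilbur's answer worked, but it was not sufficient when I needed the values of any figure available in any of the financial statements (i.e. income statement, balance sheet, and cash flow statement). I ended up writing this function: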

def periodic_figure_values(soup, yahoo_figure):
    """Return the values of one financial-statement figure, one per period.

    Args:
        soup: Parsed statement page; must support ``find(name, text=...)``
            and yield elements with ``parent``, ``text`` and ``find_all``
            (e.g. a ``BeautifulSoup`` object).
        yahoo_figure: Figure name as shown on the page. It is compiled as a
            regular expression and searched for in the label text.

    Returns:
        A list of ints, one per value cell of the figure's row, each
        multiplied by 1000. A cell containing only ``-`` yields 0 and a
        value in parentheses yields a negative number.

    Raises:
        SystemExit: If no ``<strong>`` or ``<td>`` label matches the figure.
        ValueError: If a value cell does not hold a parsable integer.
    """
    # Imported locally so the function does not depend on the surrounding
    # file having imported these modules.
    import re
    import sys

    pattern = re.compile(yahoo_figure)

    title = soup.find("strong", text=pattern)    # figures printed in bold
    if title:
        label_cell = title.parent
    else:
        title = soup.find("td", text=pattern)    # any other available figure
        if not title:
            sys.exit("Invalid figure '" + yahoo_figure + "' passed.")
        label_cell = title
    row = label_cell.parent

    values = []
    for cell in row.find_all("td")[1:]:    # first <td> never holds a value
        cell_text = cell.text.strip()
        # Indented figures have a leading spacer cell, so the label cell can
        # still appear here. Skip it by identity as well as by text: the
        # label was found by regex search, so its text may only contain
        # yahoo_figure rather than equal it.
        if cell is label_cell or cell_text == yahoo_figure:
            continue
        str_value = cell_text.replace(",", "").replace("(", "-").replace(")", "")
        if str_value == "-":    # a lone dash marks an empty figure
            str_value = "0"
        # NOTE(review): the x1000 presumably reflects figures being listed
        # in thousands on the page - confirm against the page header.
        values.append(int(str_value) * 1000)

    return values

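`yahoo_figure` is a string. It has to be exactly the figure name used on Yahoo Finance. To obtain the `soup` argument, I first use the following function: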

def financials_soup(ticker_symbol, statement="is", quarterly=False):
    """Download a Yahoo Finance financial-statement page and parse it.

    Args:
        ticker_symbol: Ticker appended to the URL as the ``s`` query value.
        statement: Statement code used as the URL path segment; one of
            ``"is"``, ``"bs"`` or ``"cf"``.
        quarterly: When false (the default), ``&annual`` is appended to the
            URL; when true it is left off.

    Returns:
        A ``BeautifulSoup`` object built from the response text with the
        ``html.parser`` parser.

    Raises:
        SystemExit: If ``statement`` is not one of the accepted codes.
    """
    # Imported locally so the function does not depend on the surrounding
    # file having imported the module.
    import sys

    # Validate first; sys.exit raises SystemExit, so nothing below runs.
    if statement not in ("is", "bs", "cf"):
        sys.exit("Invalid financial statement code '" + statement + "' passed.")

    url = "https://finance.yahoo.com/q/" + statement + "?s=" + ticker_symbol
    if not quarterly:
        url += "&annual"
    # Without a timeout, requests may wait indefinitely on a stalled server.
    response = requests.get(url, timeout=30)
    return BeautifulSoup(response.text, "html.parser")

Sample usage - getting the income tax expense of Apple Inc. from the latest available income statements:

# Parse Apple's annual income statement, then print the requested figure.
apple_income_statement = financials_soup("AAPL", "is")
print(periodic_figure_values(apple_income_statement, "Income Tax Expense"))

Output: [19121000000, 13973000000, 13118000000]

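You could also read the period dates from the soup and build a dictionary with the dates as keys and the figures as values, but that would make this answer too long. So far, this has worked well for me.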

+4
1

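If you don't want to hard-code the element's position, you can search for the <strong> tag that contains the text "Net Income", walk up to its table row, and read the cells, like this: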

import re, requests
from bs4 import BeautifulSoup

# Download and parse Apple's annual income statement page.
url = 'https://finance.yahoo.com/q/is?s=AAPL&annual'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')

# Find the bold label by its text instead of by its position on the page.
pattern = re.compile('Net Income')
title = soup.find('strong', text=pattern)

# Two levels above the <strong> tag is the row whose <td> cells are read below.
label_parent = title.parent
row = label_parent.parent
cells = row.find_all('td')[1:]  # drop the first <td>, which holds 'Net Income'

# Keep the figures as stripped strings, in cell order.
values = []
for c in cells:
    values.append(c.text.strip())

This gives you `values`, a list of strings (of course, you could convert them to ints - just remove the "," first):

In [10]: values
Out[10]: [u'53,394,000', u'39,510,000', u'37,037,000']

This works for Google (GOOG) too - its income statement page has the same layout (https://finance.yahoo.com/q/is?s=GOOG&annual), and likewise for Facebook (FB) (https://finance.yahoo.com/q/is?s=FB&annual).

To generalize the script, build the URL from the ticker symbol, like this:

ticker_symbol = 'AAPL' # or 'FB' or any other ticker symbol
# Substitute the ticker into the annual income statement URL.
url = 'https://finance.yahoo.com/q/is?s={}&annual'.format(ticker_symbol)
+3

Source: https://habr.com/ru/post/1629027/


All Articles