Scrape Web Data

Terms defined: Document Object Model, visitor

Documents as Data

def main():
    """Entry point: read the HTML file named on the command line and walk its DOM."""
    options = parse_args()
    source = Path(options.filename).read_text()
    soup = BeautifulSoup(source, 'html.parser')
    visit(soup, options.noblanks)
def visit(node, noblanks, depth=0):
    """Print every node of the DOM tree, indented two spaces per level.

    Text nodes are suppressed when `noblanks` is set and the text is
    whitespace-only; element nodes are printed and then recursed into.
    """
    indent = '  ' * depth
    # Tag and NavigableString are disjoint types, so dispatch order is irrelevant.
    if isinstance(node, Tag):
        print(f'{indent}element: {node.name} with {node.attrs}')
        for child in node:
            visit(child, noblanks, depth + 1)
    elif isinstance(node, NavigableString):
        content = node.string
        if content.strip() or not noblanks:
            print(f'{indent}text: {repr(content)}')
<html>
  <h1>Page Title</h1>
  <p>paragraph</p>
</html>
python parse_page.py --filename small.html
element: [document] with {}
  element: html with {}
    text: '\n'
    element: h1 with {}
      text: 'Page Title'
    text: '\n'
    element: p with {}
      text: 'paragraph'
    text: '\n'
  text: '\n'
<html>
  <head>
    <title>Example Page</title>
  </head>
  <body>
    <h1>Page Title</h1>
    <ul class="details">
      <li>first point</li>
      <li>second point</li>
    </ul>
  </body>
</html>
python parse_page.py --filename medium.html --noblanks
element: [document] with {}
  element: html with {}
    element: head with {}
      element: title with {}
        text: 'Example Page'
    element: body with {}
      element: h1 with {}
        text: 'Page Title'
      element: ul with {'class': ['details']}
        element: li with {}
          text: 'first point'
        element: li with {}
          text: 'second point'

Fetching

def main():
    """Main driver: fetch the home page and scrape every staff page it links to.

    Follows each anchor's `href` (resolved relative to the homepage URL by
    `get_info`) and prints the resulting list of per-page dictionaries.
    """
    args = parse_args()
    homepage = get_page(args.homepage)
    # Comprehension replaces the manual append loop; pages are still fetched
    # in document order, so output is unchanged.
    result = [get_info(args, link['href']) for link in homepage.find_all('a')]
    print(result)
def get_page(url):
    """Fetch `url` over HTTP and return its body parsed as an HTML soup."""
    html = requests.get(url).text
    return BeautifulSoup(html, 'html.parser')
def get_info(args, relative):
    """Scrape one staff page: its <h1> name plus one integer count per table row.

    Each <tr> contributes an entry mapping the lower-cased <th> text to the
    integer value of the row's first <td>.
    """
    # NOTE(review): simple string join assumes `relative` never starts with '/'
    # or a scheme — confirm against the site's link format.
    page = get_page(f'{args.homepage}/{relative}')
    result = {'name': page.find('h1').string}
    for row in page.find_all('tr'):
        label = row.find('th').string.lower()
        result[label] = int(row.find('td').string)
    return result