Robust Analysis

Where can we get data?

How can we collect data from web pages?

import sys
import requests
import re
import time


# Match package URL in main index page.
RE_PACKAGE = re.compile(r'<a href="(.+?)">')

# Match release URL in package index page.
RE_RELEASE = re.compile(r'<a href=".+?">(.+?)</a>')

# PyPI domain.
DOMAIN = 'https://pypi.org'


index_response = requests.get(f'{DOMAIN}/simple/')
print('Package,Releases')
all_packages = RE_PACKAGE.findall(index_response.text)
for package in all_packages:
    name = package.strip('/').split('/')[-1]
    url = f'{DOMAIN}{package}'
    package_response = requests.get(url)
    count = len(RE_RELEASE.findall(package_response.text))
    print(f'{name},{count}')

{: title="get-index-page-initial.py"}

A Small Grumble

Ideally, the code above would morph into the code shown below so that you could see what was the same and what was different. A second-best solution would be an easy way to highlight modified lines, with an emphasis on "easy". One day...

#!/usr/bin/env python

import sys
import requests
import re
import time


# Match package URL in main index page.
RE_PACKAGE = re.compile(r'<a href="(.+?)">')

# Match release URL in package index page.
RE_RELEASE = re.compile(r'<a href=".+?">(.+?)</a>')

# PyPI domain.
DOMAIN = 'https://pypi.org'


# Get main index page.
index_response = requests.get(f'{DOMAIN}/simple/')
assert index_response.status_code == 200, \
    f'Unexpected status for index page {index_response.status_code}'

# Get sub-pages and count releases in each.
num_seen = 0
t_start = time.time()
print('Package,Releases')
all_packages = RE_PACKAGE.findall(index_response.text)
for package in all_packages:
    name = package.strip('/').split('/')[-1]
    url = f'{DOMAIN}{package}'
    package_response = requests.get(url)
    if package_response.status_code != 200:
        print(f'Unexpected status for package page {url}: {package_response.status_code}',
              file=sys.stderr)
        count = 'NA'
    else:
        count = len(RE_RELEASE.findall(package_response.text))
    print(f'{name},{count}')
    num_seen += 1
    if (num_seen % 10) == 0:
        t_elapsed = time.time() - t_start
        t_expected = len(all_packages) * t_elapsed / num_seen
        print(f'{num_seen} @ {t_elapsed:.1f} / {len(all_packages)} @ {t_expected:.1f}',
              file=sys.stderr)

{: title="get-index-page.py"}

How can we report results?

import sys
import pandas as pd

datafile = sys.argv[1]
packages = pd.read_csv(datafile)
print(packages.agg(['mean', 'median', 'var', 'std', 'min', 'max']))

{: title="version-statistics.py"}

python version-statistics.py release-count.csv
            Releases
mean       11.236351
median      4.000000
var      2501.398099
std        50.013979
min         0.000000
max     10797.000000

How can we display the results?

import sys
import pandas as pd
import plotly.express as px

datafile = sys.argv[1]
packages = pd.read_csv(datafile)

print('Distribution of Releases')
print(packages.groupby('Releases').count())
print(f'{packages["Releases"].isna().sum()} missing values')

fig = px.histogram(packages, x='Releases', nbins=100, log_y=True, width=600, height=400)
fig.show()
fig.write_image('figures/release-count.svg')

{: title="version-histogram.py"}

python version-histogram.py release-count.csv
Distribution of Releases
          Package
Releases         
0.0         11721
1.0         33992
2.0         32829
3.0         14999
4.0         18339
5.0          8946
...
4505.0          1
5133.0          1
6460.0          1
10797.0         1

[561 rows x 1 columns]
14 missing values

{% include figure id="release-count" img="figures/release-count.svg" alt="FIXME" cap="Release Count" title="Histogram showing steeply declining curve from X equals 0 to X above 10,000 with peak over 200,000 at X equals 0." %}

slice = packages[packages['Releases'] < 100]
fig = px.histogram(slice, x='Releases', nbins=100, log_y=True, width=600, height=400)
fig.show()
fig.write_image('figures/release-count-low.svg')

{: title="version-histogram.py"}

{% include figure id="release-count-low" img="figures/release-count-low.svg" cap="Release Count (Low End)" alt="FIXME" title="Histogram showing smoothly declining curve from X equals 0 to X equals 100 with over 10,000 values at X equals zero and alternating high and low bars." %}

import sys
import pandas as pd
import plotly.express as px

datafile = sys.argv[1]
packages = pd.read_csv(datafile)
slice = packages[packages['Releases'] < 100]
fig = px.violin(slice, y='Releases', width=600, height=400)
fig.show()
fig.write_image('figures/release-count-violin.svg')

{: title="version-other-plots.py"}

{% include figure id="release-count-violin" img="figures/release-count-violin.svg" cap="Violin Plot (Low End)" alt="FIXME" title="Symmetric vertical violin plot with almost all values in bulge at bottom end." %}

fig = px.box(slice, y='Releases', width=600, height=400)
fig.show()
fig.write_image('figures/release-count-box.svg')

{: title="version-other-plots.py"}

{% include figure id="release-count-box" img="figures/release-count-box.svg" cap="Box-and-Whisker Plot (Low End)" alt="FIXME" title="Symmetric vertical box plot with almost all values at low end." %}

How should we structure data analysis projects?

Main driver

def main():
    '''
    Main driver.
    '''
    args = parse_args()
    all_packages = get_package_list(args)
    progress = initialize_progress(args, all_packages)
    writer = csv.writer(sys.stdout, lineterminator='\n')
    writer.writerow(['Package', 'Release'])
    for package in all_packages:
        get_package_info(package, writer)
        report_progress(progress)

# ...

if __name__ == '__main__':
    main()

{: title="bin/get-all-versions.py"}

Parse command-line arguments

def parse_args():
    '''
    Parse command-line arguments.
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument('--restart', type=str, help='restart from this package')
    parser.add_argument('--verbose', action='store_true', help='show progress')
    return parser.parse_args()

{: title="bin/get-all-versons.py"}

Get the main package list

# Match package URL in main index page.
RE_PACKAGE = re.compile(r'<a href="(.+?)">')

# ...

def get_package_list(args):
    '''
    Get main package listing page and extract values.
    '''
    response = requests.get(f'{BASE_URL}')
    assert response.status_code == 200, \
        f'Unexpected status for index page {response.status_code}'
    all_packages = RE_PACKAGE.findall(response.text)
    all_packages = [p.strip('/').split('/')[-1] for p in all_packages]
    if (args.restart):
        # list.index raises ValueError instead of returning -1,
        # so check membership before looking up the position.
        if (args.restart not in all_packages):
            print(f'Unable to find {args.restart} in package list',
                  file=sys.stderr)
            sys.exit(1)
        start = all_packages.index(args.restart)
        all_packages = all_packages[start:]
    return all_packages

{: title="bin/get-all-versons.py"}

Get information about a single package

def get_package_info(name, writer):
    '''
    Get and print information about a specific package.
    '''
    url = f'{BASE_URL}{name}'
    response = requests.get(url)
    if response.status_code != 200:
        print(f'Unexpected status for package page {url}: {response.status_code}',
              file=sys.stderr)
        writer.writerow([name, 'NA'])
    else:
        all_releases = RE_RELEASE.findall(response.text)
        for release in all_releases:
            writer.writerow([name, release])

{: title="bin/get-all-versions.py"}

Report progress

def initialize_progress(args, packages):
    '''
    Initialize record keeping for progress monitoring.
    '''
    return {
        'report': args.verbose,
        'start': time.time(),
        'expected': len(packages),
        'seen': 0
    }


def report_progress(progress):
    '''
    Report progress, updating the progress record as a side effect.
    '''
    if (not progress['report']):
        return
    progress['seen'] += 1
    if (progress['seen'] % 10) == 0:
        elapsed = time.time() - progress['start']
        duration = progress['expected'] * elapsed / progress['seen']
        print(f"{progress['seen']} @ {elapsed:.1f} / {progress['expected']} @ {duration:.1f}",
              file=sys.stderr)

{: title="bin/get-all-versions.py"}

What's missing

What should go where?

Noble's Rules are a way to organize small data analysis projects. Each one lives in a separate Git repository whose subdirectories are organized by purpose: source code goes in bin or src, raw data goes in data, generated results go in results, and documentation goes in doc.

Documentation is often generated from source code, and it's usually a bad idea to mix handwritten and machine-generated files, so most projects now use separate subdirectories for software documentation (doc) and reports (reports or papers). People also often create a figures directory to hold generated figures rather than putting them in results.

The directories in the top level of each project are organized by purpose, but the directories within data and results are organized chronologically so that it's easy to see when data was gathered and when results were generated. These directories all have names in ISO date format like YYYY-MM-DD to make it easy to sort them chronologically. This naming is particularly helpful when data and results are used in several reports.
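
Because ISO dates sort correctly as plain strings, an ordinary sort puts such directories in chronological order. Here is a minimal sketch (the directory names are hypothetical):

# Hypothetical results directories named by creation date.
runs = ['results/2021-03-15', 'results/2020-11-02', 'results/2021-01-30']
print(sorted(runs))  # a plain string sort yields chronological order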

At all levels, filenames should be easy to match with simple patterns. For example, a project might use species_organ_treatment.csv as a file-naming convention, giving filenames like human_kidney_cm200.csv. This allows human_*_cm200.csv to match all human organs or *_kidney_*.csv to match all kidney data. It does produce long filenames, but tab completion means you only have to type them once. Long filenames are just as easy to match in programs: Python's [glob][glob] module will take a pattern and return a list of matching filenames.
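
For example, here is a minimal sketch of matching those patterns with glob (the data files are hypothetical):

from glob import glob

# Hypothetical data files following the species_organ_treatment.csv convention.
all_human_organs = sorted(glob('human_*_cm200.csv'))
all_kidney_data = sorted(glob('*_kidney_*.csv'))
print(all_human_organs)
print(all_kidney_data)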

How should we keep track of our data?

The last point is often the hardest for people to implement, since many researchers have never seen a well-documented dataset. The data manifest for [Diehm2018] is a good example; each dataset is described by an entry like this:

| Header | Datatype | Description |
| ------ | -------- | ----------- |
| brand | text | The full brand name. |
| style | text | The cut of each pair of jeans (in our analysis, we combined straight and boot-cut styles and skinny and slim styles, but these remain separated here). |
| menWomen | text | Whether the jeans were listed as "men's" or "women's". |
| name | text | The name of the specific style of measured pair of jeans as indicated by the tag (e.g., Fave Super Skinny Jean). |
| brandSize | text | The size of jeans we measured. Each size reflects the sizing for each brand closest to a 32-inch waistband as indicated by the brand's website. |
| waistSize | number | The waistband size (in inches) of each measured pair as reported on the brand's website. |
| ... | ... | ... |
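
A manifest like this can also be checked mechanically. The sketch below, which assumes hypothetical files called manifest.csv and measurements.csv, compares the columns documented in the manifest against the columns actually present in the data:

import pandas as pd

# Hypothetical filenames: the manifest describes every column of the data file.
manifest = pd.read_csv('manifest.csv')    # columns: Header, Datatype, Description
data = pd.read_csv('measurements.csv')

documented = set(manifest['Header'])
actual = set(data.columns)
print('Undocumented columns:', sorted(actual - documented))
print('Documented but missing:', sorted(documented - actual))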

FAIR play

The [FAIR Principles][fair-principles] [Brock2019] state that data should be findable, accessible, interoperable, and reusable. They are still aspirational for most researchers, but they tell us what to aim for.

How should we keep track of our workflow?

It's easy to run one program to process a single data file, but what happens when our analysis depends on many files, or when we need to re-do the analysis every time new data arrives? What should we do if the analysis has several steps that we have to do in a particular order?

If we try to keep track of this ourselves, we will inevitably forget some crucial steps, and it will be hard for other people to pick up our work. Instead, we should use a build tool to keep track of what depends on what and run our analysis programs automatically. These tools were invented to help programmers rebuild complex software, but can be used to automate any workflow.

Make

The first widely used build tool, Make, was written in 1976. Programmers have created many replacements for it in the decades since then; so many, in fact, that none have attracted enough users to displace it entirely.

We will use Snakemake, a build tool written in Python. When Snakemake runs, it reads build rules from a file called Snakefile. (It can be called other things, but that's the convention.) Each rule explains how to update a target if it is out of date compared to any of its prerequisites. Here's a rule to create the compressed data file data/all-versions.csv.gz; since it lists no prerequisites, Snakemake only runs it when that file doesn't already exist:

rule all_versions:
    '''Download all version info from PyPI - takes several hours.'''
    output:
        protected('data/all-versions.csv.gz')
    shell:
        '''
        python bin/get-all-versions.py > data/all-versions.csv
        gzip data/all-versions.csv
        '''

Compressing Data

Compressing this dataset takes the file from 103.8 Mbyte to 9.5 Mbyte, a reduction of almost a factor of 11. However, version control can only diff and merge plain text files, so if the file is compressed, Git can't help us track changes to individual lines. On the other hand, we probably shouldn't be changing a dataset anyway...

We can run this rule from the command line:

snakemake -j 1 all_versions

Since that will take several hours to complete, let's add another rule to the same file:

rule releases_per_package:
    '''How many releases are there for each package?'''
    input:
        'data/all-versions.csv.gz'
    output:
        'results/releases.csv'
    shell:
        'python bin/count-releases.py {input} > {output}'

count-releases.py is only a few lines long. It takes advantage of the fact that if we give Pandas' read_csv function a string instead of a stream it assumes that parameter is a filename, and that it can read directly from compressed files (which it identifies by looking for common endings like .zip or .gz).

#!/usr/bin/env python

'''
Count how many releases there are per package.
'''

import sys
import pandas as pd

def main():
    '''
    Main driver.
    '''
    data = pd.read_csv(sys.argv[1])
    result = data.groupby('Package').Release.nunique()
    result.to_csv(sys.stdout, header=True)


if __name__ == '__main__':
    main()

{: title="bin/count-releases.py"}

If we run:

snakemake -j1 releases_per_package

and wait a few seconds, results/releases.csv starts like this:

Package,Release
0,1
0-0-1,1
0-core-client,9
0-orchestrator,14
00print-lol,2
01d61084-d29e-11e9-96d1-7c5cf84ffe8e,2
021,1
...

Creating a Snakefile may seem like extra work, but few things in life are as satisfying as running one command and watching an entire multi-step analysis run itself:

  1. It reduces errors, since we only have to type commands correctly once instead of over and over again.

  2. More importantly, it documents our workflow so that someone else (including our future self) can see exactly what steps we used in what order.

How can we remove redundant releases?

The releases we have collected are actually filenames, so the same version of a package can appear several times with different suffixes. To see what those suffixes look like, bin/components.py splits each release name on '.', counts how often each non-numeric component occurs, and sorts the counts in descending order:

#!/usr/bin/env python

import sys
import pandas as pd
from collections import Counter

def main():
    '''
    Count frequency of '.'-separated components of names.
    '''
    data = pd.read_csv(sys.argv[1])
    data = data['Release'].str.split('.', expand=True)
    data = data.values.flatten()
    data = Counter(data)
    del data[None]
    data = pd.DataFrame.from_dict(data, orient='index').reset_index()
    data = data.rename(columns={'index': 'Component', 0: 'Count'})
    data = data[~ data['Component'].str.match(r'^\d+$', na=False)]
    data = data.sort_values(by='Count', ascending=False)
    data.to_csv(sys.stdout, header=True, index=False)


if __name__ == '__main__':
    main()

{: title="bin/components.py"}

As before, a rule in the Snakefile runs this script and saves its output:

rule count_name_components:
    '''How often does each component of a dotted name occur?'''
    input:
        'data/all-versions.csv.gz'
    output:
        'results/name-component-count.csv'
    shell:
        'python bin/components.py {input} > {output}'

The most common components are:

Component,Count
tar,1298365
gz,1294773
whl,833181
py3-none-any,219230
egg,83174
zip,79232
0-py2,56429
0-py3-none-any,53955
1-py3-none-any,46384
1-py2,42507
2-py3-none-any,29974
2-py2,27282
3-py3-none-any,21383
3-py2,19175
exe,17161
0-py2-none-any,16079
4-py3-none-any,15782
4-py2,14105
5-py3-none-any,12791
1-py2-none-any,11683
5-py2,11233
...

We can also add a rule that depends on all of the files we want, so that a single command rebuilds everything:

rule all:
    '''Dummy rule to rebuild everything.'''
    input:
        ['results/releases.csv', 'results/name-component-count.csv']

Many of these components are archive suffixes, which means the same release can be listed several times with different file types. bin/remove-duplicates.py strips the common suffixes (.tar.gz, .tar, .whl, .egg, .zip, and .exe) from each release name and counts how many packages have fewer distinct releases afterward:

def main():
    '''
    Count releases before and after de-duplication.
    '''
    data = pd.read_csv(sys.argv[1])
    num_packages = len(data)
    data = data.assign(Stripped=data['Release'].str.replace(r'\.(tar\.gz|tar|whl|egg|zip|exe)$', '', regex=True))
    result = pd.DataFrame({'Complete': data.groupby('Package').Release.nunique(),
                           'Stripped': data.groupby('Package').Stripped.nunique()})
    num_shorter = len(result[result.Complete > result.Stripped])
    print(f'{num_shorter} / {num_packages} ({(100 * num_shorter / num_packages):6.2}%) shorter')

{: title="bin/remove-duplicates.py"}

One more rule runs this check:

rule count_redundant_releases:
    '''How many duplicated (redundant) releases are there?'''
    input:
        'data/all-versions.csv.gz'
    shell:
        'python bin/remove-duplicates.py {input}'

It reports:

2255 / 2312545 ( 0.098%) shorter

Summary