Hypothesis Testing

How can we get data on file sizes?

#!/usr/bin/env python

import argparse
import csv
import os
import sys
from collections import Counter

def main():
    '''
    Main driver.
    '''
    args = parse_args()
    counts = lengths(args)
    report_filenames(args.output, counts)
    report_counts(args.output, counts)

{: title="bin/file-size.py"}

def parse_args():
    '''
    Handle command-line arguments.
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument('--root', type=str, help='root directory')
    parser.add_argument('--ext', type=str, help='extension')
    parser.add_argument('--output', type=str, help='stem of output files')
    return parser.parse_args()

{: title="bin/file-size.py"}

def lengths(args):
    '''
    Find files and count line lengths.
    '''
    counts = {}
    for (curr_dir, sub_dirs, files) in os.walk(args.root):
        for filename in [x for x in files if x.endswith(args.ext)]:
            path = os.path.join(curr_dir, filename)
            with open(path, 'r') as reader:
                try:
                    counts[path] = Counter()
                    for x in reader.readlines():
                        counts[path][len(x)] += 1
                except Exception as e:
                    print(f'Failed to read {path}: {e}',
                          file=sys.stderr)
                    counts[path] = None
    return counts

{: title="bin/file-size.py"}

def report_filenames(output_stem, counts):
    '''
    Report filename-to-ID as CSV with 'NA' for errored files.
    '''
    with open(f'{output_stem}-filenames.csv', 'w') as writer:
        writer = csv.writer(writer, lineterminator='\n')
        writer.writerow(['Filename', 'FileID'])
        for (file_id, filename) in enumerate(sorted(counts.keys())):
            writer.writerow([filename, file_id if counts[filename] is not None else 'NA'])

{: title="bin/file-size.py"}

def report_counts(output_stem, counts):
    '''
    Report file ID-to-count as CSV for non-errored files.
    '''
    with open(f'{output_stem}-counts.csv', 'w') as writer:
        writer = csv.writer(writer, lineterminator='\n')
        writer.writerow(['FileID', 'Length', 'Count'])
        for (file_id, filename) in enumerate(sorted(counts.keys())):
            if counts[filename] is not None:
                for (length, freq) in counts[filename].most_common():
                    writer.writerow([file_id, length, freq])

{: title="bin/file-size.py"}
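We run this script once per language to build the datasets used below. The error log that follows suggests the Python data came from scanning an Anaconda installation, so the command was presumably something like `python bin/file-size.py --root /anaconda3 --ext .py --output data/python`, with an analogous run over a tree of `.js` files for JavaScript; whatever roots are used, the output stems have to match the `data/{lang}-counts.csv` paths expected later.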

Failed to read /anaconda3/pkgs/xlwt-1.3.0-py37_0/lib/python3.7/site-packages/xlwt/BIFFRecords.py: 'utf-8' codec can't decode byte 0x93 in position 68384: invalid start byte
Failed to read /anaconda3/pkgs/xlwt-1.3.0-py37_0/lib/python3.7/site-packages/xlwt/UnicodeUtils.py: 'utf-8' codec can't decode byte 0xb7 in position 1950: invalid start byte
Failed to read /anaconda3/pkgs/pylint-2.3.1-py37_0/lib/python3.7/site-packages/pylint/test/functional/implicit_str_concat_in_sequence_latin1.py: 'utf-8' codec can't decode byte 0xe9 in position 96: invalid continuation byte
Failed to read /anaconda3/pkgs/joblib-0.13.2-py37_0/lib/python3.7/site-packages/joblib/test/test_func_inspect_special_encoding.py: 'utf-8' codec can't decode byte 0xa4 in position 64: invalid start byte
...

{: title="log/file-size-python-errors.txt"}
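These failures all come from files that are not valid UTF-8. Recording `None` for them and moving on is good enough for this analysis; if we wanted to keep them, one option (not used here) would be to ask Python to substitute a marker character for undecodable bytes instead of raising an exception:

```python
# Hypothetical variant of the loop body in lengths(): errors='replace' turns
# undecodable bytes into U+FFFD instead of raising UnicodeDecodeError, at the
# cost of slightly distorting line lengths in the affected files.
with open(path, 'r', encoding='utf-8', errors='replace') as reader:
    counts[path] = Counter()
    for line in reader:
        counts[path][len(line)] += 1
```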

LANGS = ['javascript', 'python']
KINDS = ['all', 'trimmed']

rule all:
    input:
        expand('figures/{lang}-counts-{kind}.svg', lang=LANGS, kind=KINDS)

rule count_all:
    input:
        'data/{lang}-counts.csv'
    output:
        'figures/{lang}-counts-all.svg'
    shell:
        'python bin/length-frequency-plot.py --data {input} --fig {output} --logx'

rule count_trimmed:
    input:
        'data/{lang}-counts.csv'
    output:
        'figures/{lang}-counts-trimmed.svg'
    shell:
        'python bin/length-frequency-plot.py --data {input} --fig {output} --low 2 --high 200'

{: title="Snakefile"}
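The plotting script is not shown in this chapter, but the Snakefile pins down its interface: it must accept `--data`, `--fig`, `--logx`, `--low`, and `--high`. A minimal sketch that satisfies that interface, using pandas and matplotlib, might look like this (an illustration, not the script that produced the figures below):

```python
#!/usr/bin/env python

'''
Sketch of a length-frequency plotting script matching the interface
assumed by the Snakefile above.
'''

import argparse
import pandas as pd
import matplotlib.pyplot as plt


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data', type=str, help='CSV of FileID,Length,Count')
    parser.add_argument('--fig', type=str, help='output figure file')
    parser.add_argument('--logx', action='store_true', help='log-scale the X axis')
    parser.add_argument('--low', type=int, default=None, help='lowest length to keep')
    parser.add_argument('--high', type=int, default=None, help='highest length to keep')
    args = parser.parse_args()

    # read the counts and trim to the requested range of lengths (if any)
    data = pd.read_csv(args.data)
    if args.low is not None:
        data = data[data.Length >= args.low]
    if args.high is not None:
        data = data[data.Length <= args.high]

    # total counts across all files for each line length
    freq = data.groupby('Length').Count.sum()

    # scatter plot with a logarithmic Y axis (and optionally X as well)
    fig, ax = plt.subplots()
    ax.scatter(freq.index, freq.values, s=4)
    ax.set_xlabel('Length')
    ax.set_ylabel('Count')
    ax.set_yscale('log')
    if args.logx:
        ax.set_xscale('log')
    fig.savefig(args.fig)


if __name__ == '__main__':
    main()
```

With something like this in place, `snakemake --cores 1` rebuilds all four figures from the two CSV files.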

{% include figure id="javascript-counts-all" img="figures/javascript-counts-all.svg" cap="Frequency of Line Lengths (JavaScript, All)" alt="FIXME" title="Log-log scatter plot with a point at Y equals 200,000 and X equals 1 and then a noticeable decline for X above 80." %}

{% include figure id="python-counts-all" img="figures/python-counts-all.svg" cap="Frequency of Line Lengths (Python, All)" alt="FIXME" title="Log-log scatter plot with a point at Y equals 2,000,000 at X equals 1 and then a sharp decline for X above 80." %}

{% include figure id="javascript-counts-trimmed" img="figures/javascript-counts-trimmed.svg" cap="Frequency of Line Lengths (JavaScript, Trimmed)" alt="FIXME" title="Log-linear plot showing Y approximately 20,000 for X up to 50 and steady decline thereafter." %}

{% include figure id="python-counts-trimmed" img="figures/python-counts-trimmed.svg" cap="Frequency of Line Lengths (Python, Trimmed)" alt="FIXME" title="Log-linear plot showing Y approximately 20,000 for X below 10, Y between 100,000 and 200,000 for X up to 80, and a very sharp decline thereafter." %}

How can we tell if two populations are different?

Kick the ball, then move the goalposts

\(p\) values can be misused in several ways. The most obvious is to choose the significance threshold after the fact in order to get a significant result: if you ever see reports that mix several different \(p\) values or use odd numbers like 0.073, this is probably what's going on.

The second form of abuse, called \(p\) hacking, is to re-analyze the data over and over until a "significant" result emerges. Consider: if the odds of getting a false positive from one analysis are 0.05, then the odds of getting a true negative are 0.95. The odds of getting two true negatives in a row are therefore \(0.95^2\), or 0.9025, and by the time we have done 14 analyses the odds that none of them has produced a false positive are roughly 50/50 (\(0.95^{14} \approx 0.49\)). One sign that people are \(p\) hacking is that they find niche results like, "This treatment was effective for left-handed non-smokers between the ages of 45 and 55." The best way to safeguard against \(p\) hacking is to pre-register studies, i.e., to declare before collecting data what analyses are going to be done and how.
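The arithmetic is easy to check with a throwaway calculation (a sanity check, not part of the analysis pipeline):

```python
# probability that n independent analyses of pure noise all avoid a false
# positive at the 0.05 threshold
for n in (1, 2, 13, 14):
    print(n, 0.95 ** n)
```

The probability of escaping without a single false positive drops below one half between the 13th and 14th analysis.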

#!/usr/bin/env python

import sys
import argparse
import numpy as np
import pandas as pd


def main():
    '''
    Sample two datasets to calculate odds of means being distant.
    '''
    # ...parse arguments...
    # ...read data and calculate actual means and difference...
    # ...repeatedly sample and calculate difference...
    # ...report...

{: title="bin/simulate.py"}

    # parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--left', type=str, help='first dataset')
    parser.add_argument('--right', type=str, help='second dataset')
    parser.add_argument('--low', type=int, help='lower limit')
    parser.add_argument('--high', type=int, help='upper limit')
    parser.add_argument('--trials', type=int, help='number of trials (>0)')
    parser.add_argument('--seed', type=int, help='RNG seed')
    args = parser.parse_args()
    np.random.seed(args.seed)

{: title="bin/simulate.py"}

    # read data and calculate actual means and difference
    data_left = read_data(args.left, args.low, args.high)
    data_right = read_data(args.right, args.low, args.high)
    mean_left = data_left.mean()
    mean_right = data_right.mean()
    actual_diff = mean_left - mean_right

{: title="bin/simulate.py"}

def read_data(filename, low, high):
    '''
    Read data, optionally dropping lines shorter than low or longer than high.
    '''
    data = pd.read_csv(filename)
    if high is not None:
        data = data[data.Length <= high]
    if low is not None:
        data = data[data.Length >= low]
    return data.Length.repeat(repeats=data.Count)

{: title="bin/simulate.py"}
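The last line of `read_data` is the only tricky part: the CSV stores one row per (length, count) pair, and `repeat` expands that back into one entry per original line, so that means (and the shuffles below) are taken over the underlying observations rather than over the distinct lengths. A small made-up example shows the difference:

```python
import pandas as pd

# made-up data: three lines of length 2 and one line of length 10
data = pd.DataFrame({'Length': [2, 10], 'Count': [3, 1]})
expanded = data.Length.repeat(repeats=data.Count)
print(list(expanded))   # [2, 2, 2, 10]
print(expanded.mean())  # 4.0, not the misleading (2 + 10) / 2 = 6.0
```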

    # repeatedly sample and calculate difference
    # pd.concat rather than Series.append, which newer versions of pandas have removed
    combined = pd.concat([data_left, data_right]).reset_index(drop=True)
    split = len(data_left)
    sample_diffs = []
    for t in range(args.trials):
        shuffle = np.random.permutation(len(combined))
        sample_left = combined[shuffle[:split]]
        sample_right = combined[shuffle[split:]]
        sample_diffs.append(sample_left.mean() - sample_right.mean())

    sample_diff_mean = sum(sample_diffs) / args.trials
    success_frac = sum([x <= actual_diff for x in sample_diffs]) / args.trials

{: title="bin/simulate.py"}
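Note that `success_frac` is a one-sided measure: the fraction of shuffled differences that are less than or equal to the observed one. If we had no prior expectation about which language's lines ought to be longer, a two-sided version (not used here) would compare absolute differences instead:

```python
# hypothetical two-sided variant: how often does a random relabeling produce a
# difference at least as extreme (in either direction) as the observed one?
two_sided_frac = sum(
    abs(x) >= abs(actual_diff) for x in sample_diffs
) / args.trials
```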

    # report
    print(f'parameters:')
    print(f'- left: "{args.left}"')
    print(f'- right: "{args.right}"')
    print(f'- low: {"null" if (args.low is None) else args.low}')
    print(f'- high: {"null" if (args.high is None) else args.high}')
    print(f'- trials: {args.trials}')
    print(f'- seed: {args.seed}')
    print(f'results:')
    print(f'- mean_left: {mean_left}')
    print(f'- mean_right: {mean_right}')
    print(f'- actual_diff: {actual_diff}')
    print(f'- sample_diff: {sample_diff_mean}')
    print(f'- successes: {success_frac}')

{: title="bin/simulate.py"}

How can we test our approach?

python bin/simulate.py --left test/test-a.csv --right test/test-b.csv --trials 10000 --seed 1234567
parameters:
- left: "test/test-a.csv"
- right: "test/test-b.csv"
- low: null
- high: null
- trials: 10000
- seed: 1234567
results:
- mean_left: 1.0
- mean_right: 10.0
- actual_diff: -9.0
- sample_diff: -0.0126
- successes: 0.1661
python bin/simulate.py --left data/javascript-counts.csv --right data/python-counts.csv --trials 5000 --seed 57622
parameters:
- left: "data/javascript-counts.csv"
- right: "data/python-counts.csv"
- low: null
- high: null
- trials: 5000
- seed: 57622
results:
- mean_left: 38.168300030969725
- mean_right: 36.41329390723824
- actual_diff: 1.7550061237314836
- sample_diff: 0.00398256511711528
- successes: 1.0