Basic Tools

How can we get some data?

import sys
print('\n'.join(sys.path))

/anaconda3/lib/python37.zip
/anaconda3/lib/python3.7
/anaconda3/lib/python3.7/lib-dynload
/anaconda3/lib/python3.7/site-packages
find /anaconda3 -name '*.py' -exec wc -l -c {} \;
      27     877 /anaconda3/bin/rst2xetex.py
      26     797 /anaconda3/bin/rst2latex.py
      67    1704 /anaconda3/bin/rst2odt_prepstyles.py
      26     720 /anaconda3/bin/rst2html4.py
      35    1145 /anaconda3/bin/rst2html5.py
...
import sys

print('Lines,Characters,Path')
for line in sys.stdin:
    fields = line.split()
    print('{},{},{}'.format(*fields))

{: title="wc2csv.py"}

find /anaconda3 -name '*.py' -exec wc -l -c {} \; \
  | python wc2csv.py \
  > python-local-package-size.csv
cat python-local-package-size.csv
Lines,Characters,Path
27,877,/anaconda3/bin/rst2xetex.py
26,797,/anaconda3/bin/rst2latex.py
67,1704,/anaconda3/bin/rst2odt_prepstyles.py
26,720,/anaconda3/bin/rst2html4.py
35,1145,/anaconda3/bin/rst2html5.py
...

How can we analyze tabular data?

import pandas

data = pandas.read_csv('python-local-package-size.csv')
print(data)

{: title="pandas-read-display.py"}

       Lines  Characters                                               Path
0         27         877                        /anaconda3/bin/rst2xetex.py
1         26         797                        /anaconda3/bin/rst2latex.py
2         67        1704               /anaconda3/bin/rst2odt_prepstyles.py
...
33243    256       10135  /anaconda3/share/glib-2.0/codegen/codegen_main.py
33244    431       17774     /anaconda3/share/glib-2.0/codegen/dbustypes.py
33245   3469      206544       /anaconda3/share/glib-2.0/codegen/codegen.py

[33246 rows x 3 columns]
print(data.columns)

{: title="pandas-read-display.py"}

Index(['Lines', 'Characters', 'Path'], dtype='object')
print(data.columns.values)

{: title="pandas-read-display.py"}

['Lines' 'Characters' 'Path']
import pandas as pd

packages = pd.read_csv('python-local-package-size.csv')
print(packages['Path'])

{: title="pandas-select-col.py"}

0                              /anaconda3/bin/rst2xetex.py
1                              /anaconda3/bin/rst2latex.py
2                     /anaconda3/bin/rst2odt_prepstyles.py
...
33243    /anaconda3/share/glib-2.0/codegen/codegen_main.py
33244       /anaconda3/share/glib-2.0/codegen/dbustypes.py
33245         /anaconda3/share/glib-2.0/codegen/codegen.py
Name: Path, Length: 33246, dtype: object
print(packages[['Lines', 'Characters']])

{: title="pandas-select-col.py"}

       Lines  Characters
0         27         877
1         26         797
2         67        1704
...
33243    256       10135
33244    431       17774
33245   3469      206544

[33246 rows x 2 columns]
print(packages[0])

{: title="pandas-select-row-fail.py"}

Traceback (most recent call last):
  File "/anaconda3/lib/python3.7/site-packages/pandas/core/indexes/base.py", line 2657, in get_loc
    return self._engine.get_loc(key)
  File "pandas/_libs/index.pyx", line 108, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/index.pyx", line 132, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/hashtable_class_helper.pxi", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas/_libs/hashtable_class_helper.pxi", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 0

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "pandas-select-row-fail.py", line 4, in <module>
    print(packages[0])
  File "/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py", line 2927, in __getitem__
    indexer = self.columns.get_loc(key)
  File "/anaconda3/lib/python3.7/site-packages/pandas/core/indexes/base.py", line 2659, in get_loc
    return self._engine.get_loc(self._maybe_cast_indexer(key))
  File "pandas/_libs/index.pyx", line 108, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/index.pyx", line 132, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/hashtable_class_helper.pxi", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas/_libs/hashtable_class_helper.pxi", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 0
print(packages.iloc[0])

{: title="pandas-select-row-iloc.py"}

Lines                                  27
Characters                            877
Path          /anaconda3/bin/rst2xetex.py
Name: 0, dtype: object
print(packages.iloc[0:5])

{: title="pandas-select-row-iloc.py"}

   Lines  Characters                                  Path
0     27         877           /anaconda3/bin/rst2xetex.py
1     26         797           /anaconda3/bin/rst2latex.py
2     67        1704  /anaconda3/bin/rst2odt_prepstyles.py
3     26         720           /anaconda3/bin/rst2html4.py
4     35        1145           /anaconda3/bin/rst2html5.py
print(packages['Characters'][0:3])

{: title="pandas-select-row-iloc.py"}

0     877
1     797
2    1704
Name: Characters, dtype: int64
print(packages.iloc[0:3]['Characters'])

{: title="pandas-select-row-iloc.py"}

0     877
1     797
2    1704
Name: Characters, dtype: int64

How can we visualize what's in a dataframe?

import pandas as pd
import plotly.express as px

packages = pd.read_csv('python-local-package-size.csv')
fig = px.scatter(packages, x='Lines', y='Characters')
fig.show()
fig.write_image('scatter-lines-characters.svg')

{: title="scatter-lines-characters.py"}

{% include figure id="scatter-lines-characters" cap="Characters vs. Lines" alt="FIXME" title="Scatter plot with most values clustered in the range X equals 0 to 5,000 and Y equals 0 to 0.2 million, with some outliers." fixme=true %}

How can we do calculations with dataframes?

import pandas as pd

example = pd.DataFrame(data=[[  1,   2,   3],
                             [ 10,  20,  30],
                             [100, 200, 300]],
                       columns=['left', 'middle', 'right'])
print(example)

{: title="aggregation.py"}

   left  middle  right
0     1       2      3
1    10      20     30
2   100     200    300
print(example['middle'] + example['right'])

{: title="aggregation.py"}

0      5
1     50
2    500
dtype: int64
print(7 * example['left'])

{: title="aggregation.py"}

0      7
1     70
2    700
Name: left, dtype: int64

Not a Number, Not Available, Null, and None

NaN stands for "Not a Number", a special value used to represent things like 0/0 [Kahan1997]. Despite the similarity in their names, it is not the same thing as NA (Not Available), which is a placeholder for missing values. To make things more confusing, SQL (the standard language for querying relational databases) uses null instead of NA to signal missing data, while many programming languages use null to mean "a reference that doesn't refer to anything". Python uses None instead of null, but we must be careful when reading and writing data to distinguish between empty strings, missing values, and the country code for Namibia.

print(example.agg('sum'))

{: title="aggregation.py"}

left      111
middle    222
right     333
dtype: int64
print(example.agg(['sum', 'mean']))

{: title="aggregation.py"}

       left  middle  right
sum   111.0   222.0  333.0
mean   37.0    74.0  111.0

How can we select subsets of data?

import pandas as pd

colors = pd.DataFrame(columns=['name', 'red', 'green', 'blue'],
                      data=[['yellow',  1.0, 1.0, 0.0],
                            ['aqua',    0.0, 1.0, 1.0],
                            ['fuchsia', 1.0, 0.0, 1.0]])
print(colors)

{: title="filter.py"}

      name  red  green  blue
0   yellow  1.0    1.0   0.0
1     aqua  0.0    1.0   1.0
2  fuchsia  1.0    0.0   1.0
red = colors['red']
print(red)

{: title="filter.py"}

0    1.0
1    0.0
2    1.0
Name: red, dtype: float64
has_red = (red == 1.0)
print(has_red)

{: title="filter.py"}

0     True
1    False
2     True
Name: red, dtype: bool
rows_with_red = colors.loc[has_red]
print(rows_with_red)

{: title="filter.py"}

      name  red  green  blue
0   yellow  1.0    1.0   0.0
2  fuchsia  1.0    0.0   1.0
print(colors.agg('mean'))

{: title="filter.py"}

red      0.666667
green    0.666667
blue     0.666667
dtype: float64
print(rows_with_red.agg('mean'))

{: title="filter.py"}

red      1.0
green    0.5
blue     0.5
dtype: float64
print(colors.loc[colors['red'] == 1.0].agg('mean'))

{: title="filter.py"}

red      1.0
green    0.5
blue     0.5
dtype: float64

How are lines and characters in Python files related?

import pandas as pd
import plotly.express as px

packages = pd.read_csv('python-local-package-size.csv')
packages = packages[packages['Lines'] > 0]
packages['ratio'] = packages['Characters'] / packages['Lines']

fig = px.histogram(packages, x='ratio')
fig.show()
fig.write_image('hist-ratio-unscaled.svg', width=600, height=400)

{: title="ratio.py"}

{% include figure id="hist-ratio-unscaled" cap="Ratio of Characters to Lines (Unscaled)" alt="FIXME" title="Linear-linear histogram with a single sharp spike at X equals 0 going up to Y equals 2,200 and nothing else visible up to X equals 9,000." fixme=true %}

fig = px.histogram(packages, x='ratio', nbins=100, log_y=True)
fig.show()
fig.write_image('hist-ratio-scaled.svg')

{: title="ratio.py"}

{% include figure id="hist-ratio-scaled" cap="Ratio of Characters to Lines (Scaled)" alt="FIXME" title="Log-linear histogram with a single sharp spike at X equals 0 going up to Y equals 3,000 and a sharp decline to Y equals 2 near X equals 1,800 and one outlier of Y equals 2 at X equals 9,000." fixme=true %}

print(f"Excluding {len(packages[packages['ratio'] > 100])}/{len(packages)} data points")
fig = px.histogram(packages[packages['ratio'] <= 100], x='ratio', nbins=100)
fig.show()
fig.write_image('hist-ratio-most.svg')

{: title="ratio.py"}

Excluding 92 data points

{% include figure id="hist-ratio-most" cap="Ratio of Characters to Lines (Most)" alt="FIXME" title="Linear-linear histogram with apparently normal distribution peaking at Y equals 2200 near X equals 35." fixme=true %}

Summary