# shared/datagen.py
"""Data generator."""
import polars as pl
import random
import sys
# Number of rows produced by generate().
NUM = 10

# Relative sampling weight of each sex label (consumed by gen_sex()).
SEX = {"F": 0.48, "M": 0.47, "X": 0.05}

# Mean of the normal distribution used for weight, keyed by sex label.
WEIGHT = {"F": 6.3, "M": 6.1, "X": 6.2}
# Standard deviation of the weight distribution (shared by all labels).
WEIGHT_STD = 0.4

# Per-sex factor: the mean length is the individual's weight times this value.
LENGTH = {"F": 0.50, "M": 0.54, "X": 0.52}
# Standard deviation of the length distribution (shared by all labels).
LENGTH_STD = 0.08

# Number of decimal places kept when rounding generated values.
PRECISION = 2
def generate(seed):
    """Build a table of NUM randomly generated records.

    The module-level random generator is seeded with ``seed`` first, so the
    same seed yields the same table.  Each record holds a sex label, a
    weight drawn for that sex, and a length drawn for that sex and weight.

    Returns a polars DataFrame with columns "sex", "weight" and "length".
    """
    random.seed(seed)

    def make_row():
        # Draw order matters: weight depends on sex, length on both.
        label = gen_sex()
        mass = gen_weight(label)
        return [label, mass, gen_length(label, mass)]

    rows = [make_row() for _ in range(NUM)]
    columns = ["sex", "weight", "length"]
    return pl.DataFrame(rows, schema=columns, orient="row")
def gen_length(sex, weight):
    """Draw a random length for the given sex label and weight.

    The value is sampled from a normal distribution with mean
    ``weight * LENGTH[sex]`` and standard deviation ``LENGTH_STD``, then
    rounded to ``PRECISION`` decimal places.
    """
    mean = weight * LENGTH[sex]
    sample = random.gauss(mean, LENGTH_STD)
    return round(sample, PRECISION)
def gen_sex():
    """Pick one sex label at random, weighted by the values in SEX."""
    labels = list(SEX)
    weights = list(SEX.values())
    # choices() returns a list; with the default k=1 it holds one element.
    (picked,) = random.choices(labels, weights=weights)
    return picked
def gen_weight(sex):
    """Draw a random weight for the given sex label.

    The value is sampled from a normal distribution with mean
    ``WEIGHT[sex]`` and standard deviation ``WEIGHT_STD``, then rounded to
    ``PRECISION`` decimal places.
    """
    mean = WEIGHT[sex]
    sample = random.gauss(mean, WEIGHT_STD)
    return round(sample, PRECISION)
if __name__ == "__main__":
    # The optional first command-line argument is an integer seed; when it
    # is absent, None is passed through to generate().
    if len(sys.argv) > 1:
        seed = int(sys.argv[1])
    else:
        seed = None
    # Emit the generated table as CSV on standard output.
    generate(seed).write_csv(sys.stdout)