Synthetic Data

Terms defined: single nucleotide polymorphism, synthetic data generator

Synthesizing Genomes

def main():
    """Generate a pool of synthetic genomes and save it.

    Reads the command-line options, seeds the random number generator from
    them so that runs are repeatable, builds the genomes, hands them to
    `add_susceptibility`, and writes the result to the requested output file.
    """
    options = parse_args()
    random.seed(options.seed)
    pool = random_genomes(
        options.length,
        options.num_genomes,
        options.num_snp,
        options.prob_other,
    )
    add_susceptibility(pool)
    save(options.outfile, pool)
def random_genomes(length, num_genomes, num_snp, prob_other):
    """Build a pool of genomes derived from a single random reference.

    Args:
        length: number of bases in every genome.
        num_genomes: number of individuals to create.
        num_snp: how many distinct locations receive single nucleotide
            polymorphisms; must lie between 0 and `length` inclusive.
        prob_other: probability passed to `_mutate_other` for each individual.

    Returns:
        A `GenePool` holding the length, the reference sequence, the sorted
        individuals, and the sorted SNP locations.
    """
    assert 0 <= num_snp <= length
    reference = random_bases(length)
    population = [reference] * num_genomes

    # Pick the SNP sites, then push every individual through each site in turn.
    snp_sites = random.sample(range(length), num_snp)
    for site in snp_sites:
        alternatives = _other_bases(reference, site)
        shuffled = random.sample(alternatives, k=len(alternatives))
        # The reference base always comes first, followed by the shuffled
        # candidates returned by _other_bases.
        bases = [reference[site], *shuffled]
        population = [
            _mutate_snps(reference, genome, site, bases) for genome in population
        ]

    # Any remaining mutation is restricted to sites that are not SNP sites.
    non_snp_sites = list(set(range(length)) - set(snp_sites))
    population = sorted(
        _mutate_other(genome, prob_other, non_snp_sites) for genome in population
    )
    snp_sites.sort()
    return GenePool(
        length=length,
        reference=reference,
        individuals=population,
        locations=snp_sites,
    )
def random_bases(length):
    """Return a string of `length` bases drawn with replacement from `DNA`.

    `length` must be strictly positive.
    """
    assert length > 0
    picks = random.choices(DNA, k=length)
    return ''.join(picks)
def _mutate_snps(reference, genome, loc, bases):
    """Return `genome` with the base at `loc` replaced by one drawn from `bases`.

    The replacement is selected by `_choose_one` using `SNP_PROBS`.
    `reference` is accepted for the caller's convenience but is not read here.
    """
    replacement = _choose_one(bases, SNP_PROBS)
    prefix, suffix = genome[:loc], genome[loc + 1:]
    return prefix + replacement + suffix
def _mutate_other(genome, prob, locations):
    """Introduce other mutations at specified locations."""
    if random.random() > prob:
        return genome
    loc = random.sample(locations, k=1)[0]
    base = random.choice(_other_bases(genome, loc))
    genome = genome[:loc] + base + genome[loc + 1:]
    return genome

Synthesizing Samples

def main():
    """Generate synthetic samples from a previously saved set of genomes.

    Reads the command-line options, seeds the random number generator, loads
    the genomes from the JSON file named in the options, looks up the
    geographic parameters, generates the samples, and saves them.
    """
    options = parse_args()
    random.seed(options.seed)
    genome_text = Path(options.genomes).read_text()
    pool = json.loads(genome_text)
    samples = generate_samples(options, pool, get_geo_params(options))
    save(options, samples)
site,lon,lat
COW,-124.04519,48.82172
YOU,-124.197,48.87251
HMB,-124.17555,48.81673
GBY,-124.4593,48.9209
label,site,date,num,peak,relative_sd,radius
1748,COW,2023-04-27,23,100.0,0.1,0.1
1749,COW,2023-04-28,11,100.0,0.1,0.1
1755,COW,2023-05-13,15,101.0,0.11,0.1
1781,YOU,2023-05-01,12,90.0,0.15,0.15
1790,HMB,2023-05-02,19,107.0,0.22,0.11
1803,GBY,2023-05-08,8,95.0,0.1,0.14
def get_geo_params(args):
    """Look up the longitude, latitude and radius for the requested site.

    Reads `sites.csv` and `surveys.csv` from `args.paramsdir`, joins them on
    the `site` column, and takes the first joined row whose site equals
    `args.site`.

    Returns:
        A dict with the keys 'lon', 'lat' and 'radius'.
    """
    params_dir = Path(args.paramsdir)
    sites = pd.read_csv(params_dir / 'sites.csv')
    surveys = pd.read_csv(params_dir / 'surveys.csv')
    merged = pd.merge(sites, surveys, how='inner', on='site')
    # A site may have several surveys; only the first matching row is used.
    row = merged.loc[merged['site'] == args.site].iloc[0]
    return {key: row[key] for key in ('lon', 'lat', 'radius')}
def generate_samples(args, genomes, geo_params):
    """Create one located reading for every individual genome.

    For each sequence in `genomes['individuals']` a point and its distance are
    obtained from `random_geo_point`. The upper bound of the reading grows
    with that distance relative to `geo_params['radius']`, multiplied by
    `args.mutant` when the sequence carries the susceptible base at the
    susceptible location and by `args.normal` otherwise.

    Returns:
        A DataFrame with the columns 'lon', 'lat', 'sequence' and 'reading',
        with the numeric columns rounded to their configured precision.
    """
    rows = []
    for sequence in genomes['individuals']:
        point, distance = random_geo_point(**geo_params)
        is_susceptible = (
            sequence[genomes['susceptible_loc']] == genomes['susceptible_base']
        )
        limit = args.mutant if is_susceptible else args.normal
        scale = limit * distance / geo_params['radius']
        upper = MIN_SNAIL_SIZE + MAX_SNAIL_SIZE * scale
        reading = random.uniform(MIN_SNAIL_SIZE, upper)
        rows.append((point.longitude, point.latitude, sequence, reading))

    frame = pd.DataFrame(rows, columns=('lon', 'lat', 'sequence', 'reading'))
    precisions = (
        ('lon', LON_LAT_PRECISION),
        ('lat', LON_LAT_PRECISION),
        ('reading', SNAIL_PRECISION),
    )
    for column, digits in precisions:
        frame[column] = frame[column].round(digits)
    return frame
# Not referenced in the code shown here; presumably the number of degrees in a
# full circle — confirm at the point of use.
CIRCLE = 360.0
# Decimal places kept when generate_samples rounds the 'lon' and 'lat' columns.
LON_LAT_PRECISION = 5
# NOTE(review): not referenced in the code shown here; generate_samples rounds
# readings with SNAIL_PRECISION instead — confirm whether both are needed.
READING_PRECISION = 1
# Lower bound of the uniform range from which each reading is drawn.
MIN_SNAIL_SIZE = 0.5
# Multiplied by the per-sample scale and added to MIN_SNAIL_SIZE to give the
# upper bound of the uniform range for a reading.
MAX_SNAIL_SIZE = 5.0
# Decimal places kept when generate_samples rounds the 'reading' column.
SNAIL_PRECISION = 1

Analysis

scatterplot of all readings at all locations
Figure 8.1: Reading as a function of location and base (all)
scatterplot of all readings at all locations
Figure 8.2: Reading as a function of location and base (all)
rank plot of readings at mutation locations
Figure 8.3: Reading as a function of location (rank order)