|
| 1 | +import csv |
| 2 | +import itertools |
| 3 | +import sys |
| 4 | + |
| 5 | +PROBS = { |
| 6 | + |
| 7 | + # Unconditional probabilities for having gene |
| 8 | + "gene": { |
| 9 | + 2: 0.01, |
| 10 | + 1: 0.03, |
| 11 | + 0: 0.96 |
| 12 | + }, |
| 13 | + |
| 14 | + "trait": { |
| 15 | + |
| 16 | + # Probability of trait given two copies of gene |
| 17 | + 2: { |
| 18 | + True: 0.65, |
| 19 | + False: 0.35 |
| 20 | + }, |
| 21 | + |
| 22 | + # Probability of trait given one copy of gene |
| 23 | + 1: { |
| 24 | + True: 0.56, |
| 25 | + False: 0.44 |
| 26 | + }, |
| 27 | + |
| 28 | + # Probability of trait given no gene |
| 29 | + 0: { |
| 30 | + True: 0.01, |
| 31 | + False: 0.99 |
| 32 | + } |
| 33 | + }, |
| 34 | + |
| 35 | + # Mutation probability |
| 36 | + "mutation": 0.01 |
| 37 | +} |
| 38 | + |
| 39 | + |
| 40 | +def main(): |
| 41 | + |
| 42 | + # Check for proper usage |
| 43 | + if len(sys.argv) != 2: |
| 44 | + sys.exit("Usage: python heredity.py data.csv") |
| 45 | + people = load_data(sys.argv[1]) |
| 46 | + |
| 47 | + # Keep track of gene and trait probabilities for each person |
| 48 | + probabilities = { |
| 49 | + person: { |
| 50 | + "gene": { |
| 51 | + 2: 0, |
| 52 | + 1: 0, |
| 53 | + 0: 0 |
| 54 | + }, |
| 55 | + "trait": { |
| 56 | + True: 0, |
| 57 | + False: 0 |
| 58 | + } |
| 59 | + } |
| 60 | + for person in people |
| 61 | + } |
| 62 | + |
| 63 | + # Loop over all sets of people who might have the trait |
| 64 | + names = set(people) |
| 65 | + for have_trait in powerset(names): |
| 66 | + |
| 67 | + # Check if current set of people violates known information |
| 68 | + fails_evidence = any( |
| 69 | + (people[person]["trait"] is not None and |
| 70 | + people[person]["trait"] != (person in have_trait)) |
| 71 | + for person in names |
| 72 | + ) |
| 73 | + if fails_evidence: |
| 74 | + continue |
| 75 | + |
| 76 | + # Loop over all sets of people who might have the gene |
| 77 | + for one_gene in powerset(names): |
| 78 | + for two_genes in powerset(names - one_gene): |
| 79 | + |
| 80 | + # Update probabilities with new joint probability |
| 81 | + p = joint_probability(people, one_gene, two_genes, have_trait) |
| 82 | + update(probabilities, one_gene, two_genes, have_trait, p) |
| 83 | + |
| 84 | + # Ensure probabilities sum to 1 |
| 85 | + normalize(probabilities) |
| 86 | + |
| 87 | + # Print results |
| 88 | + for person in people: |
| 89 | + print(f"{person}:") |
| 90 | + for field in probabilities[person]: |
| 91 | + print(f" {field.capitalize()}:") |
| 92 | + for value in probabilities[person][field]: |
| 93 | + p = probabilities[person][field][value] |
| 94 | + print(f" {value}: {p:.4f}") |
| 95 | + |
| 96 | + |
| 97 | +def load_data(filename): |
| 98 | + """ |
| 99 | + Load gene and trait data from a file into a dictionary. |
| 100 | + File assumed to be a CSV containing fields name, mother, father, trait. |
| 101 | + mother, father must both be blank, or both be valid names in the CSV. |
| 102 | + trait should be 0 or 1 if trait is known, blank otherwise. |
| 103 | + """ |
| 104 | + data = dict() |
| 105 | + with open(filename) as f: |
| 106 | + reader = csv.DictReader(f) |
| 107 | + for row in reader: |
| 108 | + name = row["name"] |
| 109 | + data[name] = { |
| 110 | + "name": name, |
| 111 | + "mother": row["mother"] or None, |
| 112 | + "father": row["father"] or None, |
| 113 | + "trait": (True if row["trait"] == "1" else |
| 114 | + False if row["trait"] == "0" else None) |
| 115 | + } |
| 116 | + return data |
| 117 | + |
| 118 | + |
| 119 | +def powerset(s): |
| 120 | + """ |
| 121 | + Return a list of all possible subsets of set s. |
| 122 | + """ |
| 123 | + s = list(s) |
| 124 | + return [ |
| 125 | + set(s) for s in itertools.chain.from_iterable( |
| 126 | + itertools.combinations(s, r) for r in range(len(s) + 1) |
| 127 | + ) |
| 128 | + ] |
| 129 | + |
| 130 | + |
| 131 | +def joint_probability(people, one_gene, two_genes, have_trait): |
| 132 | + """ |
| 133 | + Compute and return a joint probability. |
| 134 | +
|
| 135 | + The probability returned should be the probability that |
| 136 | + * everyone in set `one_gene` has one copy of the gene, and |
| 137 | + * everyone in set `two_genes` has two copies of the gene, and |
| 138 | + * everyone not in `one_gene` or `two_gene` does not have the gene, and |
| 139 | + * everyone in set `have_trait` has the trait, and |
| 140 | + * everyone not in set` have_trait` does not have the trait. |
| 141 | + """ |
| 142 | + probability = 1 |
| 143 | + |
| 144 | + for person in people: |
| 145 | + gene_number = 1 if person in one_gene else 2 if person in two_genes else 0 |
| 146 | + trait = True if person in have_trait else False |
| 147 | + |
| 148 | + gene_numb_prop = PROBS['gene'][gene_number] |
| 149 | + trait_prop = PROBS['trait'][gene_number][trait] |
| 150 | + |
| 151 | + if people[person]['mother'] is None: |
| 152 | + # no parents, use probability distribution |
| 153 | + probability *= gene_numb_prop * trait_prop |
| 154 | + else: |
| 155 | + # info about parents is available |
| 156 | + mother = people[person]['mother'] |
| 157 | + father = people[person]['father'] |
| 158 | + percentages = {} |
| 159 | + |
| 160 | + for ppl in [mother, father]: |
| 161 | + number = 1 if ppl in one_gene else 2 if ppl in two_genes else 0 |
| 162 | + perc = 0 + PROBS['mutation'] if number == 0 else 0.5 if number == 1 else 1 - PROBS['mutation'] |
| 163 | + percentages[ppl] = perc |
| 164 | + |
| 165 | + if gene_number == 0: |
| 166 | + # 0, none of parents gave gene |
| 167 | + probability *= (1 - percentages[mother]) * (1 - percentages[father]) |
| 168 | + elif gene_number == 1: |
| 169 | + # 1, one of parents gave gene |
| 170 | + probability *= (1 - percentages[mother]) * percentages[father] + percentages[mother] * (1 - percentages[father]) |
| 171 | + else: |
| 172 | + # 2, both of parents gave gene |
| 173 | + probability *= percentages[mother] * percentages[father] |
| 174 | + |
| 175 | + probability *= trait_prop |
| 176 | + |
| 177 | + return probability |
| 178 | + |
| 179 | + |
| 180 | +def update(probabilities, one_gene, two_genes, have_trait, p): |
| 181 | + """ |
| 182 | + Add to `probabilities` a new joint probability `p`. |
| 183 | + Each person should have their "gene" and "trait" distributions updated. |
| 184 | + Which value for each distribution is updated depends on whether |
| 185 | + the person is in `have_gene` and `have_trait`, respectively. |
| 186 | + """ |
| 187 | + for person in probabilities: |
| 188 | + gene_number = 1 if person in one_gene else 2 if person in two_genes else 0 |
| 189 | + probabilities[person]["gene"][gene_number] += p |
| 190 | + probabilities[person]["trait"][person in have_trait] += p |
| 191 | + |
| 192 | + |
| 193 | +def normalize(probabilities): |
| 194 | + """ |
| 195 | + Update `probabilities` such that each probability distribution |
| 196 | + is normalized (i.e., sums to 1, with relative proportions the same). |
| 197 | + """ |
| 198 | + normalized = probabilities.copy() |
| 199 | + for person in probabilities: |
| 200 | + for typ in ['gene', 'trait']: |
| 201 | + summed = sum(probabilities[person][typ].values()) |
| 202 | + for category in probabilities[person][typ]: |
| 203 | + val = probabilities[person][typ][category] |
| 204 | + normalized_val = val / summed |
| 205 | + normalized[person][typ][category] = normalized_val |
| 206 | + return normalized |
| 207 | + |
| 208 | + |
| 209 | +if __name__ == "__main__": |
| 210 | + main() |
0 commit comments