Commit 08a6b09

committed
s
1 parent 0c6891e commit 08a6b09

35 files changed: +19367 -4 lines changed

2-uncertainty/2a-pagerank/pagerank.py (+166)
@@ -0,0 +1,166 @@
import os
import random
import re
import sys

DAMPING = 0.85
SAMPLES = 10000


def main():
    if len(sys.argv) != 2:
        sys.exit("Usage: python pagerank.py corpus")
    corpus = crawl(sys.argv[1])
    ranks = sample_pagerank(corpus, DAMPING, SAMPLES)
    print(f"PageRank Results from Sampling (n = {SAMPLES})")
    for page in sorted(ranks):
        print(f"  {page}: {ranks[page]:.4f}")
    ranks = iterate_pagerank(corpus, DAMPING)
    print("PageRank Results from Iteration")
    for page in sorted(ranks):
        print(f"  {page}: {ranks[page]:.4f}")


def crawl(directory):
    """
    Parse a directory of HTML pages and check for links to other pages.
    Return a dictionary where each key is a page, and values are
    a list of all other pages in the corpus that are linked to by the page.
    """
    pages = dict()

    # Extract all links from HTML files
    for filename in os.listdir(directory):
        if not filename.endswith(".html"):
            continue
        with open(os.path.join(directory, filename)) as f:
            contents = f.read()
            links = re.findall(r"<a\s+(?:[^>]*?)href=\"([^\"]*)\"", contents)
            pages[filename] = set(links) - {filename}

    # Only include links to other pages in the corpus
    for filename in pages:
        pages[filename] = set(
            link for link in pages[filename]
            if link in pages
        )

    return pages


def transition_model(corpus, page, damping_factor):
    """
    Return a probability distribution over which page to visit next,
    given a current page.

    With probability `damping_factor`, choose a link at random
    linked to by `page`. With probability `1 - damping_factor`, choose
    a link at random chosen from all pages in the corpus.
    """
    prop_dist = {}

    # Check whether the page has outgoing links
    dict_len = len(corpus.keys())
    pages_len = len(corpus[page])

    if pages_len < 1:
        # No outgoing links: choose uniformly from all pages in the corpus
        for key in corpus.keys():
            prop_dist[key] = 1 / dict_len

    else:
        # There are outgoing links: combine the damping and random factors
        random_factor = (1 - damping_factor) / dict_len
        even_factor = damping_factor / pages_len

        for key in corpus.keys():
            if key not in corpus[page]:
                prop_dist[key] = random_factor
            else:
                prop_dist[key] = even_factor + random_factor

    return prop_dist


def sample_pagerank(corpus, damping_factor, n):
    """
    Return PageRank values for each page by sampling `n` pages
    according to transition model, starting with a page at random.

    Return a dictionary where keys are page names, and values are
    their estimated PageRank value (a value between 0 and 1). All
    PageRank values should sum to 1.
    """

    # Prepare a dictionary with a sample count of 0 for every page
    samples_dict = {page: 0 for page in corpus}
    sample = None

    # Iterate n times
    for _ in range(n):
        if sample:
            # A previous sample exists: choose the next page via the transition model
            dist = transition_model(corpus, sample, damping_factor)
            dist_lst = list(dist.keys())
            dist_weights = [dist[i] for i in dist]
            sample = random.choices(dist_lst, dist_weights, k=1)[0]
        else:
            # No previous sample: choose the first page uniformly at random
            sample = random.choice(list(corpus.keys()))

        # Count each sample
        samples_dict[sample] += 1

    # Turn sample counts into proportions
    for item in samples_dict:
        samples_dict[item] /= n

    return samples_dict


def iterate_pagerank(corpus, damping_factor):
    """
    Return PageRank values for each page by iteratively updating
    PageRank values until convergence.

    Return a dictionary where keys are page names, and values are
    their estimated PageRank value (a value between 0 and 1). All
    PageRank values should sum to 1.
    """
    pages_number = len(corpus)
    old_dict = {}
    new_dict = {}

    # Assign each page an initial rank of 1/N, where N is the number of pages in the corpus
    for page in corpus:
        old_dict[page] = 1 / pages_number

    # Repeatedly calculate new rank values based on the current rank values
    while True:
        for page in corpus:
            temp = 0
            for linking_page in corpus:
                # Add the rank contributed by every page that links to this page
                if page in corpus[linking_page]:
                    temp += old_dict[linking_page] / len(corpus[linking_page])
                # A page with no links is interpreted as linking to every page in the corpus
                if len(corpus[linking_page]) == 0:
                    temp += old_dict[linking_page] / len(corpus)
            temp *= damping_factor
            temp += (1 - damping_factor) / pages_number

            new_dict[page] = temp

        # Stop once no rank changes by more than 0.001 between iterations
        difference = max(abs(new_dict[x] - old_dict[x]) for x in old_dict)
        if difference < 0.001:
            break
        else:
            old_dict = new_dict.copy()

    return old_dict


if __name__ == "__main__":
    main()

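For reference, the iterative update implemented above is PR(p) = (1 - d) / N + d * sum over linking pages i of PR(i) / NumLinks(i). The sketch below is a minimal sanity check of that implementation: it hand-builds a tiny corpus (so crawl() and the HTML files are bypassed) and verifies that transition_model() returns a valid distribution and that the sampling and iteration estimates roughly agree. The toy corpus and the assumption that the module is importable as pagerank are made up for the example and are not part of this commit.

    # sanity_check_pagerank.py -- illustrative sketch, assumes pagerank.py is on the path
    import pagerank

    # Hand-built corpus: each page maps to the set of pages it links to
    corpus = {
        "1.html": {"2.html"},
        "2.html": {"1.html", "3.html"},
        "3.html": set()    # dangling page with no outgoing links
    }

    # From 2.html: (1 - 0.85) / 3 spread over all pages, plus 0.85 / 2 over its two links
    dist = pagerank.transition_model(corpus, "2.html", pagerank.DAMPING)
    print(dist)                # approx {'1.html': 0.475, '2.html': 0.05, '3.html': 0.475}
    print(sum(dist.values()))  # approx 1.0

    # The two estimators should produce similar rankings on the same corpus
    print(pagerank.sample_pagerank(corpus, pagerank.DAMPING, pagerank.SAMPLES))
    print(pagerank.iterate_pagerank(corpus, pagerank.DAMPING))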
2-uncertainty/2b-heredity/heredity.py (+210)
@@ -0,0 +1,210 @@
import csv
import itertools
import sys

PROBS = {

    # Unconditional probabilities for having gene
    "gene": {
        2: 0.01,
        1: 0.03,
        0: 0.96
    },

    "trait": {

        # Probability of trait given two copies of gene
        2: {
            True: 0.65,
            False: 0.35
        },

        # Probability of trait given one copy of gene
        1: {
            True: 0.56,
            False: 0.44
        },

        # Probability of trait given no gene
        0: {
            True: 0.01,
            False: 0.99
        }
    },

    # Mutation probability
    "mutation": 0.01
}


def main():

    # Check for proper usage
    if len(sys.argv) != 2:
        sys.exit("Usage: python heredity.py data.csv")
    people = load_data(sys.argv[1])

    # Keep track of gene and trait probabilities for each person
    probabilities = {
        person: {
            "gene": {
                2: 0,
                1: 0,
                0: 0
            },
            "trait": {
                True: 0,
                False: 0
            }
        }
        for person in people
    }

    # Loop over all sets of people who might have the trait
    names = set(people)
    for have_trait in powerset(names):

        # Check if current set of people violates known information
        fails_evidence = any(
            (people[person]["trait"] is not None and
             people[person]["trait"] != (person in have_trait))
            for person in names
        )
        if fails_evidence:
            continue

        # Loop over all sets of people who might have the gene
        for one_gene in powerset(names):
            for two_genes in powerset(names - one_gene):

                # Update probabilities with new joint probability
                p = joint_probability(people, one_gene, two_genes, have_trait)
                update(probabilities, one_gene, two_genes, have_trait, p)

    # Ensure probabilities sum to 1
    normalize(probabilities)

    # Print results
    for person in people:
        print(f"{person}:")
        for field in probabilities[person]:
            print(f"  {field.capitalize()}:")
            for value in probabilities[person][field]:
                p = probabilities[person][field][value]
                print(f"    {value}: {p:.4f}")


def load_data(filename):
    """
    Load gene and trait data from a file into a dictionary.
    File assumed to be a CSV containing fields name, mother, father, trait.
    mother, father must both be blank, or both be valid names in the CSV.
    trait should be 0 or 1 if trait is known, blank otherwise.
    """
    data = dict()
    with open(filename) as f:
        reader = csv.DictReader(f)
        for row in reader:
            name = row["name"]
            data[name] = {
                "name": name,
                "mother": row["mother"] or None,
                "father": row["father"] or None,
                "trait": (True if row["trait"] == "1" else
                          False if row["trait"] == "0" else None)
            }
    return data


def powerset(s):
    """
    Return a list of all possible subsets of set s.
    """
    s = list(s)
    return [
        set(s) for s in itertools.chain.from_iterable(
            itertools.combinations(s, r) for r in range(len(s) + 1)
        )
    ]


def joint_probability(people, one_gene, two_genes, have_trait):
    """
    Compute and return a joint probability.

    The probability returned should be the probability that
        * everyone in set `one_gene` has one copy of the gene, and
        * everyone in set `two_genes` has two copies of the gene, and
        * everyone not in `one_gene` or `two_genes` does not have the gene, and
        * everyone in set `have_trait` has the trait, and
        * everyone not in set `have_trait` does not have the trait.
    """
    probability = 1

    for person in people:
        gene_number = 1 if person in one_gene else 2 if person in two_genes else 0
        trait = person in have_trait

        gene_numb_prop = PROBS['gene'][gene_number]
        trait_prop = PROBS['trait'][gene_number][trait]

        if people[person]['mother'] is None:
            # No parental information: use the unconditional distribution
            probability *= gene_numb_prop * trait_prop
        else:
            # Information about the parents is available
            mother = people[person]['mother']
            father = people[person]['father']
            percentages = {}

            # Probability that each parent passes a copy of the gene to the child
            for parent in [mother, father]:
                number = 1 if parent in one_gene else 2 if parent in two_genes else 0
                perc = PROBS['mutation'] if number == 0 else 0.5 if number == 1 else 1 - PROBS['mutation']
                percentages[parent] = perc

            if gene_number == 0:
                # 0 copies: neither parent passed on the gene
                probability *= (1 - percentages[mother]) * (1 - percentages[father])
            elif gene_number == 1:
                # 1 copy: exactly one parent passed on the gene
                probability *= (
                    (1 - percentages[mother]) * percentages[father]
                    + percentages[mother] * (1 - percentages[father])
                )
            else:
                # 2 copies: both parents passed on the gene
                probability *= percentages[mother] * percentages[father]

            probability *= trait_prop

    return probability


def update(probabilities, one_gene, two_genes, have_trait, p):
    """
    Add to `probabilities` a new joint probability `p`.
    Each person should have their "gene" and "trait" distributions updated.
    Which value for each distribution is updated depends on whether
    the person is in `one_gene`/`two_genes` and `have_trait`, respectively.
    """
    for person in probabilities:
        gene_number = 1 if person in one_gene else 2 if person in two_genes else 0
        probabilities[person]["gene"][gene_number] += p
        probabilities[person]["trait"][person in have_trait] += p


def normalize(probabilities):
    """
    Update `probabilities` such that each probability distribution
    is normalized (i.e., sums to 1, with relative proportions the same).
    """
    # Normalize in place; main() relies on mutation rather than a return value
    for person in probabilities:
        for typ in ['gene', 'trait']:
            summed = sum(probabilities[person][typ].values())
            for category in probabilities[person][typ]:
                probabilities[person][typ][category] /= summed


if __name__ == "__main__":
    main()
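The joint probability above can be checked by hand against the PROBS table. The sketch below builds the people dictionary inline (in the same shape load_data() produces, so no CSV is needed) and compares joint_probability() with the product worked out in the comments; the family and the expected value are constructed here for illustration and are not part of this commit.

    # hand_check_heredity.py -- illustrative sketch, assumes heredity.py is on the path
    import heredity

    # A small family in the same shape load_data() returns
    people = {
        "Harry": {"name": "Harry", "mother": "Lily", "father": "James", "trait": None},
        "James": {"name": "James", "mother": None, "father": None, "trait": True},
        "Lily": {"name": "Lily", "mother": None, "father": None, "trait": False}
    }

    # P(Lily: 0 copies, no trait)   = 0.96 * 0.99           = 0.9504
    # P(James: 2 copies, trait)     = 0.01 * 0.65           = 0.0065
    # P(Harry gets exactly 1 copy)  = 0.99*0.99 + 0.01*0.01 = 0.9802
    # P(Harry: no trait | 1 copy)   = 0.44
    # Joint = 0.9504 * 0.0065 * 0.9802 * 0.44 ≈ 0.0026643
    p = heredity.joint_probability(
        people, one_gene={"Harry"}, two_genes={"James"}, have_trait={"James"}
    )
    print(p)  # 0.0026643247488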
