Commit 08a6b09

committed
s
1 parent 0c6891e commit 08a6b09

35 files changed: +19367 -4 lines changed

2-uncertainty/2a-pagerank/pagerank.py (+166)
@@ -0,0 +1,166 @@
import os
import random
import re
import sys

DAMPING = 0.85
SAMPLES = 10000


def main():
    if len(sys.argv) != 2:
        sys.exit("Usage: python pagerank.py corpus")
    corpus = crawl(sys.argv[1])
    ranks = sample_pagerank(corpus, DAMPING, SAMPLES)
    print(f"PageRank Results from Sampling (n = {SAMPLES})")
    for page in sorted(ranks):
        print(f"  {page}: {ranks[page]:.4f}")
    ranks = iterate_pagerank(corpus, DAMPING)
    print("PageRank Results from Iteration")
    for page in sorted(ranks):
        print(f"  {page}: {ranks[page]:.4f}")


def crawl(directory):
    """
    Parse a directory of HTML pages and check for links to other pages.
    Return a dictionary where each key is a page, and values are
    a list of all other pages in the corpus that are linked to by the page.
    """
    pages = dict()

    # Extract all links from HTML files
    for filename in os.listdir(directory):
        if not filename.endswith(".html"):
            continue
        with open(os.path.join(directory, filename)) as f:
            contents = f.read()
            links = re.findall(r"<a\s+(?:[^>]*?)href=\"([^\"]*)\"", contents)
            pages[filename] = set(links) - {filename}

    # Only include links to other pages in the corpus
    for filename in pages:
        pages[filename] = set(
            link for link in pages[filename]
            if link in pages
        )

    return pages


def transition_model(corpus, page, damping_factor):
    """
    Return a probability distribution over which page to visit next,
    given a current page.

    With probability `damping_factor`, choose a link at random
    linked to by `page`. With probability `1 - damping_factor`, choose
    a link at random chosen from all pages in the corpus.
    """
    prop_dist = {}

    # Check whether the page has outgoing links
    dict_len = len(corpus.keys())
    pages_len = len(corpus[page])

    if pages_len < 1:
        # No outgoing links: choose uniformly from all pages in the corpus
        for key in corpus.keys():
            prop_dist[key] = 1 / dict_len

    else:
        # There are outgoing links: combine the damping and random factors
        random_factor = (1 - damping_factor) / dict_len
        even_factor = damping_factor / pages_len

        for key in corpus.keys():
            if key not in corpus[page]:
                prop_dist[key] = random_factor
            else:
                prop_dist[key] = even_factor + random_factor

    return prop_dist


def sample_pagerank(corpus, damping_factor, n):
    """
    Return PageRank values for each page by sampling `n` pages
    according to transition model, starting with a page at random.

    Return a dictionary where keys are page names, and values are
    their estimated PageRank value (a value between 0 and 1). All
    PageRank values should sum to 1.
    """

    # Prepare a dictionary with a sample count of 0 for every page
    samples_dict = {page: 0 for page in corpus}
    sample = None

    # Iterate n times
    for _ in range(n):
        if sample:
            # A previous sample exists: choose the next page via the transition model
            dist = transition_model(corpus, sample, damping_factor)
            dist_lst = list(dist.keys())
            dist_weights = [dist[i] for i in dist]
            sample = random.choices(dist_lst, dist_weights, k=1)[0]
        else:
            # No previous sample: choose the first page uniformly at random
            sample = random.choice(list(corpus.keys()))

        # Count each sample
        samples_dict[sample] += 1

    # Turn sample counts into proportions
    for item in samples_dict:
        samples_dict[item] /= n

    return samples_dict


def iterate_pagerank(corpus, damping_factor):
    """
    Return PageRank values for each page by iteratively updating
    PageRank values until convergence.

    Return a dictionary where keys are page names, and values are
    their estimated PageRank value (a value between 0 and 1). All
    PageRank values should sum to 1.
    """
    pages_number = len(corpus)
    old_dict = {}
    new_dict = {}

    # Assign each page an initial rank of 1/N, where N is the number of pages in the corpus
    for page in corpus:
        old_dict[page] = 1 / pages_number

    # Repeatedly calculate new rank values based on the current rank values
    while True:
        for page in corpus:
            temp = 0
            for linking_page in corpus:
                # Add the rank contributed by every page that links to this page
                if page in corpus[linking_page]:
                    temp += old_dict[linking_page] / len(corpus[linking_page])
                # A page with no links is interpreted as linking to every page in the corpus
                if len(corpus[linking_page]) == 0:
                    temp += old_dict[linking_page] / len(corpus)
            temp *= damping_factor
            temp += (1 - damping_factor) / pages_number

            new_dict[page] = temp

        # Stop once no rank changes by more than 0.001 between iterations
        difference = max(abs(new_dict[x] - old_dict[x]) for x in old_dict)
        if difference < 0.001:
            break
        else:
            old_dict = new_dict.copy()

    return old_dict


if __name__ == "__main__":
    main()

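For reference, the iterative update implemented above is PR(p) = (1 - d) / N + d * sum over linking pages i of PR(i) / NumLinks(i). The sketch below is a minimal sanity check of that implementation: it hand-builds a tiny corpus (so crawl() and the HTML files are bypassed) and verifies that transition_model() returns a valid distribution and that the sampling and iteration estimates roughly agree. The toy corpus and the assumption that the module is importable as pagerank are made up for the example and are not part of this commit.

    # sanity_check_pagerank.py -- illustrative sketch, assumes pagerank.py is on the path
    import pagerank

    # Hand-built corpus: each page maps to the set of pages it links to
    corpus = {
        "1.html": {"2.html"},
        "2.html": {"1.html", "3.html"},
        "3.html": set()    # dangling page with no outgoing links
    }

    # From 2.html: (1 - 0.85) / 3 spread over all pages, plus 0.85 / 2 over its two links
    dist = pagerank.transition_model(corpus, "2.html", pagerank.DAMPING)
    print(dist)                # approx {'1.html': 0.475, '2.html': 0.05, '3.html': 0.475}
    print(sum(dist.values()))  # approx 1.0

    # The two estimators should produce similar rankings on the same corpus
    print(pagerank.sample_pagerank(corpus, pagerank.DAMPING, pagerank.SAMPLES))
    print(pagerank.iterate_pagerank(corpus, pagerank.DAMPING))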
2-uncertainty/2b-heredity/heredity.py (+210)
@@ -0,0 +1,210 @@
import csv
import itertools
import sys

PROBS = {

    # Unconditional probabilities for having gene
    "gene": {
        2: 0.01,
        1: 0.03,
        0: 0.96
    },

    "trait": {

        # Probability of trait given two copies of gene
        2: {
            True: 0.65,
            False: 0.35
        },

        # Probability of trait given one copy of gene
        1: {
            True: 0.56,
            False: 0.44
        },

        # Probability of trait given no gene
        0: {
            True: 0.01,
            False: 0.99
        }
    },

    # Mutation probability
    "mutation": 0.01
}


def main():

    # Check for proper usage
    if len(sys.argv) != 2:
        sys.exit("Usage: python heredity.py data.csv")
    people = load_data(sys.argv[1])

    # Keep track of gene and trait probabilities for each person
    probabilities = {
        person: {
            "gene": {
                2: 0,
                1: 0,
                0: 0
            },
            "trait": {
                True: 0,
                False: 0
            }
        }
        for person in people
    }

    # Loop over all sets of people who might have the trait
    names = set(people)
    for have_trait in powerset(names):

        # Check if current set of people violates known information
        fails_evidence = any(
            (people[person]["trait"] is not None and
             people[person]["trait"] != (person in have_trait))
            for person in names
        )
        if fails_evidence:
            continue

        # Loop over all sets of people who might have the gene
        for one_gene in powerset(names):
            for two_genes in powerset(names - one_gene):

                # Update probabilities with new joint probability
                p = joint_probability(people, one_gene, two_genes, have_trait)
                update(probabilities, one_gene, two_genes, have_trait, p)

    # Ensure probabilities sum to 1
    normalize(probabilities)

    # Print results
    for person in people:
        print(f"{person}:")
        for field in probabilities[person]:
            print(f"  {field.capitalize()}:")
            for value in probabilities[person][field]:
                p = probabilities[person][field][value]
                print(f"    {value}: {p:.4f}")


def load_data(filename):
    """
    Load gene and trait data from a file into a dictionary.
    File assumed to be a CSV containing fields name, mother, father, trait.
    mother, father must both be blank, or both be valid names in the CSV.
    trait should be 0 or 1 if trait is known, blank otherwise.
    """
    data = dict()
    with open(filename) as f:
        reader = csv.DictReader(f)
        for row in reader:
            name = row["name"]
            data[name] = {
                "name": name,
                "mother": row["mother"] or None,
                "father": row["father"] or None,
                "trait": (True if row["trait"] == "1" else
                          False if row["trait"] == "0" else None)
            }
    return data


def powerset(s):
    """
    Return a list of all possible subsets of set s.
    """
    s = list(s)
    return [
        set(s) for s in itertools.chain.from_iterable(
            itertools.combinations(s, r) for r in range(len(s) + 1)
        )
    ]


def joint_probability(people, one_gene, two_genes, have_trait):
    """
    Compute and return a joint probability.

    The probability returned should be the probability that
        * everyone in set `one_gene` has one copy of the gene, and
        * everyone in set `two_genes` has two copies of the gene, and
        * everyone not in `one_gene` or `two_genes` does not have the gene, and
        * everyone in set `have_trait` has the trait, and
        * everyone not in set `have_trait` does not have the trait.
    """
    probability = 1

    for person in people:
        gene_number = 1 if person in one_gene else 2 if person in two_genes else 0
        trait = person in have_trait

        gene_numb_prop = PROBS['gene'][gene_number]
        trait_prop = PROBS['trait'][gene_number][trait]

        if people[person]['mother'] is None:
            # No parental information: use the unconditional distribution
            probability *= gene_numb_prop * trait_prop
        else:
            # Information about the parents is available
            mother = people[person]['mother']
            father = people[person]['father']
            percentages = {}

            # Probability that each parent passes a copy of the gene to the child
            for parent in [mother, father]:
                number = 1 if parent in one_gene else 2 if parent in two_genes else 0
                perc = PROBS['mutation'] if number == 0 else 0.5 if number == 1 else 1 - PROBS['mutation']
                percentages[parent] = perc

            if gene_number == 0:
                # 0 copies: neither parent passed on the gene
                probability *= (1 - percentages[mother]) * (1 - percentages[father])
            elif gene_number == 1:
                # 1 copy: exactly one parent passed on the gene
                probability *= (
                    (1 - percentages[mother]) * percentages[father]
                    + percentages[mother] * (1 - percentages[father])
                )
            else:
                # 2 copies: both parents passed on the gene
                probability *= percentages[mother] * percentages[father]

            probability *= trait_prop

    return probability


def update(probabilities, one_gene, two_genes, have_trait, p):
    """
    Add to `probabilities` a new joint probability `p`.
    Each person should have their "gene" and "trait" distributions updated.
    Which value for each distribution is updated depends on whether
    the person is in `one_gene`/`two_genes` and `have_trait`, respectively.
    """
    for person in probabilities:
        gene_number = 1 if person in one_gene else 2 if person in two_genes else 0
        probabilities[person]["gene"][gene_number] += p
        probabilities[person]["trait"][person in have_trait] += p


def normalize(probabilities):
    """
    Update `probabilities` such that each probability distribution
    is normalized (i.e., sums to 1, with relative proportions the same).
    """
    # Normalize in place; main() relies on mutation rather than a return value
    for person in probabilities:
        for typ in ['gene', 'trait']:
            summed = sum(probabilities[person][typ].values())
            for category in probabilities[person][typ]:
                probabilities[person][typ][category] /= summed


if __name__ == "__main__":
    main()
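The joint probability above can be checked by hand against the PROBS table. The sketch below builds the people dictionary inline (in the same shape load_data() produces, so no CSV is needed) and compares joint_probability() with the product worked out in the comments; the family and the expected value are constructed here for illustration and are not part of this commit.

    # hand_check_heredity.py -- illustrative sketch, assumes heredity.py is on the path
    import heredity

    # A small family in the same shape load_data() returns
    people = {
        "Harry": {"name": "Harry", "mother": "Lily", "father": "James", "trait": None},
        "James": {"name": "James", "mother": None, "father": None, "trait": True},
        "Lily": {"name": "Lily", "mother": None, "father": None, "trait": False}
    }

    # P(Lily: 0 copies, no trait)   = 0.96 * 0.99           = 0.9504
    # P(James: 2 copies, trait)     = 0.01 * 0.65           = 0.0065
    # P(Harry gets exactly 1 copy)  = 0.99*0.99 + 0.01*0.01 = 0.9802
    # P(Harry: no trait | 1 copy)   = 0.44
    # Joint = 0.9504 * 0.0065 * 0.9802 * 0.44 ≈ 0.0026643
    p = heredity.joint_probability(
        people, one_gene={"Harry"}, two_genes={"James"}, have_trait={"James"}
    )
    print(p)  # 0.0026643247488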
