forked from learn-co-curriculum/dsc-selecting-data-v2-4
-
Notifications
You must be signed in to change notification settings - Fork 0
/
splitter.py
170 lines (121 loc) · 4.58 KB
/
splitter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
import json
import os
import shlex
import subprocess
import sys

from git import Repo, Git, GitCommandError
# CONSTANTS
# Marker searched for in code-cell source lines to identify solution cells.
SOLUTION_TAG = "__SOLUTION__"
# Branch that is checked out first and whose files are copied onto the others.
CURRICULUM_BRANCH = "curriculum"
# Default primary branch name; the RUN section reassigns it to 'main' or 'master'.
MASTER_BRANCH = "master"
# Branch that receives the notebook built from the tagged solution cells.
SOLUTION_BRANCH = "solution"
# Command-line flag looked up in sys.argv to supply a custom commit message.
CUSTOM_COMMIT_MSG_FLAG = "-m"
# FUNCTIONS
def get_notebook_json(filename="index.ipynb"):
    """Load a notebook file and return its parsed JSON content.

    Args:
        filename: Path of the notebook to read. Defaults to ``index.ipynb``
            relative to the current working directory.

    Returns:
        The decoded JSON document, exactly as produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Notebook files are UTF-8 encoded JSON; naming the encoding keeps the
    # read independent of the platform's default locale encoding.
    with open(filename, "r", encoding="utf-8") as f:
        return json.load(f)
def is_markdown_cell(cell):
    """Return True when the cell's ``"cell_type"`` entry equals ``"markdown"``."""
    kind = cell["cell_type"]
    return kind == "markdown"
def contains_tag(line):
    """Tell whether a source line carries the solution tag as its own token.

    The line is stripped and split on single spaces; the result is True when
    one of the resulting tokens is exactly ``SOLUTION_TAG`` or ``SOLUTION_TAG``
    prefixed with ``#`` (so both '# __SOLUTION__' and '#__SOLUTION__' match).
    """
    tokens = line.strip().split(" ")
    for candidate in (SOLUTION_TAG, f"#{SOLUTION_TAG}"):
        if candidate in tokens:
            return True
    return False
def is_solution_cell(cell):
    """Return True for a code cell in which at least one source line is tagged.

    Cells whose ``"cell_type"`` is not ``"code"`` are never solution cells.
    """
    if cell["cell_type"] == "code":
        # Every source line is checked; the cell qualifies on the first hit or any later one.
        tagged_lines = sum(1 for src_line in cell["source"] if contains_tag(src_line))
        return tagged_lines > 0
    return False
# removes the __SOLUTION__ tag line from tagged code cells
def untag(cell):
    """Drop every tagged source line from a code cell and return the cell.

    The cell is modified in place: its ``"source"`` entry is replaced by a new
    list holding only the lines for which ``contains_tag`` is false. Cells
    that are not code cells are returned untouched.
    """
    if cell["cell_type"] == "code":
        kept_lines = []
        for src_line in cell["source"]:
            if not contains_tag(src_line):
                kept_lines.append(src_line)
        cell["source"] = kept_lines
    return cell
def create_master_notebook(nb):
    """Keep only the cells accepted by ``for_master`` and return the notebook.

    The ``"cells"`` entry of ``nb`` is replaced with the filtered list, so the
    mapping passed in is the one that is modified and returned.
    """
    kept_cells = []
    for cell in nb["cells"]:
        if for_master(cell):
            kept_cells.append(cell)
    nb["cells"] = kept_cells
    return nb
def for_master(cell):
    """Return True for markdown cells and for any cell that is not a solution cell."""
    if is_markdown_cell(cell):
        return True
    return not is_solution_cell(cell)
def for_sol(cell):
    """Return True for markdown cells and for cells that are solution cells."""
    if is_markdown_cell(cell):
        return True
    return is_solution_cell(cell)
def create_sol_notebook(nb):
    """Keep only the cells accepted by ``for_sol``, untagged, and return the notebook.

    Each retained cell is passed through ``untag`` before being stored. The
    ``"cells"`` entry of ``nb`` is replaced with the resulting list, so the
    mapping passed in is the one that is modified and returned.
    """
    solution_cells = []
    for cell in nb["cells"]:
        if for_sol(cell):
            solution_cells.append(untag(cell))
    nb["cells"] = solution_cells
    return nb
def write_new_notebook(notebook):
    """Serialise ``notebook`` as JSON into ``index.ipynb`` in the working directory.

    Any existing ``index.ipynb`` is overwritten.

    Args:
        notebook: A JSON-serialisable object holding the notebook content.

    Raises:
        TypeError: If ``notebook`` contains values ``json.dumps`` cannot encode.
        OSError: If the file cannot be opened or written.
    """
    # Serialise before opening so a failure here does not truncate the
    # existing file.
    payload = json.dumps(notebook)
    # The context manager guarantees the handle is closed even when the
    # write raises.
    with open("index.ipynb", "w", encoding="utf-8") as f:
        f.write(payload)
def notebook_to_markdown():
    """Convert ``index.ipynb`` to markdown and store the result as ``README.md``.

    Runs ``jupyter nbconvert`` followed by ``mv`` as external commands, in that
    order. Their exit codes are not inspected.
    """
    convert_cmd = ["jupyter", "nbconvert", "index.ipynb", "--to", "markdown"]
    rename_cmd = ["mv", "index.md", "README.md"]
    for command in (convert_cmd, rename_cmd):
        subprocess.call(command)
def sync_branch(repo, branch, notebook, msg="Curriculum Auto-Sync"):
    """Rebuild ``branch`` from the curriculum branch plus ``notebook`` and push it.

    Args:
        repo: Repository object whose ``git`` attribute runs git commands.
        branch: Name of the branch to update.
        notebook: Notebook content written to ``index.ipynb`` on that branch.
        msg: Commit message used for the sync commit.

    If ``branch`` cannot be checked out, nothing else happens.
    """
    try:
        repo.git.checkout(branch)
    except GitCommandError:
        # The branch is missing (or cannot be checked out): skip it silently.
        return

    # Pull every file from the curriculum branch into the current branch via
    # `git checkout <branch> .`; the notebook and README are overwritten below.
    # https://superuser.com/questions/692794/how-can-i-get-all-the-files-from-one-git-branch-and-put-them-into-the-current-b/1431858#1431858
    repo.git.checkout(CURRICULUM_BRANCH, ".")

    # Remove the existing image folder; nbconvert regenerates it with the notebook.
    subprocess.call(["rm", "-rf", "index_files"])

    # Replace index.ipynb with the branch-specific notebook, then rebuild the README.
    write_new_notebook(notebook)
    notebook_to_markdown()

    # Stage and commit everything, then publish the branch.
    add_and_commit(repo, msg)
    print(f"pushing to remote {branch} branch")
    repo.git.push("origin", branch)
def get_commit_message(repo):
    """Choose the commit message for this run.

    If ``CUSTOM_COMMIT_MSG_FLAG`` appears in ``sys.argv`` and is followed by
    another argument, that argument is returned. Otherwise the message of the
    commit at ``repo.head`` is used.

    Args:
        repo: Repository object exposing ``head.commit.message``.

    Returns:
        The commit message string to use.
    """
    args = sys.argv
    if CUSTOM_COMMIT_MSG_FLAG in args:
        # Compare positions explicitly: testing the index for truthiness would
        # treat position 0 as "flag absent".
        value_pos = args.index(CUSTOM_COMMIT_MSG_FLAG) + 1
        if value_pos < len(args):
            return args[value_pos]
        # The flag was the last argument; fall back instead of raising IndexError.
        print(f"{CUSTOM_COMMIT_MSG_FLAG} given without a message, using the latest commit message")
    return repo.head.commit.message
def add_and_commit(repo, commit_msg):
    """Stage all changes in the working tree and commit them with ``commit_msg``.

    A ``GitCommandError`` raised by the commit is not propagated; it is
    reported by printing "Nothing to commit".
    """
    git = repo.git
    git.add(".")
    try:
        git.commit("-m", commit_msg)
    except GitCommandError:
        print("Nothing to commit")
# RUN
# ======================
# Publishes the curriculum branch, then derives the primary ("main"/"master")
# and solution branches from its notebook and pushes each of them.

# Identity
git_ssh_identity_file = os.path.expanduser('~/.ssh/id_rsa')
# Quote the path so a home directory containing spaces still yields a valid command.
git_ssh_cmd = f'ssh -i {shlex.quote(git_ssh_identity_file)}'

repo = Repo(os.getcwd())

# `Git.custom_environment` is a context manager: calling it on a throwaway
# `Git()` without entering it never changes any environment. Apply the SSH
# command to the Git instance that actually runs this script's commands, and
# only when the key file is present.
if os.path.isfile(git_ssh_identity_file):
    repo.git.update_environment(GIT_SSH_COMMAND=git_ssh_cmd)

# Handle the updated main-branch naming convention: prefer 'main', and fall
# back to 'master' when 'main' cannot be checked out.
try:
    repo.git.checkout('main')
    MASTER_BRANCH = 'main'
except GitCommandError:
    print('The main branch is not named "main"')
    MASTER_BRANCH = 'master'

# Everything below starts from the curriculum branch, so it has to exist.
try:
    repo.git.checkout(CURRICULUM_BRANCH)
except GitCommandError as err:
    raise Exception(f"A branch called {CURRICULUM_BRANCH} must exist") from err

commit_message = get_commit_message(repo)

# Refresh README.md on the curriculum branch and publish it.
notebook_to_markdown()
add_and_commit(repo, commit_message)
print(f"pushing to remote {CURRICULUM_BRANCH} branch")
repo.git.push("origin", CURRICULUM_BRANCH)

notebook_json = get_notebook_json()
# dict(...) is a shallow copy: each helper replaces the copy's "cells" entry,
# so the loaded notebook keeps its full cell list. The cell dicts themselves
# are still shared between the copies.
master_notebook = create_master_notebook(dict(notebook_json))
sol_notebook = create_sol_notebook(dict(notebook_json))

sync_branch(repo, MASTER_BRANCH, master_notebook, msg=commit_message)
sync_branch(repo, SOLUTION_BRANCH, sol_notebook, msg=commit_message)

# leave user on curriculum branch
repo.git.checkout(CURRICULUM_BRANCH)