Skip to content

Commit 4fc7ccb

Browse files
authored
Merge pull request #40 from aws-samples/handle-ghost-kernels
handle kernels stuck in starting state
2 parents 2be03fe + e6f49d1 commit 4fc7ccb

File tree

3 files changed

+43
-39
lines changed

3 files changed

+43
-39
lines changed
-7.92 KB
Binary file not shown.
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
version_info = (0, 1, 1)
1+
version_info = (0, 1, 5)
22
__version__ = ".".join(map(str, version_info))

sagemaker_studio_autoshutdown/idle_checker.py

Lines changed: 42 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -10,17 +10,17 @@
1010
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
1111
# ANY KIND, either express or implied. See the License for the specific
1212
# language governing permissions and limitations under the License.
13-
13+
1414
import asyncio
1515
import json
1616
import time
1717
import traceback
1818
from contextlib import suppress
1919
from datetime import datetime
20-
20+
2121
from notebook.utils import url_path_join
22-
23-
22+
23+
2424
class IdleChecker(object):
2525
def __init__(self):
2626
self.interval = 10 # frequency for checking idle sessions in seconds
@@ -36,7 +36,7 @@ def __init__(self):
3636
self.app_url = "http://0.0.0.0:8888"
3737
self.keep_terminals = False
3838
self.inservice_apps = {}
39-
39+
4040
# Function to GET the xsrf token
4141
async def fetch_xsrf_token(self):
4242
url = url_path_join(self.app_url, self.base_url, "tree")
@@ -45,9 +45,9 @@ async def fetch_xsrf_token(self):
4545
self.log.info("response headers: " + str(response.headers))
4646
if "Set-Cookie" in response.headers:
4747
return response.headers["Set-Cookie"].split(";")[0].split("=")[1]
48-
48+
4949
return None
50-
50+
5151
# Invoke idle_checks() function
5252
async def run_idle_checks(self):
5353
while True:
@@ -59,7 +59,7 @@ async def run_idle_checks(self):
5959
except Exception:
6060
self.errors = traceback.format_exc()
6161
self.log.error(self.errors)
62-
62+
6363
# Entrypoint function to get the value from handlers(POST API call) and start background job
6464
def start(self, base_url, log_handler, client, idle_time, keep_terminals):
6565
self.idle_time = idle_time
@@ -68,26 +68,26 @@ def start(self, base_url, log_handler, client, idle_time, keep_terminals):
6868
self.log = log_handler
6969
self.keep_terminals = keep_terminals
7070
self.errors = None # clear error array at start
71-
71+
7272
if not self._running:
7373
self.count += 1
7474
self._running = True
7575
self.task = asyncio.ensure_future(self.run_idle_checks())
76-
76+
7777
async def stop(self):
    """Stop the background idle-check task if the checker is running.

    When ``self._running`` is falsy this is a no-op. Otherwise the flag is
    cleared and, if ``self.task`` is set, the task is cancelled and awaited
    so that the cancellation has fully propagated before returning.
    """
    # Guard clause: nothing to stop.
    if not self._running:
        return

    self._running = False

    background_task = self.task
    if not background_task:
        return

    background_task.cancel()
    # Awaiting a cancelled task raises CancelledError; that is the expected
    # outcome here, so it is swallowed on purpose.
    with suppress(asyncio.CancelledError):
        await background_task
84-
84+
8585
def get_runcounts(self):
    """Return the current value of ``self.count``.

    ``start`` increments this counter each time it launches the background
    idle-check task.
    """
    run_count = self.count
    return run_count
87-
87+
8888
def get_runerrors(self):
    """Return the current value of ``self.errors``.

    ``start`` resets the attribute to ``None``; ``run_idle_checks`` stores a
    formatted traceback in it when an idle check raises.
    """
    last_errors = self.errors
    return last_errors
90-
90+
9191
# Function to check if the notebook is in Idle state
9292
def is_idle(self, last_activity, seconds=False):
9393
last_activity = datetime.strptime(last_activity, "%Y-%m-%dT%H:%M:%S.%fz")
@@ -107,43 +107,43 @@ def is_idle(self, last_activity, seconds=False):
107107
"Notebook is not idle. Last activity time = " + str(last_activity)
108108
)
109109
return False
110-
110+
111111
# Function to get the list of Kernel sessions
112112
async def get_sessions(self):
    """Fetch and return the kernel sessions reported by the server.

    Issues a GET against ``<app_url>/<base_url>/api/sessions`` through
    ``self.tornado_client``, decodes the JSON response body, logs the decoded
    value and returns it.
    """
    sessions_url = url_path_join(self.app_url, self.base_url, "api", "sessions")
    reply = await self.tornado_client.fetch(sessions_url, method="GET")
    kernel_sessions = json.loads(reply.body)
    self.log.info(" Kernel Session is = " + str(kernel_sessions))
    return kernel_sessions
118-
118+
119119
# Function to get the list of System Terminals
120120
async def get_terminals(self):
    """Fetch and return the terminals reported by the server.

    Issues a GET against ``<app_url>/<base_url>/api/terminals`` through
    ``self.tornado_client`` and returns the decoded JSON response body.
    """
    endpoint = url_path_join(self.app_url, self.base_url, "api", "terminals")
    reply = await self.tornado_client.fetch(endpoint, method="GET")
    return json.loads(reply.body)
125-
125+
126126
# Function to get the list of running Apps
127127
async def get_apps(self):
    """Fetch and return the running apps reported by the server.

    Issues a GET against ``<app_url>/<base_url>/sagemaker/api/apps`` through
    ``self.tornado_client``, decodes the JSON response body, logs the decoded
    value and returns it.
    """
    apps_url = url_path_join(self.app_url, self.base_url, "sagemaker", "api", "apps")
    reply = await self.tornado_client.fetch(apps_url, method="GET")
    running_apps = json.loads(reply.body)
    self.log.info(" Running App name is = " + str(running_apps))
    return running_apps
133-
133+
134134
# Function to build app information ( kernel sessions and image terminals)
135135
async def build_app_info(self):
136136
apps = await self.get_apps()
137137
apps_info = {}
138138
for app in apps:
139139
apps_info[app["app_name"]] = {"app": app, "sessions": [], "terminals": []}
140-
140+
141141
sessions = await self.get_sessions()
142142
for notebook in sessions:
143143
if notebook["kernel"]:
144144
notebook_app_name = notebook["kernel"]["app_name"]
145145
apps_info[notebook_app_name]["sessions"].append(notebook)
146-
146+
147147
terminals = await self.get_terminals()
148148
for terminal in terminals:
149149
if terminal["name"].find("arn:") != 0:
@@ -154,18 +154,18 @@ async def build_app_info(self):
154154
self.log.info("Env Arn = " + str(env_arn))
155155
self.log.info("Terminal Id = " + str(terminal_id))
156156
self.log.info("Instance Type = "+ str(instance_type))
157-
157+
158158
for app in apps:
159159
if (
160160
app["environment_arn"] == env_arn
161161
and app["instance_type"] == instance_type
162162
):
163163
apps_info[app["app_name"]]["terminals"].append(terminal)
164164
break
165-
165+
166166
self.log.info(str(apps_info))
167167
return apps_info
168-
168+
169169
# Function to delete a kernel session
170170
async def delete_session(self, session):
171171
headers = {}
@@ -178,7 +178,7 @@ async def delete_session(self, session):
178178
)
179179
deleted = await self.tornado_client.fetch(url, method="DELETE", headers=headers)
180180
self.log.info("Delete kernel response: " + str(deleted))
181-
181+
182182
# Function to delete an application
183183
async def delete_application(self, app_id):
184184
headers = {}
@@ -194,12 +194,12 @@ async def delete_application(self, app_id):
194194
self.log.info("Delete App response: " + str(deleted_apps))
195195
if deleted_apps.code == 204 or deleted_apps.code == 200:
196196
self.inservice_apps.pop(app_id, None)
197-
197+
198198
# Function to check the notebook status
199199
def check_notebook(self, notebook):
200200
terminate = True
201-
if notebook["kernel"]["execution_state"] == "idle":
202-
self.log.info("found idle session:" + str(notebook))
201+
if notebook["kernel"]["execution_state"] in ("idle", "starting"):
202+
self.log.info("found idle/starting session:" + str(notebook))
203203
if not self.ignore_connections:
204204
if notebook["kernel"]["connections"] == 0:
205205
if not self.is_idle(notebook["kernel"]["last_activity"]):
@@ -212,7 +212,7 @@ def check_notebook(self, notebook):
212212
else:
213213
terminate = False
214214
return terminate
215-
215+
216216
# Run idle checks apps and image terminals
217217
async def idle_checks(self):
218218
apps_info = await self.build_app_info()
@@ -221,38 +221,38 @@ async def idle_checks(self):
221221
for deleted_app in deleted_apps:
222222
inservice_apps.pop(deleted_app, None)
223223
self.log.info("inservice app not inservice anymore : " + str(deleted_app))
224-
224+
225225
for app_name, app in apps_info.items():
226226
num_sessions = len(app["sessions"])
227227
num_terminals = len(app["terminals"])
228-
228+
229229
if num_sessions > 0 or num_terminals > 0:
230230
self.log.info(
231231
"# of sessions: "
232232
+ str(num_sessions)
233233
+ "; # of terminals: "
234234
+ str(num_terminals)
235235
)
236-
236+
237237
if num_sessions == 0 and num_terminals == 0:
238238
# Check if app is active and kill
239239
# Check if the current app is part of the in service apps
240240
if app_name not in inservice_apps:
241241
# Regsiter a new inservice app
242242
inservice_apps[app_name] = time.time()
243-
243+
244244
else:
245245
if int(time.time() - inservice_apps[app_name]) > self.idle_time:
246246
self.log.info(
247247
"Keep alive time for terminal reached : " + str(app_name)
248248
)
249249
await self.delete_application(app_name)
250-
250+
251251
# elif num_sessions < 1 and num_terminals > 0 and self.keep_terminals == True:
252252
elif num_sessions < 1 and num_terminals > 0 and self.keep_terminals:
253253
self.log.info("keep terminals flag is True. Not killing the terminals.")
254254
pass
255-
255+
256256
elif (
257257
# num_sessions < 1 and num_terminals > 0 and self.keep_terminals == False
258258
num_sessions < 1
@@ -262,25 +262,29 @@ async def idle_checks(self):
262262
self.log.info("keep terminals flag: " + str(self.keep_terminals))
263263
# Wait for the inservice app
264264
self.log.info("New inservice app found : " + str(app_name))
265-
265+
266266
# Check if the current app is part of the in service apps
267267
if app_name not in inservice_apps:
268268
# Register a new inservice app
269269
inservice_apps[app_name] = time.time()
270-
270+
271271
else:
272272
if int(time.time() - inservice_apps[app_name]) > self.idle_time:
273273
self.log.info(
274274
"Keepalive time for terminal reached : " + str(app_name)
275275
)
276276
await self.delete_application(app_name)
277-
277+
278278
elif num_sessions > 0:
279279
# let's check if we have idle notebooks to kill
280280
nb_deleted = 0
281281
for notebook in app["sessions"]:
282282
if self.check_notebook(notebook):
283-
await self.delete_session(notebook)
284-
nb_deleted += 1
283+
# handle kernel sessions which are stuck in "starting" state
284+
if notebook["kernel"]["execution_state"] == "starting":
285+
nb_deleted += 1
286+
else:
287+
await self.delete_session(notebook)
288+
nb_deleted += 1
285289
if num_sessions == nb_deleted and (not self.keep_terminals or num_terminals == 0):
286290
await self.delete_application(app_name)

0 commit comments

Comments
 (0)