-
Notifications
You must be signed in to change notification settings - Fork 0
/
rebootinstances.py
220 lines (190 loc) · 6.72 KB
/
rebootinstances.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
import json, boto3, time, os
from datetime import datetime, timedelta
#############################################################
# Greg Heywood #
# v 1.2 #
#############################################################
# Run on a schedule, and set the maximum execution time! #
# Set the variables in the Lambda configuration #
# - You may need to look at the metric dimensions..... #
# depending on how they are configured. #
#############################################################
# The script will:
# Get the target groups (main and port 80)
# Get the instances registered in the main target group
# Check the number of healthy instances. Stop if either target group has 5 or fewer
# For each instance, check the CloudWatch memory metric
# Pick the instance with the highest RAM usage
# If it is higher than 65%, remove it from the target groups and reboot it
# Add it back in to the target groups
# 1.2 moved AMI check to the instance loop.
# AWS SDK handles, created once when the module is imported.
# lambda_handler uses elbv2, ec2c and cloudwatch; the ec2 resource handle is
# not referenced by lambda_handler.
elbv2 = boto3.client(service_name="elbv2")
ec2 = boto3.resource(service_name="ec2")
ec2c = boto3.client(service_name="ec2")
cloudwatch = boto3.client(service_name="cloudwatch")
# A reboot is only attempted when BOTH target groups have more than this
# many healthy hosts.
_MIN_HEALTHY_HOSTS = 5
# Average memory usage (percent) above which the busiest instance is rebooted.
_MEMORY_THRESHOLD_PERCENT = 65
# Seconds to wait after issuing the reboot before re-registering the instance.
_REBOOT_WAIT_SECONDS = 120
# Used only if the target group does not report a deregistration delay;
# 300 seconds is the documented ELB default for this attribute.
_DEFAULT_DEREGISTRATION_DELAY = 300


def _target_group_arn(name):
    """Return the ARN of the first target group returned for *name*."""
    response = elbv2.describe_target_groups(Names=[name])
    return response['TargetGroups'][0]['TargetGroupArn']


def _deregistration_delay(target_group_arn):
    """Return the target group's deregistration delay in whole seconds.

    The attribute is located by its key instead of by its position in the
    returned list, so a change in attribute order (or count) cannot make
    the handler sleep for an unrelated value.
    """
    response = elbv2.describe_target_group_attributes(
        TargetGroupArn=target_group_arn
    )
    for attribute in response['Attributes']:
        if attribute['Key'] == 'deregistration_delay.timeout_seconds':
            return int(attribute['Value'])
    return _DEFAULT_DEREGISTRATION_DELAY


def _target_health(target_group_arn):
    """Return ``(target_ids, healthy_count)`` for a target group.

    ``target_ids`` lists every target in the group regardless of state;
    ``healthy_count`` counts only targets whose state is ``'healthy'``.
    """
    response = elbv2.describe_target_health(TargetGroupArn=target_group_arn)
    descriptions = response['TargetHealthDescriptions']
    target_ids = [d['Target']['Id'] for d in descriptions]
    healthy_count = sum(
        1 for d in descriptions if d['TargetHealth']['State'] == 'healthy'
    )
    return target_ids, healthy_count


def _average_memory(instance_id, auto_scaling_group):
    """Return the instance's average memory usage over the last 5 minutes.

    Reads the CWAgent metric 'Memory % Committed Bytes In Use'. The metric
    is published with ImageId and InstanceType dimensions, so those are
    fetched from the instance first.

    Returns None when CloudWatch has no datapoint for the window.
    """
    response = ec2c.describe_instances(InstanceIds=[instance_id])
    instance = response['Reservations'][0]['Instances'][0]

    # Take "now" once so StartTime and EndTime describe exactly one period.
    end_time = datetime.utcnow()
    response = cloudwatch.get_metric_statistics(
        Namespace='CWAgent',
        MetricName='Memory % Committed Bytes In Use',
        Dimensions=[
            {"Name": "InstanceId", "Value": instance_id},
            {"Name": "AutoScalingGroupName", "Value": auto_scaling_group},
            {"Name": "ImageId", "Value": instance['ImageId']},
            {"Name": "objectname", "Value": "Memory"},
            {"Name": "InstanceType", "Value": instance['InstanceType']},
        ],
        StartTime=end_time - timedelta(seconds=300),
        EndTime=end_time,
        Period=300,
        Statistics=['Average'],
    )
    datapoints = response['Datapoints']
    if not datapoints:
        return None
    # Datapoints are not returned in chronological order, so pick the most
    # recent one explicitly instead of trusting index 0.
    latest = max(datapoints, key=lambda point: point['Timestamp'])
    return latest['Average']


def lambda_handler(event, context):
    """Reboot the target-group instance with the highest memory usage.

    Steps:
      1. Resolve the main (port 443) and port 80 target groups.
      2. Stop unless both groups have more than 5 healthy hosts.
      3. Read the average memory usage of every instance in the main group.
      4. If the busiest instance is above 65%, deregister it from both
         groups, wait for the deregistration delay, reboot it, wait 120
         seconds and register it again.

    Args:
        event: Lambda event payload (not used).
        context: Lambda context object (not used).

    Returns:
        None.
    """
    # NOTE(review): the file header says these should come from the Lambda
    # configuration; they are still hard-coded here - confirm intent.
    target_group_name = 'tg1'
    target_group_name_80 = 'tg1-80'
    auto_scaling_group = 'autoScalingGroup'

    main_arn = _target_group_arn(target_group_name)
    port80_arn = _target_group_arn(target_group_name_80)

    # Deregistration delay of the main target group, in seconds.
    deregistration_delay = _deregistration_delay(main_arn)

    # Healthy-host count of the main target group; its members are also the
    # candidates for a reboot.
    instance_ids, healthy_count = _target_health(main_arn)
    print(str(healthy_count) + " healthy hosts")

    # Healthy-host count of the port 80 target group.
    _, healthy_count_80 = _target_health(port80_arn)
    print(str(healthy_count_80) + " healthy hosts")

    if (healthy_count <= _MIN_HEALTHY_HOSTS
            or healthy_count_80 <= _MIN_HEALTHY_HOSTS):
        print(str(healthy_count) + ' healthy hosts in 443 TG.')
        print(str(healthy_count_80) + ' healthy hosts in port 80. Terminating.')
        return

    # Map of instance ID -> average memory usage (percent).
    instance_mem = {}
    for candidate in instance_ids:
        average = _average_memory(candidate, auto_scaling_group)
        if average is None:
            # No metric in the window (e.g. agent not reporting); skip the
            # instance instead of failing the whole run.
            print(candidate, ": no memory datapoints, skipping")
            continue
        print(candidate, ":", average)
        instance_mem[candidate] = average

    if not instance_mem:
        print('No memory datapoints for any instance. Terminating.')
        return

    # Find the instance with the highest used RAM.
    instance_id = max(instance_mem, key=instance_mem.get)
    print('Instance with highest RAM load: ' + instance_id)
    print('Memory used: ' + str(instance_mem[instance_id]))
    if not instance_mem[instance_id] > _MEMORY_THRESHOLD_PERCENT:
        return

    print('Memory too high, going to reboot.')
    # (target group ARN, port) pairs the instance is registered under.
    registrations = ((main_arn, 443), (port80_arn, 80))
    try:
        # Remove the instance from the TGs.
        print('Removing from the target groups.')
        for arn, port in registrations:
            response = elbv2.deregister_targets(
                TargetGroupArn=arn,
                Targets=[{'Id': instance_id, 'Port': port}],
            )
            print(response)

        # Wait for the instance to deregister.
        print('Waiting for degregistration from TG...')
        print('Target group delay: ', deregistration_delay)
        time.sleep(deregistration_delay)

        # Reboot it.
        print('Performing reboot')
        response = ec2c.reboot_instances(
            InstanceIds=[instance_id],
            DryRun=False,
        )
        print(response)

        # Wait for the instance to start.
        print('Waiting before placing the instance back in the TG...')
        time.sleep(_REBOOT_WAIT_SECONDS)
    finally:
        # Always put the instance back, even if deregistration or the reboot
        # failed part-way, so an error cannot leave it out of rotation.
        # Registering a target that is still registered is harmless.
        print('Adding ' + instance_id + ' to the TG')
        for arn, port in registrations:
            response = elbv2.register_targets(
                TargetGroupArn=arn,
                Targets=[{'Id': instance_id, 'Port': port}],
            )
            print(response)