-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathget_server_hardware_sh
executable file
·2135 lines (1992 loc) · 103 KB
/
get_server_hardware_sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/bin/bash
#****************************************************************#
# ScriptName: get_server_hardware.sh
# Author: [email protected]
# Create Date: 2014-01-01 17:09
# Function: check server hardware
#***************************************************************#
#v1.0;2012-07-15;created by longjiang; amalgamate B2B Aliyun Taobao's check_hw; and add some functions like light LED, memcheck, support idcfree and dragoon.
#2012-07-31; modified by zhilin.lkw; remove tools installation functions.
#2012-08-28; modification completed by zhilin.lkw; testing started.
# Fixed PATH and locale so command lookup and tool output do not depend on the
# caller's environment.
export PATH="/sbin:/bin:/usr/sbin:/usr/bin:/usr/local/sbin:/usr/local/bin"
export LANG="en_US.UTF-8"
# Script version; written to the runtime log by pub_exit.
VER="v59"
# COMMAND MACROS
# Choose the yum invocation from the release string in /etc/redhat-release:
# a specific python interpreter is used for 6.x, 5.x and "release 4", plain
# /usr/bin/yum otherwise.
if grep -q "6\.[0-9]" /etc/redhat-release 2>/dev/null; then
CMD_YUM="/usr/bin/python2.6 /usr/bin/yum"
elif grep -q "5\.[0-9]" /etc/redhat-release 2>/dev/null; then
CMD_YUM="/usr/bin/python2.4 /usr/bin/yum"
elif grep -q "release 4" /etc/redhat-release 2>/dev/null; then
CMD_YUM="/usr/bin/python2.3 /usr/bin/yum"
else
CMD_YUM="/usr/bin/yum"
fi
# Absolute paths of the external tools used by the functions below.
# NOTE: CMD_SUDO is not defined here; pub_check_root assigns it.
CMD_RPM="/bin/rpm"
CMD_TAR="/bin/tar"
CMD_LN="/bin/ln"
CMD_TOUCH="/bin/touch"
CMD_SED="/bin/sed"
CMD_DMESG="/bin/dmesg"
CMD_MKDIR="/bin/mkdir"
CMD_MKNOD="/bin/mknod"
CMD_CHKCONFIG="/sbin/chkconfig"
CMD_SERVICE="/sbin/service"
CMD_IP="/sbin/ip"
CMD_IFCONFIG="/sbin/ifconfig"
CMD_LSPCI="/sbin/lspci"
CMD_MDADM="/sbin/mdadm"
CMD_MODPROBE="/sbin/modprobe"
CMD_FILE="/usr/bin/file"
CMD_IPMITOOL="/usr/bin/ipmitool"
# wget: quiet, resumable downloads saved under /tmp (-P /tmp).
CMD_WGET="/usr/bin/wget -T10 -t3 -w3 -c -q -P /tmp"
# curl: prefer /usr/bin/curl, otherwise whatever `which curl` finds.
if [ -e /usr/bin/curl ]; then
CMD_CURL="/usr/bin/curl -m 10 --connect-timeout 5 -4 -s"
else
curl_dir=`which curl`
CMD_CURL="$curl_dir -m 10 --connect-timeout 5 -4 -s"
fi
CMD_INSTALL="/usr/bin/install"
CMD_TEE="/usr/bin/tee"
CMD_DMIDECODE="/usr/sbin/dmidecode"
CMD_XM="/usr/sbin/xm"
CMD_SMARTCTL="/usr/sbin/smartctl"
CMD_LSIUTIL="/usr/local/sbin/lsiutil"
CMD_HPACUCLI="/usr/sbin/hpacucli"
CMD_HPACUCLI8="/usr/local/sbin/hpacucli-8.60-8.0/hpacucli"
# MegaCli64 when `uname -i` reports x86_64, MegaCli otherwise.
test `uname -i` = "x86_64" && CMD_MEGACLI="/opt/MegaRAID/MegaCli/MegaCli64" || CMD_MEGACLI="/opt/MegaRAID/MegaCli/MegaCli"
CMD_MEGARC="/usr/local/sbin/megarc.bin"
CMD_CFGGEN="/usr/local/sbin/cfggen"
CMD_SAS2IRCU="/usr/local/sbin/sas2ircu"
CMD_ARCCONF="/usr/local/sbin/arcconf"
CMD_ARCCONF_SUN="/usr/StorMan/arcconf"
#by luxue
CMD_HWINFO_HELP="/usr/alisys/dragoon/libexec/armory/hwinfo/helper.py"
errorfileinfo="/dev/shm/errorfileinfo.log"
# Directory prefix used by pub_check_nic to read the bonding status file.
CHECK_NIC_BONDING0="/proc/net/bonding/"
# status
# Numeric status codes. Note that debug and warning share the value 1.
ok=0
debug=1
warning=1
error=2
critical=3
unknown=4
# Running results, all starting out as $ok.
exitstatus=$ok
bbustatus=$ok
flagstatus=$ok
# message handling
msg=""
err_msg=""
# common global vars
# Numeric uid of the invoking user (0 means root; see pub_check_root).
suf="`id -u`"
# IS_VENDOR=1 in the environment also turns on IS_HWQC.
[ x"$IS_VENDOR" = "x1" ] && IS_HWQC=1
# IS_RAMOS is YES when /etc/ramos-release exists or the hostname contains
# RAMOS or NGIS, NO otherwise.
([ -e /etc/ramos-release ] || hostname | grep -qE 'RAMOS|NGIS') && IS_RAMOS=YES || IS_RAMOS=NO
# Filled in later by pub_firstrun from dmidecode.
pub_alitype=
pub_sn=
# for logging
pub_log_time=`date`
pub_start_time=`date +'%Y-%m-%d %H:%M:%S'`
pub_start_time_stamp=`date +%s`
# common URL, directories and hwinfo integration
# Default tool download URL; pub_check_download_server may replace the host.
pub_tools_server="http://yum.tbsite.net/aliyun/5Server/x86_64/server/hardware/tool"
pub_chkhw_logdir="/var/log/check_hw"
pub_lock_file="/dev/shm/check_hardware.pid"
pub_hwinfo="/etc/hwinfo/hwinfo.conf"
# Set by pub_fetch_hwinfo / callers of pub_IdcFreeCreate.
pub_disk_raw=
pub_slot=
# Print the first Armory API host that accepts a TCP connection on port 80
# (probed with nc, 2 second timeout per host); print nothing if none does.
# Candidates, in probe order: api.a.alibaba-inc.com (main-site intranet),
# a.alibaba-inc.com (development/testing), a.am.alibaba-inc.com (used outside
# the internal zone).
# Globals written: armoryApiUlr, apiUrls, i.
function getArmoryUrl(){
  armoryApiUlr=''
  apiUrls=('api.a.alibaba-inc.com' 'a.alibaba-inc.com' 'a.am.alibaba-inc.com')
  i=0
  while (( i < ${#apiUrls[@]} )); do
    if /usr/bin/nc -vzw2 ${apiUrls[$i]} 80 >/dev/null 2>&1; then
      echo ${apiUrls[$i]}
      break
    fi
    i=$((i + 1))
  done
}
# format json output
# Prints one JSON document on stdout built from the globals flagstatus,
# err_msg, exitstatus and msg. If err_msg is non-empty, flagstatus is first
# set to $debug. The bbu entry is currently commented out.
# NOTE(review): the ${var//\"/\"} expansions are presumably meant to escape
# double quotes for JSON; their exact result depends on here-document quoting
# rules -- confirm with a message that contains '"'.
function pub_json_output() {
[ -n "$err_msg" ] && flagstatus=$debug
# [ -z "$bbu_msg" ] && bbu_msg="OK"
# {"name":"raidbbu", "status":$bbustatus, "msg":"$bbu_msg"}
cat<<EOF
{
"collection_flag":$flagstatus,
"error_info":"${err_msg//\"/\"}",
"MSG":[
{"name":"general", "status":$exitstatus, "msg":"${msg//\"/\"}"}
]
}
EOF
}
# Append a "run complete" line to /dev/shm/check_hardware.runtime (dropping
# the five oldest lines once the file is longer than 500 lines), then exit
# the script with a code derived from exitstatus and flagstatus:
#   critical -> $critical, error -> $error,
#   warning or flagstatus==debug -> 1, everything ok -> 0,
#   anything else -> $unknown.
function pub_exit() {
  local runtime_log=/dev/shm/check_hardware.runtime
  local runtime_lines
  local exit_code

  runtime_lines=`wc -l ${runtime_log} 2>/dev/null |awk '{print $1}'`
  if [[ ${runtime_lines} -gt "500" ]]; then
    ${CMD_SUDO} ${CMD_SED} -i '1,5d' ${runtime_log} &>/dev/null
  fi
  echo "$pub_start_time - `date +'%Y-%m-%d %H:%M:%S'` `basename $0` run complete as uid:`id -u` with ${VER}." | ${CMD_SUDO} ${CMD_TEE} -a ${runtime_log} >/dev/null 2>&1

  # Most severe condition wins.
  exit_code=$unknown
  if [ $exitstatus -eq $critical ]; then
    exit_code=$critical
  elif [ $exitstatus -eq $error ]; then
    exit_code=$error
  elif [ $exitstatus -eq $warning -o $flagstatus -eq $debug ]; then
    exit_code=1
  elif [ $exitstatus -eq $ok -a $flagstatus -eq $ok ]; then
    exit_code=0
  fi
  exit $exit_code
}
# Append $1 to the global msg, separated from any existing text by one space.
function pub_messageAppend() {
  msg="${msg:+$msg }$1"
}
# Append "<pub_log_time>: <message>" to ${pub_chkhw_logdir}/check_hardware.log.
# Arguments: $1 - message text; nothing is written when it is empty.
# Once the log holds more than 2000 lines its five oldest lines are removed
# first. The trim runs through ${CMD_SUDO}, like the append does, so that it
# also works when the script runs as a non-root user.
function pub_logfileAppend() {
  [ -z "$1" ] && return
  local pri_mes=$1
  local pri_chkhw_log=${pub_chkhw_logdir}/check_hardware.log
  local pri_lines
  pri_lines=$(wc -l "${pri_chkhw_log}" 2>/dev/null | awk '{print $1}')
  if [[ ${pri_lines:-0} -gt 2000 ]]; then
    ${CMD_SUDO} ${CMD_SED} -i '1,5d' "${pri_chkhw_log}" &>/dev/null
  fi
  echo "$pub_log_time: $pri_mes" | ${CMD_SUDO} ${CMD_TEE} -a "${pri_chkhw_log}" >/dev/null 2>&1
}
# Send one report line to syslog through logger (facility daemon.info, tag
# HWBMC). Does nothing when IS_RAMOS is YES.
# Arguments:
#   $1 - report channel: debug, version, selfmon, raidpd, raidld, raidbbu,
#        storage, filesystem, mem
#   $2 - report keyword, one of: [ OK ], [ DEBUG ], [ CRITICAL ], [ UNKNOW ]
#   $3 - report text
function pub_messageSend() {
  if [ x"$IS_RAMOS" = "xYES" ]; then
    return
  fi
  local pri_MSG_CHN=$1
  local pri_MSG_TAG=$2
  local pri_MSG_TXT=$3
  local pri_line="checkhw.${pri_MSG_CHN} ${pri_MSG_TAG} ${pri_MSG_TXT}"
  ${CMD_SUDO} logger -p daemon.info -t HWBMC "${pri_line}" &>/dev/null
}
# Build the final summary in the global msg and remove any previous
# "Check Hardware Notice" line from /etc/motd.
# A non-empty bbu_msg is first prefixed with one space. The summary is
#   "OK - Health check passed."   when msg and bbu_msg are both empty,
#   "OK - <bbu_msg>"              when only msg is empty,
#   "Critical|Error|Warning - <msg> <bbu_msg>" otherwise, chosen by exitstatus.
# Writing a new notice into /etc/motd is currently disabled (commented out).
function pub_motdEdit() {
  local pri_level

  if [ -n "$bbu_msg" ]; then
    bbu_msg=" $bbu_msg"
  fi

  if [ -z "$msg" ]; then
    if [ -z "$bbu_msg" ]; then
      msg="OK - Health check passed."
    else
      msg="OK - $bbu_msg"
    fi
  else
    if [ $exitstatus -eq $critical ]; then
      pri_level="Critical"
    elif [ $exitstatus -eq $error ]; then
      pri_level="Error"
    else
      pri_level="Warning"
    fi
    msg="${pri_level} - ${msg} ${bbu_msg}"
  fi

  ${CMD_SUDO} ${CMD_SED} '/Check Hardware Notice/d' -i /etc/motd &>/dev/null
  #if echo $msg|grep -qE "^Critical|^Error"; then
  # echo "Check Hardware Notice:["$msg"], check it in http://idc.alibaba-inc.com/" | ${CMD_SUDO} ${CMD_TEE} -a /etc/motd >/dev/null 2>&1
  #elif echo $msg | grep -q "^Warning"; then
  # echo "Check Hardware Notice:["$msg"], not reported to IDCFree as it's not critical yet" | ${CMD_SUDO} ${CMD_TEE} -a /etc/motd >/dev/null 2>&1
  #elif [ -n "$bbu_msg" ]; then
  # bbu_msg=$(echo $bbu_msg)
  # echo "Check Hardware Notice:["$bbu_msg"]" | ${CMD_SUDO} ${CMD_TEE} -a /etc/motd >/dev/null 2>&1
  #fi
}
# Report a hardware fault to IDCFree (repairapi!create.jspa).
# Arguments:
#   $1 - faulty disk / component identifier
#   $2 - error log text (sent as "remark")
#   $3 - error type: 1 disk, 2 memory, 3 power supply, 4 motherboard,
#        5 other, 6 CPU, 7 out-of-band management, 8 SAS/RAID card,
#        9 RAID card battery, 0 LogicDrive, a mdstat
#   $4 - slot; for type 1 it is passed to pub_fetch_hwinfo so the disk's raw
#        hwinfo record (pub_disk_raw) can be attached as rawData
# Globals read:    IS_VENDOR, IS_HWQC, pub_sn, pub_slot, pub_disk_raw, CMD_CURL
# Globals written: err_msg (only when the report could not be delivered)
# Behaviour:
#   - IS_VENDOR=1: nothing is done.
#   - IS_HWQC=1: no request is sent; err_msg records the would-be report.
#   - otherwise the internal endpoint is tried first, then three fallback
#     addresses, pausing between attempts. err_msg is set and logged only
#     when every endpoint failed (the original tested the last attempt with
#     an inverted condition and flagged success as failure).
function pub_IdcFreeCreate() {
  # if scheduled by HWQC in Vendor environment.
  [ x"$IS_VENDOR" = "x1" ] && return
  local pri_error_disk=$1
  local pri_error_log=$2
  local pri_error_type=$3
  local pri_slot=$4
  local pri_raw=""
  local pri_error_slot=""
  local pri_api="repairapi"
  # select pri_api
  #[ x"$IS_HWQC" = "x1" ] && pri_api=hwqcAPI
  #[ x"$IS_IDCNEW" = "x1" ] && pri_api=xinsunAPI
  local pri_hosts=('idc.alibaba-inc.com' '42.156.166.86' '100.67.16.32' '110.75.103.51')
  # seconds to wait after a failed attempt on the host with the same index
  local pri_pause=(3 2 3)
  local pri_report pri_payload pri_idx

  if [ x"$pri_error_type" = "x1" ]; then
    if [ -n "$pri_slot" ]; then
      pub_fetch_hwinfo "$pri_slot"
    fi
    if [ -n "$pub_disk_raw" ]; then
      pri_raw="&rawData=$pub_disk_raw"
    fi
  fi
  if [ x"$pub_slot" != "x" ]; then
    pri_error_slot="&errorSlot=$pub_slot"
  fi

  pri_report="&type=$pri_error_type&errorDisk=$pri_error_disk&remark=$pri_error_log$pri_raw$pri_error_slot"

  if [ x"$IS_HWQC" = "x1" ]; then
    err_msg="Error - report to IDCFree failed: $pri_report"
    return 0
  fi

  pri_payload="userName=$pri_api&sn=$pub_sn$pri_report"
  for ((pri_idx = 0; pri_idx < ${#pri_hosts[@]}; pri_idx++)); do
    if ${CMD_CURL} -d "$pri_payload" "http://${pri_hosts[$pri_idx]}"'/repairapi!create.jspa' >/dev/null; then
      return 0
    fi
    # fall back to the next address after a short pause
    if [ -n "${pri_pause[$pri_idx]}" ]; then
      sleep "${pri_pause[$pri_idx]}"
    fi
  done

  # every endpoint failed
  err_msg="Error - report to IDCFree failed: $pri_report"
  pub_logfileAppend "$err_msg"
  return 0
}
# Query idcfree records
# Ask IDCFree (repairapi!search.jspa) how many repair records in the listed
# states exist for this machine (pub_sn) and the given error type.
# Arguments: $1 - numeric error type; anything non-numeric falls back to 1.
# Returns:
#   0      - no matching record (callers use this as "go ahead and report"),
#            or IS_VENDOR=1
#   1..255 - number of matching records, capped at 255 so that large counts
#            cannot wrap around to 0 in the exit status
#   1      - also returned when no endpoint gave a numeric answer
# The internal endpoint is tried first; the three fallback addresses are
# tried, in order, only while the reply is empty.
function pub_IdcFreeQuery() {
  [ x"$IS_VENDOR" = "x1" ] && return 0
  local pri_error_type
  pri_error_type=$(echo "$1" | grep -E "^[0-9]+$")
  [ -z "$pri_error_type" ] && pri_error_type=1
  local pri_idcfree_record=""
  local pri_idcfree_count
  local pri_host
  local pri_query="isCount=true&sn=$pub_sn&types=$pri_error_type&states=2,3,a,b,c,e,t,u,j,k,l"

  for pri_host in idc.alibaba-inc.com 42.156.166.86 100.67.16.32 110.75.103.51; do
    pri_idcfree_record=$(${CMD_CURL} -d "$pri_query" "http://${pri_host}"'/repairapi!search.jspa' 2>/dev/null)
    [ -n "$pri_idcfree_record" ] && break
  done

  # keep only a purely numeric reply line
  pri_idcfree_count=$(echo "$pri_idcfree_record" | grep -E "^[0-9]+$" | head -1)
  if [ -z "$pri_idcfree_count" ]; then
    return 1
  fi
  # drop leading zeros so the length check below is meaningful
  while [[ $pri_idcfree_count == 0?* ]]; do
    pri_idcfree_count=${pri_idcfree_count#0}
  done
  if [ ${#pri_idcfree_count} -gt 3 ] || [ "$pri_idcfree_count" -gt 255 ]; then
    return 255
  fi
  return "$pri_idcfree_count"
}
# Guard against two instances running at once.
# If $pub_lock_file holds a pid whose /proc/<pid>/cmdline contains this
# script's base name, the run is reported as a conflict (JSON output, exit
# with $unknown). Otherwise the lock file is overwritten with our own pid.
# Globals written: pri_proc_cmdline, ignoreinfo, exitstatus, err_msg.
function pub_lock() {
  local pri_lock_pid
  local pri_self

  if [ -e $pub_lock_file ] && ${CMD_SUDO} grep -qE "^[0-9]+$" $pub_lock_file; then
    pri_lock_pid=$(cat $pub_lock_file)
    pri_proc_cmdline="$(${CMD_SUDO} cat /proc/${pri_lock_pid}/cmdline 2>/dev/null)"
    pri_self=$(basename $0)
    if echo "$pri_proc_cmdline" | grep -q "${pri_self}"; then
      if [[ x"$IS_HWQC" = "x1" ]]; then
        ignoreinfo="[Ignore_duplicated_run]"
      else
        ignoreinfo=""
      fi
      exitstatus=$unknown
      err_msg="$ignoreinfo Error - pid $(cat $pub_lock_file) is running! my pid is $$; conflict: $pri_proc_cmdline"
      pub_json_output
      exit $unknown
    fi
  fi
  # take (or take over) the lock
  echo $$ | ${CMD_SUDO} ${CMD_TEE} $pub_lock_file >/dev/null 2>&1
}
# Virtual machines are not checked: report "OK - VM box pass." with status
# $ok, print the JSON result and leave through pub_exit.
function pub_vm_exit() {
  exitstatus=$ok
  msg="OK - VM box pass."
  pub_json_output
  pub_exit
}
# Decide how privileged commands are run and store the prefix in CMD_SUDO.
#   - uid 0 ($suf): CMD_SUDO is empty.
#   - otherwise /usr/bin/sudo must be executable and "sudo true" must work.
#   - if sudo cannot be used, the host is first checked for signs of being a
#     VM/container (hwinfo hypervisor type, cgroup layout, dmidecode not an
#     ELF executable); in that case pub_vm_exit is called.
#   - anything else ends the script with $unknown and an error message.
function pub_check_root() {
  # root
  if [ $suf -eq 0 ]; then
    CMD_SUDO=""
    return
  fi

  if [ -x /usr/bin/sudo ]; then
    # not root, has sudo
    CMD_SUDO="/usr/bin/sudo"
    if sudo true 2>/dev/null; then
      return
    fi
    # if no sudo priv, check if it's vm first
    if cat /etc/hwinfo/hwinfo.conf 2>/dev/null | grep -oE "system_hypervisor_type[^,]+," | grep -qE "xen-domU|container"; then
      pub_vm_exit
    fi
    if cat /proc/self/cgroup 2>/dev/null | grep -qE "^[0-9]+:.+:/.+$"; then
      pub_vm_exit
    fi
    if [ -x ${CMD_FILE} ]; then
      if ! ${CMD_FILE} ${CMD_DMIDECODE} 2>/dev/null | grep -qE "ELF.*executable"; then
        pub_vm_exit
      fi
    fi
    # if it's not VM
    err_msg="Error - sudo execution failed!"
  else
    # not root, no sudo
    err_msg="Error - sudo package is not installed!"
  fi

  exitstatus=$unknown
  pub_json_output
  exit $unknown
}
# Make sure the basic tools lspci and dmidecode are available.
# A missing tool is installed with "yum -y install <package>" (pciutils /
# dmidecode); the original call omitted the "install" subcommand, so yum could
# never install anything. If either tool is still missing afterwards the
# script reports the problem and exits with $unknown.
function pub_basetool() {
  if ! test -x ${CMD_LSPCI}; then
    ${CMD_SUDO} ${CMD_YUM} -y install pciutils &>/dev/null
  fi
  if ! test -x ${CMD_DMIDECODE}; then
    ${CMD_SUDO} ${CMD_YUM} -y install dmidecode &>/dev/null
  fi
  if [ ! -x ${CMD_LSPCI} ] || [ ! -x ${CMD_DMIDECODE} ]; then
    err_msg="Error - dmidecode or lspci does not exist!"
    pub_messageSend "debug" "[ DEBUG ]" "dmidecode or lspci does not exist!"
    exitstatus=$unknown
    pub_json_output
    exit $unknown
  fi
}
# we don't check VM issues.
# touch a file named /etc/HOSTISVM if VM
# NOTE(review): nothing in this function touches /etc/HOSTISVM; the line
# above looks stale.
#
# Runs a sequence of probes, in this order: hwinfo hypervisor type,
# /proc/xen/capabilities, container markers in /proc/1/environ combined with
# the cgroup layout, the file type of the dmidecode binary, the kernel
# command line, and finally the dmidecode output. A probe that identifies a
# guest or container calls pub_vm_exit; a probe that identifies a Xen dom0
# returns immediately. The function returns normally when no probe fires.
# Globals written: hyper_type, LXC_flag, pri_result.
function pub_isVirtualHost() {
# check hwinfo
hyper_type=`cat /etc/hwinfo/hwinfo.conf 2>/dev/null | grep -oE "system_hypervisor_type[^,]+,"`
echo "$hyper_type" | grep -qiE "xen-domu|container|xen-hvm|qemu|vmware|virtualbox|linux_vserver" && pub_vm_exit
#echo "$hyper_type" | grep -qwiE "xen|kvm" && pub_vm_exit
# hwinfo says Xen dom0: treat as a physical host and stop probing
echo "$hyper_type" | grep -qE "xen xen-dom0" && return
#echo "$hyper_type" | grep -qE "xen-domU|container" && pub_vm_exit
# 'control_d' if dom0, empty if domU
if [ -f /proc/xen/capabilities ]; then
${CMD_SUDO} grep -iq 'control_d' /proc/xen/capabilities && return || pub_vm_exit
fi
# LXC container check
LXC_flag=""
if [ -e /proc/1/environ ]; then
${CMD_SUDO} grep -iqE 'lxc|container=' /proc/1/environ
if [ $? -eq 0 ]; then
LXC_flag="yes"
fi
fi
#cat /proc/self/cgroup 2>/dev/null | grep -qE "^[0-9]+:.+:/.+$" && pub_vm_exit
# only exit as a VM when both the cgroup path pattern matches and the LXC
# marker was found in /proc/1/environ
cat /proc/self/cgroup 2>/dev/null | grep -qE "^[0-9]+:.+:/.+$" && [ -n "$LXC_flag" ] && pub_vm_exit
# script or text file in some guest
if [ -x ${CMD_FILE} ]; then
${CMD_FILE} ${CMD_DMIDECODE} 2>/dev/null | grep -qE "ELF.*executable" || pub_vm_exit
fi
# console=xvc0 if guest
grep -q 'console=xvc0' /proc/cmdline && pub_vm_exit
# dmidecode check
pri_result=`${CMD_SUDO} ${CMD_DMIDECODE} 2>/dev/null`
echo "$pri_result" | grep "Vendor" | grep -qi 'xen' && pub_vm_exit
echo "$pri_result" | grep "Prod" | grep -qE "KVM|VMware|Bochs" && pub_vm_exit
}
# Detect the storage controller family from loaded kernel modules (lsmod) and
# PCI devices (lspci) and store it in the global raidtype. Possible values set
# here: mpt2SAS, mptSAS, megaRAIDSCSI, megaRAIDSAS, aacraid, hpraid, unknown.
# The probes run in a fixed order and each one only assigns raidtype while it
# is still empty, so the first match wins.
# Also sets lsiutil_hang (1 when the mptsas driver version starts with 3.04)
# and updates the marker file ${pub_chkhw_logdir}/raidcard.
# Globals written: raidtype, lsmod, lspci, lsiutil_hang, pri_raidcard.
function pub_raid_type() {
raidtype=""
lsmod=`/sbin/lsmod`
lspci=`${CMD_SUDO} ${CMD_LSPCI} 2>/dev/null`
# mptSAS/mpt2SAS
echo $lsmod | egrep -qw "mptsas|mptbase|mpt2sas"
if [ $? -eq 0 ] && [ -z $raidtype ]; then
if echo $lsmod | egrep -qw "mpt2sas" && echo $lspci | grep -q "SAS2"; then
raidtype="mpt2SAS"
elif echo $lsmod | egrep -qw "mptsas" && echo $lspci | grep -q "SAS1" ||
echo $lsmod | egrep -qw "megaraid_sas,mptsas"; then
raidtype="mptSAS"
fi
# create the mptctl device node and load the module when the node is missing
if ! test -c /dev/mptctl; then
${CMD_SUDO} ${CMD_MKNOD} /dev/mptctl c 10 220
${CMD_SUDO} ${CMD_MODPROBE} mptctl &>/dev/null
fi
fi
# MegaRAID SCSI
echo $lsmod | egrep -qw "megaraid_mbox|megaraid2"
if [ $? -eq 0 ] && [ -z $raidtype ]; then
raidtype="megaRAIDSCSI"
fi
# MegaRAID SAS
echo $lsmod | egrep -qw "megaraid_sas"
if [ $? -eq 0 ] && [ -z $raidtype ]; then
raidtype="megaRAIDSAS"
fi
# aacRAID
echo $lsmod | egrep -qw "aacraid"
if [ $? -eq 0 ] && [ -z $raidtype ]; then
raidtype="aacraid"
fi
# HP RAID
echo "$lspci" | grep -iE "RAID|SCSI|SAS|SATA" | grep -q "Hewlett-Packard" && echo $lsmod | grep -qE "cciss|hpsa"
if [ $? -eq 0 ] && [ -z $raidtype ]; then
raidtype="hpraid"
fi
# MegaRAID SAS
# second MegaRAID SAS probe, this time based on the lspci device description
echo "$lspci" | grep -qE "MegaRAID|Dell PowerEdge Expandable RAID controller|MegaRAID SAS"
if [ $? -eq 0 ] && [ -z $raidtype ]; then
raidtype="megaRAIDSAS"
fi
if [ -z $raidtype ]; then
raidtype="unknown"
# echo "this host raid is unknown raid"
fi
lsiutil_hang=0
pri_raidcard=${pub_chkhw_logdir}/raidcard
if [ "$raidtype" = "mptSAS" ] || [ "$raidtype" = "mpt2SAS" ]; then
# mptsas driver version 3.04 can cause a kernel panic while lsiutil, cfggen, etc. are running
if cat /sys/module/mptsas/version 2>/dev/null | grep -qE "^3\.04"; then
lsiutil_hang=1
fi
fi
# aacraid/hpraid: write "raidcard" into the marker file; any type other than
# megaRAIDSAS: blank an existing marker file; megaRAIDSAS: leave it untouched
if [ "$raidtype" = "aacraid" -o "$raidtype" = "hpraid" ]; then
echo "raidcard" | ${CMD_SUDO} ${CMD_TEE} ${pri_raidcard} &>/dev/null
elif [ "$raidtype" != "megaRAIDSAS" ]; then
[ -e ${pri_raidcard} ] && echo "" | ${CMD_SUDO} ${CMD_TEE} ${pri_raidcard} &>/dev/null
fi
}
# Install missing tools once per boot.
# The marker /dev/shm/check_hardware.install short-circuits the function; it
# is created only after pub_check_hw_install succeeded. On every run without
# the marker the BMC SEL clock is also set to the current time (in the
# background, via ipmitool).
function pub_tools_install() {
  if [ -e /dev/shm/check_hardware.install ]; then
    return 0
  fi
  pub_check_download_server
  pub_check_hw_install && ${CMD_SUDO} ${CMD_TOUCH} /dev/shm/check_hardware.install
  ${CMD_SUDO} ${CMD_IPMITOOL} sel time set "$(date +%m/%d/%Y\ %H:%M:%S)" >/dev/null 2>&1 &
}
# Pick the first download server that answers a ping (2 second deadline) and
# export pub_tools_server accordingly. Hosts whose name contains ".cn<N>" or
# ".l2cn<N>" try yum.corp.taobao.com first; all others start with
# yum.tbsite.net, which is also the fallback when nothing answers.
# Globals written: ips, download_server, ip, pub_tools_server (exported).
function pub_check_download_server() {
  # get reachable download_server ip
  ips="yum.tbsite.net 172.24.102.213 yum.corp.taobao.com 42.120.195.193"
  if hostname | grep -qE "\.cn[0-9]+|\.l2cn[0-9]+"; then
    ips="yum.corp.taobao.com yum.tbsite.net 172.24.102.213 42.120.195.193"
  fi

  download_server=yum.tbsite.net
  for ip in $ips; do
    ping -w 2 -q $ip >/dev/null 2>&1 || continue
    download_server=$ip
    break
  done

  export pub_tools_server="http://${download_server}/aliyun/5Server/x86_64/server/hardware/tool"
}
# Installation entry point: removes the outdated tops-checkhardware package,
# installs the common monitoring packages (tops-hdmontools, mcelog, IPMI,
# smartctl) and then the management tool(s) matching $raidtype.
# For a known RAID type the result of the main tool installation is stored in
# the global exitstatus and returned; for "unknown" the current exitstatus is
# returned; any other value returns 0.
# Globals written: smartctltool, exitstatus.
function pub_check_hw_install() {
  # remove out-dated tops-checkhardware
  if ${CMD_RPM} -q --quiet tops-checkhardware &>/dev/null; then
    ${CMD_SUDO} ${CMD_RPM} -e tops-checkhardware &>/dev/null
  fi
  # install hdmontools
  if ! ${CMD_RPM} -q --quiet tops-hdmontools &>/dev/null; then
    ${CMD_SUDO} ${CMD_YUM} -y install tops-hdmontools &>/dev/null
  fi
  # install mce/ipmi
  if ! ${CMD_RPM} -q --quiet mcelog &>/dev/null; then
    ${CMD_SUDO} ${CMD_YUM} -y install mcelog &>/dev/null
  fi
  pub_install_ipmi
  # install smartctl
  if [ ! -x ${CMD_SMARTCTL} ]; then
    ${CMD_SUDO} ${CMD_YUM} -y install smartmontools &>/dev/null
  fi
  if uname -i | grep -q 64; then
    smartctltool='smartctl'
  else
    smartctltool='smartctl32'
  fi
  pub_install "$smartctltool" "${CMD_SMARTCTL}"

  # install raid tools
  case "$raidtype" in
    megaRAIDSAS)
      pub_install_megacli
      exitstatus=$?
      pub_install_lsiutil
      return $exitstatus
      ;;
    hpraid)
      pub_install_hpacucli
      exitstatus=$?
      return $exitstatus
      ;;
    megaRAIDSCSI)
      pub_install_megarc
      exitstatus=$?
      return $exitstatus
      ;;
    mptSAS)
      pub_install "cfggen" "${CMD_CFGGEN}"
      exitstatus=$?
      pub_install_lsiutil
      return $exitstatus
      ;;
    aacraid)
      pub_install_arcconf
      exitstatus=$?
      return $exitstatus
      ;;
    mpt2SAS)
      pub_install "sas2ircu" "${CMD_SAS2IRCU}"
      exitstatus=$?
      pub_install_lsiutil
      return $exitstatus
      ;;
    unknown)
      return $exitstatus
      ;;
  esac
  return 0
}
# Install and start IPMI support.
#   1. Install OpenIPMI when rpm does not know it.
#   2. With /etc/init.d/ipmi present: enable the service and start it in the
#      background unless it already runs or an ipmi process sits in D state;
#      without the init script: record an installation failure message.
#   3. Install ipmitool (release 6.x) or OpenIPMI-tools (other releases).
# Returns 1 when ${CMD_IPMITOOL} is still not executable afterwards.
function pub_install_ipmi() {
  local pri_tools_pkg

  if ! ${CMD_RPM} --quiet -q OpenIPMI; then
    ${CMD_SUDO} ${CMD_YUM} -y install OpenIPMI &>/dev/null
  fi

  if [ ! -e /etc/init.d/ipmi ]; then
    pub_messageAppend "OpenIPMI service installation failed!"
    pub_messageSend "debug" "[ DEBUG ]" "$msg"
  else
    ${CMD_SUDO} ${CMD_CHKCONFIG} ipmi on &>/dev/null
    # TODO: OpenIPMI service hang (an ipmi process in D state is left alone)
    if ! ps ax 2>/dev/null|awk '$3~/D/'|grep -q ipmi; then
      if ! /etc/init.d/ipmi status &>/dev/null; then
        ${CMD_SUDO} ${CMD_SERVICE} ipmi start &>/dev/null &
      fi
    fi
  fi

  pri_tools_pkg=OpenIPMI-tools
  if grep -q "6\.[0-9]" /etc/redhat-release 2>/dev/null; then
    pri_tools_pkg=ipmitool
  fi
  if ! ${CMD_RPM} --quiet -q ${pri_tools_pkg}; then
    ${CMD_SUDO} ${CMD_YUM} -y install ${pri_tools_pkg} &>/dev/null
  fi

  if [ ! -x ${CMD_IPMITOOL} ]; then
    pub_messageAppend "ipmitool installation failed!"
    pub_messageSend "debug" "[ DEBUG ]" "$msg"
    return 1
  fi
}
# Download a single tool from ${pub_tools_server} into /tmp and install it.
# Arguments:
#   $1 - file name on the download server
#   $2 - destination path of the installed executable
# Returns 0 when the destination is (already or now) executable, 1 when the
# download or the installation failed; a failure message is appended to msg.
# Globals written: pri_tool_download, pri_tool_name.
function pub_install() {
  pri_tool_download=$1
  pri_tool_name=$2

  # already there: nothing to do
  if [ -x ${pri_tool_name} ]; then
    return 0
  fi

  if ! ${CMD_WGET} ${pub_tools_server}/${pri_tool_download} ; then
    pub_messageAppend "wget FAIL, ${pri_tool_name} installation failed."
    pub_messageSend "debug" "[ DEBUG ]" "$msg"
    return 1
  fi
  ${CMD_SUDO} ${CMD_INSTALL} -g root -o root -m 755 /tmp/${pri_tool_download} ${pri_tool_name}

  if [ ! -x ${pri_tool_name} ]; then
    pub_messageAppend "${pri_tool_name} installation failed."
    pub_messageSend "debug" "[ DEBUG ]" "$msg"
    return 1
  fi
}
# Make sure MegaCli is installed in an accepted version.
# An existing binary that does not report version 8.02.16 or 8.04.07 is
# removed and replaced by MegaCli-8.04.07-1; a missing binary is installed.
# Returns 1 (and appends a message) when ${CMD_MEGACLI} is not executable in
# the end.
function pub_install_megacli() {
  local megacli_installed=0

  if [ ! -x ${CMD_MEGACLI} ]; then
    ${CMD_SUDO} ${CMD_YUM} -y install MegaCli-8.04.07-1 &>/dev/null
  elif ! ${CMD_SUDO} ${CMD_MEGACLI} -v -NoLog 2>/dev/null | grep -qE "8.02.16|8.04.07"; then
    # wrong version: replace the package
    ${CMD_SUDO} ${CMD_RPM} -e MegaCli &>/dev/null
    ${CMD_SUDO} ${CMD_YUM} -y install MegaCli-8.04.07-1 &>/dev/null
  fi

  if [ -x ${CMD_MEGACLI} ]; then
    return 0
  fi
  pub_messageAppend "MegaCli installation failed!"
  pub_messageSend "debug" "[ DEBUG ]" "$msg"
  return 1
}
# Make sure lsiutil is installed and usable.
# The existing binary counts as installed only when "lsiutil -b" succeeds and
# reports "Version 1.63" or "Version 1.67". Otherwise the matching build
# (lsiutil on x86_64, lsiutil32 elsewhere) is downloaded to /tmp and
# installed to ${CMD_LSIUTIL}.
# Returns 1 (and appends a message) when the download or installation failed.
function pub_install_lsiutil() {
  local lsiutil_installed=0
  local pri_cmd

  if [ -x ${CMD_LSIUTIL} ]; then
    if ${CMD_SUDO} ${CMD_LSIUTIL} -b &>/dev/null; then
      lsiutil_installed=1
    fi
    if ! ${CMD_SUDO} ${CMD_LSIUTIL} -b 2>/dev/null | grep -qE "Version 1.63|Version 1.67"; then
      lsiutil_installed=0
    fi
  fi

  if [ $lsiutil_installed -ne 0 ]; then
    return 0
  fi

  if test `uname -i` = "x86_64"; then
    pri_cmd=lsiutil
  else
    pri_cmd=lsiutil32
  fi
  # drop any stale download first
  rm -f /tmp/${pri_cmd} &>/dev/null
  if ! ${CMD_WGET} ${pub_tools_server}/${pri_cmd} ; then
    pub_messageAppend "wget FAIL, lsiutil installation failed."
    pub_messageSend "debug" "[ DEBUG ]" "$msg"
    return 1
  fi
  ${CMD_SUDO} ${CMD_INSTALL} -g root -o root -m 755 /tmp/${pri_cmd} ${CMD_LSIUTIL}

  if [ ! -x ${CMD_LSIUTIL} ]; then
    pub_messageAppend "lsiutil installation failed!"
    pub_messageSend "debug" "[ DEBUG ]" "$msg"
    return 1
  fi
}
# TODO: use hpssacli
# Make sure hpacucli is installed in the version matching the server model:
# "Version: 8" on DL185 G5 (installed from the downloaded
# hpacucli-8.60-8.0.noarch.rpm), "Version: 9" everywhere else (hpacucli-9.10
# via yum). Nothing is done when the right version is already present.
# Returns 1 on DL185 G5 when the download or local install failed; on other
# models the status of the yum call is returned.
function pub_install_hpacucli() {
  local pri_major=9

  if echo $pub_model | grep -q "DL185 G5"; then
    pri_major=8
  fi

  # correct version already installed
  if ${CMD_SUDO} [ -x ${CMD_HPACUCLI} ] && ${CMD_SUDO} ${CMD_HPACUCLI} version 2>/dev/null | grep -q "Version: ${pri_major}"; then
    return 0
  fi

  ${CMD_SUDO} ${CMD_RPM} -e hpacucli --nodeps &>/dev/null

  if [ "$pri_major" = "9" ]; then
    ${CMD_SUDO} ${CMD_YUM} -y install hpacucli-9.10 &>/dev/null
    return
  fi

  # FIXME: use yum instead
  if ! ${CMD_WGET} ${pub_tools_server}/hpacucli-8.60-8.0.noarch.rpm ; then
    pub_messageAppend "wget hpacucli-8.60-8.0.noarch.rpm failed."
    pub_messageSend "debug" "[ DEBUG ]" "$msg"
    return 1
  fi
  if ! ${CMD_SUDO} ${CMD_YUM} localinstall -y /tmp/hpacucli-8.60-8.0.noarch.rpm &>/dev/null; then
    pub_messageAppend "hpacucli-8.60-8.0.noarch.rpm installation failed."
    pub_messageSend "debug" "[ DEBUG ]" "$msg"
    return 1
  fi
}
# Install megarc.bin to /usr/local/bin and, when ${CMD_MEGARC} is not
# executable, create it as a symlink to the installed binary.
function pub_install_megarc() {
  local pri_megarc_bin="/usr/local/bin/megarc.bin"
  pub_install "megarc.bin" "${pri_megarc_bin}"
  if [ ! -x ${CMD_MEGARC} ]; then
    ${CMD_SUDO} ${CMD_LN} -s "${pri_megarc_bin}" "${CMD_MEGARC}"
  fi
}
# Install arcconf together with the 32-bit compat-libstdc++-33 package it
# needs (the .i686 build on release 6.x, the .i386 build otherwise).
# Returns the status of the arcconf installation (pub_install).
function pub_install_arcconf() {
  local pri_compat_pkg=compat-libstdc++-33.i386
  if grep -Eq "6\.[0-9]" /etc/redhat-release 2>/dev/null; then
    pri_compat_pkg=compat-libstdc++-33.i686
  fi
  ${CMD_RPM} --quiet -q ${pri_compat_pkg} || ${CMD_SUDO} ${CMD_YUM} -y install ${pri_compat_pkg} &>/dev/null
  pub_install "arcconf" "${CMD_ARCCONF}"
}
# One-time preparation at the start of a run:
#   - touch /dev/shm/NOT_RUN_DRAGOON_CHECK_HARDWARE,
#   - create the log directory ${pub_chkhw_logdir} when missing,
#   - export pub_sn, pub_alitype, pub_model and pub_vendor from dmidecode
#     (last output line of each query, whitespace-normalised).
function pub_firstrun() {
  local pri_raw

  ${CMD_SUDO} ${CMD_TOUCH} /dev/shm/NOT_RUN_DRAGOON_CHECK_HARDWARE >/dev/null 2>&1
  if ! test -d ${pub_chkhw_logdir}; then
    ${CMD_SUDO} ${CMD_MKDIR} -p ${pub_chkhw_logdir} 2>/dev/null
  fi

  pri_raw=$(${CMD_SUDO} ${CMD_DMIDECODE} -s system-serial-number 2>/dev/null | tail -1)
  export pub_sn=$(echo $pri_raw)
  pri_raw=$(${CMD_SUDO} ${CMD_DMIDECODE} -s chassis-version 2>/dev/null | tail -1)
  export pub_alitype=$(echo $pri_raw)
  pri_raw=$(${CMD_SUDO} ${CMD_DMIDECODE} -s system-product-name 2>/dev/null | tail -1)
  export pub_model=$(echo $pri_raw)
  pri_raw=$(${CMD_SUDO} ${CMD_DMIDECODE} -s system-manufacturer 2>/dev/null | tail -1)
  export pub_vendor=$(echo $pri_raw)
}
# Run "sas2ircu LIST" in the background and give it 10 seconds to finish.
# Returns 0 when no sas2ircu child of this shell is left running, 127 when it
# had to be killed after the timeout.
# The kill uses pkill with the same "-P $$ sas2ircu" selection as the pgrep
# poll; the previous "killall ... -P $$" passed an option killall does not
# have, so a hung tool was never terminated.
# NOTE(review): when CMD_SUDO is set, the direct child of this shell is sudo
# rather than sas2ircu, so the pgrep/pkill parent filter may not match --
# confirm on a non-root run.
# Globals written: waitfor.
function pub_sas2ircu_timeout() {
  waitfor=10
  ${CMD_SUDO} ${CMD_SAS2IRCU} LIST >/dev/null 2>&1 &
  while pgrep -P $$ sas2ircu >/dev/null ; do
    sleep 1
    waitfor=$((waitfor - 1))
    if [ $waitfor -le 0 ]; then
      ${CMD_SUDO} pkill -9 -P $$ sas2ircu >/dev/null 2>&1
      return 127
    fi
  done
  return 0
}
# Succeeds (returns 0) when /home/admin/badnode_monitor/huatuo_version exists
# or when the marker /dev/shm/DISK_NOT_RUN_CHECK_HARDWARE exists; returns 1
# otherwise.
function pub_has_huatuo() {
  ${CMD_SUDO} [ -e /home/admin/badnode_monitor/huatuo_version ] && return 0
  [ -e /dev/shm/DISK_NOT_RUN_CHECK_HARDWARE ] && return 0
  return 1
}
# Succeeds (returns 0) when the running kernel release is exactly
# "2.6.32.36xen" or contains "houyi"; returns 1 otherwise.
function pub_is_ecs_host() {
  local pri_release
  pri_release=$(uname -r)
  case "$pri_release" in
    2.6.32.36xen|*houyi*)
      return 0
      ;;
  esac
  return 1
}
function pub_fetch_hwinfo() {
local pri_diskinfos=
local pri_diskinfo_inuse=""
local pri_diskinfo_unknown=""
local pri_slot=$1
if [[ -e $pub_hwinfo ]]; then
pri_diskinfos=`grep -oE "\{[^{}]+slot[^{}]+$pri_slot[^0-9]+[^{}]+}" $pub_hwinfo`
pri_diskinfo_inuse=`echo "$pri_diskinfos" | grep "inuse" | head -1`
pri_diskinfo_unknown=`echo "$pri_diskinfos" | grep "unknown" | head -1`
if [[ -n "$pri_diskinfo_inuse" ]]; then # disk is available
pub_disk_raw="$pri_diskinfo_inuse"
elif [[ -n "$pri_diskinfo_unknown" ]]; then # disk was available
pub_disk_raw="$pri_diskinfo_unknown"
else
pub_disk_raw=""
return
fi
fi
}
#######################################################################
######################## CHECK FUNCTIONS ###############################
########################################################################
# Memory health check
# Compares the memory the running system sees (in GB) with the sum of the
# module sizes reported by "dmidecode -t memory". When dmidecode reports more
# than the system by at least the size of one installed module, a memory error
# is recorded (exitstatus=$error, message appended, IDCFree report of type 2
# unless pub_IdcFreeQuery finds an existing record, syslog message).
# On ECS hosts (pub_is_ecs_host) selected machine types are additionally
# compared against a fixed baseline of 128G or 256G.
# The check is skipped in several situations, see the inline comments.
# NOTE(review): only dmidecode "Size" lines given in MB are summed; sizes
# printed in another unit would be ignored -- confirm against the dmidecode
# version in use.
# Globals written: pri_reserved, pri_hpages, pri_mems, pri_diff1, exitstatus.
function pub_check_memoryHealth () {
local pri_Memtotal
local pri_MemtotalG
local pri_Dmitotal
local pri_DmitotalG
local pri_i
local pri_size
local pri_dif
local pri_is_xenServer
pri_is_xenServer=`cat /sys/hypervisor/type 2>/dev/null`
# don't check if mem=XXXM in /proc/cmdline
# (only when /sys/hypervisor/type is empty or missing)
if grep -q 'mem=' /proc/cmdline; then
if [ -z "$pri_is_xenServer" ]; then
return
fi
fi
#pri_is_xenServer=`cat /sys/hypervisor/type 2>/dev/null`
# Check whether the memory seen by dmidecode matches meminfo (or xm info).
if uname -r | grep -iq ESX; then # ESX-i Server
pri_Memtotal=`cat /proc/meminfo | grep MachineMem | awk '{ print $2 }'`
pri_MemtotalG=`expr $pri_Memtotal \/ 1024 \/ 1024`
elif [ -n "$pri_is_xenServer" ]; then # Xen Hypervisor
if [ -x ${CMD_XM} ]; then
${CMD_SUDO} ${CMD_XM} info &>/dev/null
if [ $? -eq 0 ]; then
pri_Memtotal=`${CMD_SUDO} ${CMD_XM} 2>/dev/null info|grep "total_memory"|awk '{print $NF}'`
pri_MemtotalG=`echo "scale=3; $pri_Memtotal / 1024" | bc 2>/dev/null`
pri_MemtotalG=`echo $pri_MemtotalG |awk '{printf "%.0f",$1}'`
[ $pri_MemtotalG -eq 0 ] && return # Xen bug: total memory is sometimes reported as 0GB; skip the check in that case
else
return
fi
fi
else # physical machine that is not a Xen/ESX host: read the total memory (GB) the system currently sees from /proc/meminfo
pri_Memtotal=`cat /proc/meminfo | grep MemTotal | awk '{print $2}'`
# BIOS might reserve some memory
pri_reserved=`cat /var/log/dmesg 2>/dev/null | grep Memory | grep -oE ", [0-9]+k reserved" | awk '{ print $2 }' | sed 's/k//'`
[[ -n "$pri_reserved" ]] || pri_reserved=0
pri_Memtotal=`expr $pri_Memtotal + $pri_reserved`
pri_MemtotalG=`echo "scale=3; $pri_Memtotal / 1024 / 1024 " | bc 2>/dev/null`
[ -z "$pri_MemtotalG" ] && return # skip the check if bc is not installed
pri_MemtotalG=`echo $pri_MemtotalG |awk '{printf "%.0f",$1}'`
fi
# 32-bit userland (uname -i reports i?86): skip once 3G or more is visible
if uname -i 2>/dev/null | grep -q "i[0-9]86"; then
[ $pri_MemtotalG -ge 3 ] && return
fi
# 1G huge pages take up physical memory, but the reserved memory in dmesg includes that part.
if ([ -z "$pri_reserved" ] || [ $pri_reserved -eq 0 ]) && cat /proc/cmdline | grep -qE "hugepagesz=1G hugepages=[0-9]+"; then
pri_hpages=$(cat /proc/cmdline | grep -oE "hugepagesz=1G hugepages=[0-9]+" | awk -F= '{ print $NF }')
pri_MemtotalG=`expr $pri_MemtotalG + $pri_hpages`
fi
# Get the total physical memory in GB ($DmitotalG) via dmidecode
pri_mems=$(${CMD_SUDO} ${CMD_DMIDECODE} -t memory 2>/dev/null | grep -w "\WSize" | grep -w "MB" | awk '$2!~/No/ {print $2}')
# NOTE(review): pri_Dmitotal is empty on the first iteration, so the first
# call is "expr + <size>" -- confirm this yields <size> with the expr in use.
for pri_i in $pri_mems; do
pri_Dmitotal=`expr $pri_Dmitotal + $pri_i`
done
pri_DmitotalG=`expr $pri_Dmitotal / 1024`
if [ $pri_MemtotalG = $pri_DmitotalG ]; then
return
fi
# probably script bug
if [ $pri_DmitotalG -lt $pri_MemtotalG ]; then
# exitstatus=$warning
# pub_messageAppend "Check Memory, memTotal:${pri_MemtotalG}G > dmidecode:${pri_DmitotalG}G."
# pub_messageSend "mem" "[ WARNING ]" "Check Memory,memTotal:${pri_MemtotalG}G > dmidecode:${pri_DmitotalG}G."
return
fi
pri_dif=`expr $pri_DmitotalG - $pri_MemtotalG`
for pri_size in `echo "$pri_mems" | sort | uniq`; do
pri_size=`expr $pri_size / 1024` # e.g. 2G/4G/8G modules
[[ $pri_size -ge 1 ]] || continue # at least 1G, otherwise it is not treated as a memory module
if [ $pri_dif -ge $pri_size ]; then # dmidecode reports more memory than the system by at least one module's capacity (GB), i.e. a memory module has failed
exitstatus=$error
pub_messageAppend "Memory less than $pri_DmitotalG G [IN DMIDECODE], now $pri_MemtotalG G."
pub_IdcFreeQuery 2 && pub_IdcFreeCreate "unknown" "Memory less than $pri_DmitotalG G [IN DMIDECODE], now $pri_MemtotalG G." "2"
pub_messageSend "mem" "[ ERROR ]" "Memory less than $pri_DmitotalG G [IN DMIDECODE], now $pri_MemtotalG G."
return
fi
# ECS hosts: compare the dmidecode total against a fixed per-type baseline
if pub_is_ecs_host; then
if echo $pub_alitype | grep -qiE 's10-3s|c7'; then
pri_diff1=`expr 128 - $pri_DmitotalG`
if [ $pri_diff1 -ge $pri_size ]; then
exitstatus=$error
pub_messageAppend "Memory baseline is 128G,now is $pri_DmitotalG G [IN DMIDECODE]."
pub_IdcFreeQuery 2 && pub_IdcFreeCreate "unknown" "Memory baseline is 128G,now is $pri_DmitotalG G [IN DMIDECODE]." "2"
pub_messageSend "mem" "[ ERROR ]" "Memory baseline is 128G,now is $pri_DmitotalG G [IN DMIDECODE]."
return
fi
elif echo $pub_alitype | grep -qiE 'n32|c8'; then
pri_diff1=`expr 256 - $pri_DmitotalG`
if [ $pri_diff1 -ge $pri_size ]; then
exitstatus=$error
pub_messageAppend "Memory baseline is 256G,now is $pri_DmitotalG G [IN DMIDECODE]."
pub_IdcFreeQuery 2 && pub_IdcFreeCreate "unknown" "Memory baseline is 256G,now is $pri_DmitotalG G [IN DMIDECODE]." "2"
pub_messageSend "mem" "[ ERROR ]" "Memory baseline is 256G,now is $pri_DmitotalG G [IN DMIDECODE]."
return
fi
fi
fi
done
}
#by luxue
#Check for network interface card (NIC) faults
# Record one NIC fault: mark the run as failed, log the message and call
# pub_IdcFreeCreate for slot "Nic<index>" (the original code annotates this
# call as a repair request).
#   $1 - interface index (the N of ethN)
#   $2 - text handed to pub_messageAppend
#   $3 - title handed to pub_messageSend
# NOTE(review): $msg is not set in this block; presumably pub_messageAppend
# maintains it - confirm against its definition.
function _pub_check_nic_report() {
    exitstatus=$error
    pub_messageAppend "$2"
    pub_messageSend "$3" "[ ERROR ]" "$msg"
    pub_slot="Nic$1"
    pub_IdcFreeCreate "Nic$1" "Please check the cable firstly, then switch port, and check nic last." "n"
    pub_slot=""
}

# Check eth0 .. eth<count-1> for three kinds of fault:
#   1. link speed (from ethtool) that is neither 1000Mb/s nor 10000Mb/s;
#   2. more frame errors than RX_bytes / divisor, with both counters taken
#      from old-style ifconfig output;
#   3. a bonding slave whose "MII Status" is not "up".
# Globals read:    CMD_SUDO, CHECK_NIC_BONDING0, error, msg
# Globals written: exitstatus, pub_slot, nic_bonding_status
# A check whose input cannot be obtained (no Speed line, non-numeric
# counters, interface not enslaved) is skipped instead of tripping a
# "[: unary operator expected" error.
function pub_check_nic(){
    local EthernetSpeed
    local RX_bytes
    local Frame
    local BER
    local divisor
    local bondingfile
    local bond
    local eth_num
    local i   # keep the loop counter from clobbering a caller's $i

    # bonding_masters only exists while the bonding driver is loaded, so a
    # read failure simply means "no bonds" and is deliberately silenced.
    bondingfile=$(cat /sys/class/net/bonding_masters 2>/dev/null)
    # Counts every interface whose name contains "eth"; the loop below then
    # assumes they are named eth0 .. eth<count-1>.
    eth_num=$(ls /sys/class/net | grep -c "eth")

    for ((i=0; i<eth_num; i++)); do
        # Reset per NIC so an unknown speed does not inherit the divisor of
        # the previous interface.
        divisor=1
        # ${CMD_SUDO} is intentionally unquoted: it may be empty or hold
        # several words.
        EthernetSpeed=$(${CMD_SUDO} ethtool "eth$i" | grep Speed | awk -F":" '{print $NF}' | sed -e 's/^\s\+//;s/\s\+$//')
        case "$EthernetSpeed" in
            "1000Mb/s")
                divisor=1250000000
                ;;
            "10000Mb/s")
                divisor=125000000000
                ;;
            "")
                # No Speed line (e.g. ethN does not exist): nothing to judge.
                ;;
            *)
                # Includes multi-word values such as "Unknown! (65535)".
                _pub_check_nic_report "$i" "Ethernet:Error Speed" "Error Ethernet Speed"
                ;;
        esac

        Frame=$(ifconfig "eth$i" | grep "frame" | awk -F":" '{print $NF}' | sed -e 's/^\s\+//;s/\s\+$//')
        RX_bytes=$(ifconfig "eth$i" | grep "RX bytes" | awk -F" " '{print $2}' | awk -F":" '{print $NF}' | sed -e 's/^\s\+//;s/\s\+$//')
        # Only compare when both counters were parsed as plain integers.
        if [[ "$Frame" =~ ^[0-9]+$ && "$RX_bytes" =~ ^[0-9]+$ ]]; then
            BER=$(printf '%s %s\n' "$RX_bytes" "$divisor" | awk '{printf "%d\n",$1/$2}')
            # "[" rather than "[[" so the operands are always read as decimal.
            if [ "$BER" -lt "$Frame" ]; then
                _pub_check_nic_report "$i" "Ethernet:Error packets" "Error Ethernet packets"
            fi
        fi

        # bonding_masters may list several bonds separated by whitespace;
        # the expansion is left unquoted on purpose to split them.
        for bond in $bondingfile; do
            [[ -f "$CHECK_NIC_BONDING0/$bond" ]] || continue
            # -w keeps "eth1" from also matching "eth10".
            nic_bonding_status=$(grep -w -A 2 "Slave Interface: eth$i" "$CHECK_NIC_BONDING0/$bond" | grep "MII Status" | awk -F":" '{print $NF}' | tr 'a-z' 'A-Z' | sed -e 's/^\s\+//;s/\s\+$//')
            # Empty means ethN is not a slave of this bond.
            [[ -n "$nic_bonding_status" ]] || continue
            if [[ "$nic_bonding_status" != "UP" ]]; then
                _pub_check_nic_report "$i" "Ethernet: bonding is not up." "Bonding is not up."
            fi
        done
    done
}
# Check Linux software-RAID (md) arrays for failed member disks.
#
# For every md device listed in /proc/partitions (md partitions such as
# md0p1 are ignored) whose "mdadm --detail" State mentions "fail" or
# "degraded", the members flagged "(F)" on that array's /proc/mdstat line
# are collected, reduced to their parent disk name and reported once.
#
# Globals read:    CMD_SUDO, CMD_MDADM, error
# Globals written: pub_mdstat_fail (set to 1), exitstatus (set to $error)
# Calls:           pub_messageAppend, pub_IdcFreeCreate, pub_messageSend
function pub_check_mdstat() {
    local pri_md_device
    local pri_i
    local pri_disk_log
    local pri_fail_log
    local pri_fail_disk
    local pri_member
    local pri_disk
    local pri_model
    local pri_fail_model

    # Nothing to do on hosts without an active software RAID array.
    grep -q "active raid" /proc/mdstat &>/dev/null || return 0

    for pri_md_device in $(grep md /proc/partitions | awk '{print $NF}' | grep -v "p[0-9]"); do
        # ${CMD_SUDO} / ${CMD_MDADM} are intentionally unquoted: they may be
        # empty or hold several words.
        pri_i=$(${CMD_SUDO} ${CMD_MDADM} --detail "/dev/$pri_md_device" 2>/dev/null | grep -w "\WState" | awk -F: '{print $2}')
        printf '%s\n' "$pri_i" | grep -Eiwq "fail|degraded" || continue

        # Anchor on "<name> " so md1 does not also pick up md10 / md127.
        pri_disk_log=$(grep "^${pri_md_device} " /proc/mdstat | grep "(F)")
        [[ -n "$pri_disk_log" ]] || continue

        # Keep one copy of the mdstat line per array, however many members
        # of that array have failed.
        pri_fail_log="$pri_fail_log $pri_disk_log"

        for pri_member in $(printf '%s\n' "$pri_disk_log" | tr " " "\n" | grep "(F)" | awk -F'[' '{print $1}'); do
            # Reduce a member (sda1, nvme0n1p2, ...) to its parent disk.
            if [[ -e "/sys/block/$pri_member" ]]; then
                # The member is itself a whole disk.
                pri_disk=$pri_member
            elif [[ $pri_member =~ ^(.*[0-9])p[0-9]+$ ]]; then
                # "<disk ending in a digit>p<N>", e.g. nvme0n1p2 -> nvme0n1.
                pri_disk=${BASH_REMATCH[1]}
            elif [[ $pri_member =~ ^(.*[^0-9])[0-9]+$ ]]; then
                # "<disk><N>", e.g. sda1 -> sda; sdp1 stays sdp (not "sd").
                pri_disk=${BASH_REMATCH[1]}
            else
                pri_disk=$pri_member
            fi

            pri_model=$(cat "/sys/block/$pri_disk/device/model" 2>/dev/null | sed 's/[ ]*$//g')

            if [[ -z "$pri_fail_disk" ]]; then
                pri_fail_disk="/dev/$pri_disk"
                pri_fail_model="$pri_model"
            elif [[ ",$pri_fail_disk," == *",/dev/$pri_disk,"* ]]; then
                # Disk already recorded through another partition or array.
                :
            else
                pri_fail_disk="$pri_fail_disk,/dev/$pri_disk"
                pri_fail_model="$pri_fail_model,$pri_model"
            fi
        done
    done

    if [[ -n "$pri_fail_disk" ]]; then
        pub_mdstat_fail=1
        exitstatus=$error
        pub_messageAppend "Failed disk: $pri_fail_disk, model: $pri_fail_model."
        pub_IdcFreeCreate "$pri_fail_disk" "mdstat failed: $pri_fail_log, disk model: $pri_fail_model." "a"
        pub_messageSend "raidpd" "[ ERROR ]" "Failed disk: $pri_fail_disk, model: $pri_fail_model."
    fi
}
function pub_check_std_disks() {
# Only check in RamOS env
if [ x"$IS_RAMOS" = x"NO" ]; then
return
fi
if echo $pub_alitype | grep -qE "S9|S10|A7|A8"; then
alitype="$pub_alitype"
else
#armory_api="http://a.alibaba-inc.com/page/api/free/opsfreeInterface/search.htm?from=device&q=sn==${pub_sn}&select=sm_name&num=100&_username=droid/rms"
strDNS=$(getArmoryUrl)
armory_api="http://${strDNS}/page/api/free/opsfreeInterface/search.htm?from=device&q=sn==${pub_sn}&select=sm_name&num=100&_username=srmp"
alitype=$(${CMD_CURL} $armory_api | awk -F'sm_name":"' '{print $2}' | awk -F\" '{print $1}')
fi
case $alitype in
*S9*|*S10*)
disk_nm=$(ls /dev/sd? | wc -l)
if [ $disk_nm -eq 0 ]; then
pub_messageAppend "no disk detected, please check the enclosure!"
pub_messageSend "raidpd" "[ ERROR ]" "$msg"
#pub_IdcFreeCreate "no_disk" "no disk detected, please check the enclosure!" "1" #报修
echo -e "===================IDC===================\nSlot: All slots\nDisk SN: \nVendor: \n\n==================Device==================\nDevice:\nDisk Lable:\n\n================Error Info================\nS9 or S10 server. No disk detected, please check the enclosure." > $errorfileinfo
pub_IdcFreeCreate "no_disk" "`cat $errorfileinfo`" "1" #报修
exitstatus=$error
rm -f $errorfileinfo
fi
;;
*A7*|*A8*)
if [ ! -e /dev/sda ]; then
exitstatus=$error
pub_messageAppend "sda is missing"
pub_messageSend "raidpd" "[ ERROR ]" "sda is missing"
pub_slot="Slot0"
echo -e "===================IDC===================\nSlot: Slot0\nDisk SN:\nVendor:\n\n==================Device==================\nDevice:\nDisk Lable:\n\n================Error Info================\nA7 or A8 server has only one disk. Sda is missing." > $errorfileinfo
pub_IdcFreeCreate "Slot0" "`cat $errorfileinfo`" "1"
pub_slot=""
rm -f $errorfileinfo
fi
;;
*)
;;
esac