11import logging as log
2+ import time
23
34from virttest import utils_stress
45from virttest import error_context
56from virttest import utils_test
7+ from virttest import virsh
8+ from virttest .libvirt_xml import vm_xml
69
710
811# Using as lower capital is not the best way to do, but this is just a
@@ -20,38 +23,210 @@ def run(test, params, env):
2023
2124 guest_stress = params .get ("guest_stress" , "no" ) == "yes"
2225 host_stress = params .get ("host_stress" , "no" ) == "yes"
23- stress_events = params .get ("stress_events" , "reboot" )
26+ stress_events = params .get ("stress_events" , "" )
27+ stress_time = params .get ("stress_time" , "30" )
28+ debug_dir = params .get ("debug_dir" , "/home/" )
29+ dump_options = params .get ("dump_options" , "--memory-only --bypass-cache" )
2430 vms = env .get_all_vms ()
2531 vms_uptime_init = {}
32+
2633 if "reboot" not in stress_events :
2734 for vm in vms :
2835 vms_uptime_init [vm .name ] = vm .uptime ()
29- stress_event = utils_stress . VMStressEvents ( params , env )
36+
3037 if guest_stress :
38+ # change the on_crash value to "preserve" when guest crashes
39+ for vm in vms :
40+ logging .debug ("Setting on_crash to preserve in %s" % vm .name )
41+ vmxml = vm_xml .VMXML .new_from_inactive_dumpxml (vm .name )
42+ if vm .is_alive ():
43+ vm .destroy (gracefully = False )
44+ vmxml .on_crash = "preserve"
45+ vmxml .sync ()
46+ vm .start ()
47+
3148 try :
3249 utils_test .load_stress ("stress_in_vms" , params = params , vms = vms )
3350 except Exception as err :
34- test .fail ("Error running stress in vms: %s" % err )
51+ test .fail ("Error running stress in vms: %s" % str (err ))
52+
3553 if host_stress :
3654 if params .get ("host_stress_args" , "" ):
3755 params ["stress_args" ] = params .get ("host_stress_args" )
3856 try :
3957 utils_test .load_stress ("stress_on_host" , params = params )
4058 except Exception as err :
41- test .fail ("Error running stress in host: %s" % err )
42- try :
43- stress_event .run_threads ()
44- finally :
45- stress_event .wait_for_threads ()
46- if guest_stress :
47- utils_test .unload_stress ("stress_in_vms" , params = params , vms = vms )
48- if host_stress :
49- utils_test .unload_stress ("stress_on_host" , params = params )
50- if "reboot" not in stress_events :
51- fail = False
59+ test .fail ("Error running stress in host: %s" % str (err ))
60+
61+ stress_timer = int (stress_time )
62+ fail = False
63+ found_traces = False
64+ failed_vms = []
65+ login_error_vms = []
66+ unexpected_reboot_vms = []
67+ error_message = ""
68+
69+ if guest_stress :
70+ # check for any call traces in guest dmesg while stress is running
71+ def check_call_traces (vm ):
72+ nonlocal stress_timer
73+ found_trace = False
74+ try :
75+ retry_login = True
76+ retry_times = 0
77+ while retry_login :
78+ try :
79+ retry_login = False
80+ session = vm .wait_for_login (timeout = 100 )
81+ if vm in login_error_vms :
82+ login_error_vms .remove (vm )
83+
84+ except Exception :
85+ stress_timer -= 150
86+ if vm in login_error_vms :
87+ return False
88+
89+ retry_login = True
90+ retry_times += 1
91+ if retry_times == 3 :
92+ logging .debug ("Error in logging into %s" % vm .name )
93+ if vm not in login_error_vms :
94+ login_error_vms .append (vm )
95+ return False
96+
97+ time .sleep (30 )
98+ stress_timer -= 30
99+
100+ dmesg = session .cmd ("dmesg" )
101+ dmesg_level = session .cmd ("dmesg -l emerg,alert,crit" )
102+ if "Call Trace" in dmesg or len (dmesg_level ) >= 1 :
103+ logging .debug ("Call trace found in %s" % vm .name )
104+ if vm not in failed_vms :
105+ failed_vms .append (vm )
106+ found_trace = True
107+ session .close ()
108+
109+ except Exception as err :
110+ test .error ("Error getting dmesg of %s due to %s" % (vm .name , str (err )))
111+ return found_trace
112+
113+ # run stress for stress_time seconds
114+ logging .debug ("Sleeping for %s seconds waiting for stress completion" % stress_time )
115+ stress_time = int (stress_time )
116+
117+ # check domstate of vms after stress_time
118+ if stress_time < 600 :
119+ time .sleep (stress_time )
52120 for vm in vms :
53- if vm .uptime () < vms_uptime_init [vm .name ]:
54- logging .error ("Unexpected reboot of VM: %s between test" , vm .name )
121+ if vm .state () != "running" :
122+ logging .debug ("%s state is %s" % (vm .name , vm .state ()))
123+ failed_vms .append (vm )
55124 fail = True
56- if fail :
57- test .fail ("Unexpected VM reboot detected" )
125+ else :
126+ found_traces = check_call_traces (vm )
127+ if found_traces :
128+ fail = True
129+ time .sleep (2 )
130+
131+ # check domstate of vms every 10 minutes (600 seconds) during stress_time
132+ else :
133+ all_failed = False
134+ number_of_checks = int (stress_time / 600 )
135+ delta_time = int (stress_time % 600 )
136+ for itr in range (number_of_checks ):
137+ if len (failed_vms ) == len (vms ) or len (login_error_vms ) == len (vms ):
138+ all_failed = True
139+ break
140+ if stress_timer <= 0 :
141+ break
142+ time .sleep (600 )
143+ for vm in vms :
144+ if vm .state () != "running" :
145+ logging .debug ("%s state is %s" % (vm .name , vm .state ()))
146+ if vm not in failed_vms :
147+ failed_vms .append (vm )
148+ fail = True
149+ else :
150+ found_traces = check_call_traces (vm )
151+ if found_traces :
152+ fail = True
153+ time .sleep (3 )
154+ stress_timer -= 3
155+
156+ if delta_time > 0 and stress_timer > 0 and not all_failed :
157+ time .sleep (delta_time )
158+ for vm in vms :
159+ if vm .state () != "running" :
160+ logging .debug ("%s state is %s" % (vm .name , vm .state ()))
161+ if vm not in failed_vms :
162+ failed_vms .append (vm )
163+ fail = True
164+ else :
165+ found_traces = check_call_traces (vm )
166+ if found_traces :
167+ fail = True
168+ time .sleep (3 )
169+ stress_timer -= 3
170+
171+ # virsh dump the failed vms into debug_dir
172+ if fail :
173+ for vm in failed_vms :
174+ if vm .state () != "shut off" :
175+ logging .debug ("Dumping %s to debug_dir %s" % (vm .name , debug_dir ))
176+ virsh .dump (vm .name , debug_dir + vm .name + "-core" , dump_options , ignore_status = False , debug = True )
177+ logging .debug ("Successfully dumped %s as %s-core" % (vm .name , vm .name ))
178+ else :
179+ logging .debug ("Cannot dump %s as it is in shut off state" % vm .name )
180+ failed_vms_string = ", " .join (vm .name for vm in failed_vms )
181+ error_message = "Failure in " + failed_vms_string + " while running stress. "
182+
183+ if login_error_vms :
184+ login_error_vms_string = ", " .join (vm .name for vm in login_error_vms )
185+ error_message += "Login error in " + login_error_vms_string + " while running stress. "
186+
187+ if len (failed_vms ) == len (vms ) or len (login_error_vms ) == len (vms ):
188+ error_message += "All vms in unstable state while running stress. Couldn't run STRESS EVENTS"
189+ test .fail (error_message )
190+
191+ # run STRESS EVENTS in the remaining stable guests
192+ if len (failed_vms ) < len (vms ) and len (login_error_vms ) < len (vms ):
193+ for vm in failed_vms :
194+ if vm in vms :
195+ vms .remove (vm )
196+ for vm in login_error_vms :
197+ if vm in vms :
198+ vms .remove (vm )
199+
200+ if len (vms ) == 0 :
201+ error_message += "All vms in unstable state while running stress. Couldn't run STRESS EVENTS"
202+ test .fail (error_message )
203+
204+ new_vms = ", " .join (vm .name for vm in vms )
205+ try :
206+ if stress_events != "" :
207+ logging .debug ("Running stress_events in %s" % new_vms )
208+ stress_event = utils_stress .VMStressEvents (params , env , vms )
209+ stress_event .run_threads ()
210+ stress_event .wait_for_threads ()
211+
212+ if guest_stress :
213+ utils_test .unload_stress ("stress_in_vms" , params = params , vms = vms )
214+
215+ if host_stress :
216+ utils_test .unload_stress ("stress_on_host" , params = params )
217+
218+ if "reboot" not in stress_events :
219+ for vm in vms :
220+ if vm .uptime () < vms_uptime_init [vm .name ]:
221+ logging .debug ("Unexpected reboot of VM: %s between test" , vm .name )
222+ unexpected_reboot_vms .append (vm )
223+ unexpected_reboot_vms_string = ", " .join (vm .name for vm in unexpected_reboot_vms )
224+ if unexpected_reboot_vms :
225+ error_message += "Unexpected reboot of guest(s) " + unexpected_reboot_vms_string + ". "
226+
227+ except Exception as err :
228+ error_message += "Failure running STRESS EVENTS in " + new_vms + " due to" + str (err )
229+
230+ # check the test status
231+ if error_message :
232+ test .fail (error_message )
0 commit comments