Skip to content

Commit 4c4c67c

Browse files
author
Misbah Anjum N
committed
multivm-stress: Update script to test all edge cases
This patch captures multiple edge cases to test multi-VM scenarios. The following updates are added: (1) add a stress_time parameter to run the stress test for n seconds before starting stress_events; (2) add a debug_dir parameter to save the debug files; (3) add a dump_options parameter to specify the virsh dump type; (4) update the guest on_crash value to preserve in case of a crash; (5) add the function check_call_traces to check for any call trace in dmesg; (6) during stress, check the guest state and call traces every ten minutes; (7) if any VMs have crashed, dump them to debug_dir for further analysis; (8) run stress_events in the remaining stable VMs if present, else skip; (9) check for error messages and fail the test if any are found. Signed-off-by: Misbah Anjum N <[email protected]>
1 parent e072ece commit 4c4c67c

File tree

1 file changed

+193
-18
lines changed

1 file changed

+193
-18
lines changed
Lines changed: 193 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,11 @@
11
import logging as log
2+
import time
23

34
from virttest import utils_stress
45
from virttest import error_context
56
from virttest import utils_test
7+
from virttest import virsh
8+
from virttest.libvirt_xml import vm_xml
69

710

811
# Using as lower capital is not the best way to do, but this is just a
@@ -20,38 +23,210 @@ def run(test, params, env):
2023

2124
guest_stress = params.get("guest_stress", "no") == "yes"
2225
host_stress = params.get("host_stress", "no") == "yes"
23-
stress_events = params.get("stress_events", "reboot")
26+
stress_events = params.get("stress_events", "")
27+
stress_time = params.get("stress_time", "30")
28+
debug_dir = params.get("debug_dir", "/home/")
29+
dump_options = params.get("dump_options", "--memory-only --bypass-cache")
2430
vms = env.get_all_vms()
2531
vms_uptime_init = {}
32+
2633
if "reboot" not in stress_events:
2734
for vm in vms:
2835
vms_uptime_init[vm.name] = vm.uptime()
29-
stress_event = utils_stress.VMStressEvents(params, env)
36+
3037
if guest_stress:
38+
# set on_crash to "preserve" up front so that a guest that crashes is kept for analysis
39+
for vm in vms:
40+
logging.debug("Setting on_crash to preserve in %s" % vm.name)
41+
vmxml = vm_xml.VMXML.new_from_inactive_dumpxml(vm.name)
42+
if vm.is_alive():
43+
vm.destroy(gracefully=False)
44+
vmxml.on_crash = "preserve"
45+
vmxml.sync()
46+
vm.start()
47+
3148
try:
3249
utils_test.load_stress("stress_in_vms", params=params, vms=vms)
3350
except Exception as err:
34-
test.fail("Error running stress in vms: %s" % err)
51+
test.fail("Error running stress in vms: %s" % str(err))
52+
3553
if host_stress:
3654
if params.get("host_stress_args", ""):
3755
params["stress_args"] = params.get("host_stress_args")
3856
try:
3957
utils_test.load_stress("stress_on_host", params=params)
4058
except Exception as err:
41-
test.fail("Error running stress in host: %s" % err)
42-
try:
43-
stress_event.run_threads()
44-
finally:
45-
stress_event.wait_for_threads()
46-
if guest_stress:
47-
utils_test.unload_stress("stress_in_vms", params=params, vms=vms)
48-
if host_stress:
49-
utils_test.unload_stress("stress_on_host", params=params)
50-
if "reboot" not in stress_events:
51-
fail = False
59+
test.fail("Error running stress in host: %s" % str(err))
60+
61+
stress_timer = int(stress_time)
62+
fail = False
63+
found_traces = False
64+
failed_vms = []
65+
login_error_vms = []
66+
unexpected_reboot_vms = []
67+
error_message = ""
68+
69+
if guest_stress:
70+
# check for any call traces in guest dmesg while stress is running
71+
def check_call_traces(vm):
72+
nonlocal stress_timer
73+
found_trace = False
74+
try:
75+
retry_login = True
76+
retry_times = 0
77+
while retry_login:
78+
try:
79+
retry_login = False
80+
session = vm.wait_for_login(timeout=100)
81+
if vm in login_error_vms:
82+
login_error_vms.remove(vm)
83+
84+
except Exception:
85+
stress_timer -= 150
86+
if vm in login_error_vms:
87+
return False
88+
89+
retry_login = True
90+
retry_times += 1
91+
if retry_times == 3:
92+
logging.debug("Error in logging into %s" % vm.name)
93+
if vm not in login_error_vms:
94+
login_error_vms.append(vm)
95+
return False
96+
97+
time.sleep(30)
98+
stress_timer -= 30
99+
100+
dmesg = session.cmd("dmesg")
101+
dmesg_level = session.cmd("dmesg -l emerg,alert,crit")
102+
if "Call Trace" in dmesg or len(dmesg_level) >= 1:
103+
logging.debug("Call trace found in %s" % vm.name)
104+
if vm not in failed_vms:
105+
failed_vms.append(vm)
106+
found_trace = True
107+
session.close()
108+
109+
except Exception as err:
110+
test.error("Error getting dmesg of %s due to %s" % (vm.name, str(err)))
111+
return found_trace
112+
113+
# run stress for stress_time seconds
114+
logging.debug("Sleeping for %s seconds waiting for stress completion" % stress_time)
115+
stress_time = int(stress_time)
116+
117+
# check domstate of vms after stress_time
118+
if stress_time < 600:
119+
time.sleep(stress_time)
52120
for vm in vms:
53-
if vm.uptime() < vms_uptime_init[vm.name]:
54-
logging.error("Unexpected reboot of VM: %s between test", vm.name)
121+
if vm.state() != "running":
122+
logging.debug("%s state is %s" % (vm.name, vm.state()))
123+
failed_vms.append(vm)
55124
fail = True
56-
if fail:
57-
test.fail("Unexpected VM reboot detected")
125+
else:
126+
found_traces = check_call_traces(vm)
127+
if found_traces:
128+
fail = True
129+
time.sleep(2)
130+
131+
# check domstate of vms every 10 minutes (600 seconds) during stress_time
132+
else:
133+
all_failed = False
134+
number_of_checks = int(stress_time / 600)
135+
delta_time = int(stress_time % 600)
136+
for itr in range(number_of_checks):
137+
if len(failed_vms) == len(vms) or len(login_error_vms) == len(vms):
138+
all_failed = True
139+
break
140+
if stress_timer <= 0:
141+
break
142+
time.sleep(600)
143+
for vm in vms:
144+
if vm.state() != "running":
145+
logging.debug("%s state is %s" % (vm.name, vm.state()))
146+
if vm not in failed_vms:
147+
failed_vms.append(vm)
148+
fail = True
149+
else:
150+
found_traces = check_call_traces(vm)
151+
if found_traces:
152+
fail = True
153+
time.sleep(3)
154+
stress_timer -= 3
155+
156+
if delta_time > 0 and stress_timer > 0 and not all_failed:
157+
time.sleep(delta_time)
158+
for vm in vms:
159+
if vm.state() != "running":
160+
logging.debug("%s state is %s" % (vm.name, vm.state()))
161+
if vm not in failed_vms:
162+
failed_vms.append(vm)
163+
fail = True
164+
else:
165+
found_traces = check_call_traces(vm)
166+
if found_traces:
167+
fail = True
168+
time.sleep(3)
169+
stress_timer -= 3
170+
171+
# virsh dump the failed vms into debug_dir
172+
if fail:
173+
for vm in failed_vms:
174+
if vm.state() != "shut off":
175+
logging.debug("Dumping %s to debug_dir %s" % (vm.name, debug_dir))
176+
virsh.dump(vm.name, debug_dir+vm.name+"-core", dump_options, ignore_status=False, debug=True)
177+
logging.debug("Successfully dumped %s as %s-core" % (vm.name, vm.name))
178+
else:
179+
logging.debug("Cannot dump %s as it is in shut off state" % vm.name)
180+
failed_vms_string = ", ".join(vm.name for vm in failed_vms)
181+
error_message = "Failure in " + failed_vms_string + " while running stress. "
182+
183+
if login_error_vms:
184+
login_error_vms_string = ", ".join(vm.name for vm in login_error_vms)
185+
error_message += "Login error in " + login_error_vms_string + " while running stress. "
186+
187+
if len(failed_vms) == len(vms) or len(login_error_vms) == len(vms):
188+
error_message += "All vms in unstable state while running stress. Couldn't run STRESS EVENTS"
189+
test.fail(error_message)
190+
191+
# run STRESS EVENTS in the remaining stable guests
192+
if len(failed_vms) < len(vms) and len(login_error_vms) < len(vms):
193+
for vm in failed_vms:
194+
if vm in vms:
195+
vms.remove(vm)
196+
for vm in login_error_vms:
197+
if vm in vms:
198+
vms.remove(vm)
199+
200+
if len(vms) == 0:
201+
error_message += "All vms in unstable state while running stress. Couldn't run STRESS EVENTS"
202+
test.fail(error_message)
203+
204+
new_vms = ", ".join(vm.name for vm in vms)
205+
try:
206+
if stress_events != "":
207+
logging.debug("Running stress_events in %s" % new_vms)
208+
stress_event = utils_stress.VMStressEvents(params, env, vms)
209+
stress_event.run_threads()
210+
stress_event.wait_for_threads()
211+
212+
if guest_stress:
213+
utils_test.unload_stress("stress_in_vms", params=params, vms=vms)
214+
215+
if host_stress:
216+
utils_test.unload_stress("stress_on_host", params=params)
217+
218+
if "reboot" not in stress_events:
219+
for vm in vms:
220+
if vm.uptime() < vms_uptime_init[vm.name]:
221+
logging.debug("Unexpected reboot of VM: %s between test", vm.name)
222+
unexpected_reboot_vms.append(vm)
223+
unexpected_reboot_vms_string = ", ".join(vm.name for vm in unexpected_reboot_vms)
224+
if unexpected_reboot_vms:
225+
error_message += "Unexpected reboot of guest(s) " + unexpected_reboot_vms_string + ". "
226+
227+
except Exception as err:
228+
error_message += "Failure running STRESS EVENTS in " + new_vms + " due to" + str(err)
229+
230+
# check the test status
231+
if error_message:
232+
test.fail(error_message)

0 commit comments

Comments
 (0)