From dd50e300c3f27e31cc26a9f503c95b11aaf9be25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= Date: Wed, 1 Apr 2020 03:33:21 +0200 Subject: [PATCH] Fix multiple qmemman issues First the main bug: when meminfo xenstore watch fires, in some cases (just after starting some domain) XS_Watcher refreshes internal list of domains before processing the event. This is done specifically to include new domain in there. But the opposite could happen too - the domain could be destroyed. In this case refres_meminfo() function raises an exception, which isn't handled and interrupts the whole xenstore watch loop. This issue is likely to be triggered by killing the domain, as this way it could disappear shortly after writing updated meminfo entry. In case of proper shutdown, meminfo-writer is stopped earlier and do not write updates just before domain destroy. Fix this by checking if the requested domain is still there just after refreshing the list. Then, catch exceptions in xenstore watch handling functions, to not interrupt xenstore watch loop. If it gets interrupted, qmemman basically stops memory balancing. And finally, clear force_refresh_domain_list flag after refreshing the domain list. That missing line caused domain refresh at every meminfo change, making it use some more CPU time. While at it, change "EOF" log message to something a bit more meaningful. Thanks @conorsch for capturing valuable logs. Fixes QubesOS/qubes-issues#4890 --- qubes/tools/qmemmand.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/qubes/tools/qmemmand.py b/qubes/tools/qmemmand.py index df90ca69..1cb23eb4 100644 --- a/qubes/tools/qmemmand.py +++ b/qubes/tools/qmemmand.py @@ -123,13 +123,18 @@ class XS_Watcher(object): self.handle.unwatch(get_domain_meminfo_key(i), self.watch_token_dict[i]) self.watch_token_dict.pop(i) system_state.del_domain(i) + except: + self.log.exception('Updating domain list failed') finally: if got_lock: global_lock.release() self.log.debug('global_lock released') if not refresh_only: - system_state.do_balance() + try: + system_state.do_balance() + except: + self.log.exception('do_balance() failed') def meminfo_changed(self, domain_id): @@ -143,10 +148,17 @@ class XS_Watcher(object): global_lock.acquire() self.log.debug('global_lock acquired') try: + global force_refresh_domain_list if force_refresh_domain_list: self.domain_list_changed(refresh_only=True) + force_refresh_domain_list = False + if domain_id not in self.watch_token_dict: + # domain just destroyed + return system_state.refresh_meminfo(domain_id, untrusted_meminfo_key) + except: + self.log.exception('Updating meminfo for %d failed', domain_id) finally: global_lock.release() self.log.debug('global_lock released') @@ -180,7 +192,7 @@ class QMemmanReqHandler(socketserver.BaseRequestHandler): self.data = self.request.recv(1024).strip() self.log.debug('data={!r}'.format(self.data)) if len(self.data) == 0: - self.log.info('EOF') + self.log.info('client disconnected, resuming membalance') if got_lock: global force_refresh_domain_list force_refresh_domain_list = True