[PATCH] PR23891: Make sure stap and staprun respond to SIGTERM when stderr/stdout are blocked

classic Classic list List threaded Threaded
3 messages Options
Reply | Threaded
Open this post in threaded view
|

[PATCH] PR23891: Make sure stap and staprun respond to SIGTERM when stderr/stdout are blocked

Yichun Zhang
When stderr/stdout are blocked (the write buffers are full), write()
syscalls in stap's signal handler and staprun's stp_main_loop() might
prevent these processes from responding to signals like SIGTERM.

Also make staprun respond to SIGPIPE just like SIGTERM.

We introduce the kill_relayfs() function to kill the reader thread in
staprun without waiting for the readers (which might alreayd be blocked
on writing to stdout).

Our local stress tests have confirmed that this patch indeed fixes the
hanging issues in stap and staprun.
---
 main.cxx           | 24 ++++++++++++++++++++++--
 staprun/mainloop.c | 45 ++++++++++++++++++++++++++++++++-------------
 staprun/relay.c    | 29 +++++++++++++++++++++++++++++
 staprun/staprun.h  |  1 +
 4 files changed, 84 insertions(+), 15 deletions(-)

diff --git a/main.cxx b/main.cxx
index 423e2abb4..dd80a9fd8 100644
--- a/main.cxx
+++ b/main.cxx
@@ -54,6 +54,7 @@ extern "C" {
 #include <unistd.h>
 #include <wordexp.h>
 #include <ftw.h>
+#include <fcntl.h>
 }
 
 using namespace std;
@@ -277,8 +278,27 @@ void handle_interrupt (int)
   if (pending_interrupts > 2) // XXX: should be configurable? time-based?
     {
       char msg[] = "Too many interrupts received, exiting.\n";
-      int rc = write (2, msg, sizeof(msg)-1);
-      if (rc) {/* Do nothing; we don't care if our last gasp went out. */ ;}
+
+      /* NB: writing to stderr blockingly in a signal handler is dangerous
+       * since it may prevent the stap process from quitting gracefully
+       * on receiving SIGTERM/etc signals when the stderr write buffer
+       * is full. PR23891 */
+      int flags = fcntl(2, F_GETFL);
+      if (flags == -1)
+        _exit (1);
+
+      if (!(flags & O_NONBLOCK))
+        {
+          if (fcntl(2, F_SETFL, flags | O_NONBLOCK) == 0) {
+            int rc = write (2, msg, sizeof(msg)-1);
+            if (rc)
+              {
+                /* Do nothing; we don't care if our last gasp went out. */
+                ;
+              }
+          }
+        }  /* if O_NONBLOCK */
+
       _exit (1);
     }
 }
diff --git a/staprun/mainloop.c b/staprun/mainloop.c
index 0ae0aea47..b60408776 100644
--- a/staprun/mainloop.c
+++ b/staprun/mainloop.c
@@ -31,6 +31,24 @@ static int target_pid_failed_p = 0;
    main thread of interruptable events. */
 static pthread_t main_thread;
 
+static void set_nonblocking_std_fds(void)
+{
+  for (int fd = 1; fd < 3; fd++) {
+    /* NB: writing to stderr/stdout blockingly in signal handler is
+     * dangerous since it may prevent the stap process from quitting
+     * gracefully on receiving SIGTERM/etc signals when the stderr/stdout
+     * write buffer is full. PR23891 */
+    int flags = fcntl(2, F_GETFL);
+    if (flags == -1)
+      continue;
+
+    if (flags & O_NONBLOCK)
+      continue;
+
+    (void) fcntl(2, F_SETFL, flags | O_NONBLOCK);
+  }
+}
+
 static void *signal_thread(void *arg)
 {
   sigset_t *s = (sigset_t *) arg;
@@ -41,13 +59,20 @@ static void *signal_thread(void *arg)
       _perr("sigwait");
       continue;
     }
-    dbug(2, "sigproc %d (%s)\n", signum, strsignal(signum));
     if (signum == SIGQUIT) {
       load_only = 1; /* flag for stp_main_loop */
       pending_interrupts ++;
-    } else if (signum == SIGINT || signum == SIGHUP || signum == SIGTERM) {
+    } else if (signum == SIGINT || signum == SIGHUP || signum == SIGTERM
+               || signum == SIGPIPE)
+    {
       pending_interrupts ++;
     }
+    if (pending_interrupts > 2) {
+      set_nonblocking_std_fds();
+      pthread_kill (main_thread, SIGURG);
+    }
+    /* stderr is not blocking at this point */
+    dbug(2, "sigproc %d (%s)\n", signum, strsignal(signum));
   }
   /* Notify main thread (interrupts select). */
   pthread_kill (main_thread, SIGURG);
@@ -153,6 +178,7 @@ static void setup_main_signals(void)
   sigaddset(s, SIGTERM);
   sigaddset(s, SIGHUP);
   sigaddset(s, SIGQUIT);
+  sigaddset(s, SIGPIPE);
   pthread_sigmask(SIG_SETMASK, s, NULL);
   if (pthread_create(&tid, NULL, signal_thread, s) < 0) {
     _perr(_("failed to create thread"));
@@ -533,10 +559,12 @@ void cleanup_and_exit(int detach, int rc)
   /* OTOH, it may be still be running - but there's no need for
      us to wait for it, considering that the script must have exited
      for another reason.  So, we no longer   while(...wait()...);  here.
-     XXX: we could consider killing it. */
+   */
 
   if (use_old_transport)
     close_oldrelayfs(detach);
+  else if (pending_interrupts > 2)
+    kill_relayfs();
   else
     close_relayfs();
 
@@ -637,7 +665,7 @@ int stp_main_loop(void)
   struct timespec ts;
   struct timespec *timeout = NULL;
   fd_set fds;
-  sigset_t blockset, mainset;
+  sigset_t mainset;
 
 
   setvbuf(ofp, (char *)NULL, _IONBF, 0);
@@ -663,15 +691,6 @@ int stp_main_loop(void)
   res = select(control_channel + 1, NULL, NULL, &fds, &tv);
   select_supported = (res == 1 && FD_ISSET(control_channel, &fds));
   dbug(2, "select_supported: %d\n", select_supported);
-  if (select_supported) {
-    /* We block SIGURG to the main thread, except when we call
-       pselect(). This makes sure we won't miss any signals. All other
-       calls are non-blocking, so we defer till pselect() time, which
-       is when we are "sleeping". */
-    sigemptyset(&blockset);
-    sigaddset(&blockset, SIGURG);
-    pthread_sigmask(SIG_BLOCK, &blockset, &mainset);
-  }
 
   if (monitor)
       monitor_setup();
diff --git a/staprun/relay.c b/staprun/relay.c
index c87bef4ca..4340aef0c 100644
--- a/staprun/relay.c
+++ b/staprun/relay.c
@@ -468,3 +468,32 @@ void close_relayfs(void)
  }
  dbug(2, "done\n");
 }
+
+void kill_relayfs(void)
+{
+ int i;
+ stop_threads = 1;
+ dbug(2, "closing\n");
+ for (i = 0; i < ncpus; i++) {
+ if (reader[avail_cpus[i]])
+ pthread_kill(reader[avail_cpus[i]], SIGUSR2);
+ else
+ break;
+ }
+ for (i = 0; i < ncpus; i++) {
+ if (reader[avail_cpus[i]])
+ pthread_cancel(reader[avail_cpus[i]]); /* no wait */
+ else
+ break;
+ }
+ for (i = 0; i < ncpus; i++) {
+ if (relay_fd[avail_cpus[i]] >= 0)
+ close(relay_fd[avail_cpus[i]]);
+ else
+ break;
+ }
+ for (i = 0; i < ncpus; i++) {
+ pthread_mutex_destroy(&mutex[avail_cpus[i]]);
+ }
+ dbug(2, "done\n");
+}
diff --git a/staprun/staprun.h b/staprun/staprun.h
index 7cebd36f6..2ad78da7c 100644
--- a/staprun/staprun.h
+++ b/staprun/staprun.h
@@ -164,6 +164,7 @@ int init_ctl_channel(const char *name, int verb);
 void close_ctl_channel(void);
 int init_relayfs(void);
 void close_relayfs(void);
+void kill_relayfs(void);
 int init_oldrelayfs(void);
 void close_oldrelayfs(int);
 int write_realtime_data(void *data, ssize_t nb);
--
2.11.0.295.gd7dffce

Reply | Threaded
Open this post in threaded view
|

[PATCH v2] PR23891: Make sure stap and staprun respond to SIGTERM when stderr/stdout are blocked

Yichun Zhang
When stderr/stdout are blocked (the write buffers are full), write()
syscalls in stap's signal handler and staprun's stp_main_loop() might
prevent these processes from responding to signals like SIGTERM.

Also make staprun respond to SIGPIPE just like SIGTERM.

We introduce the kill_relayfs() function to kill the reader thread in
staprun without waiting for the readers (which might alreayd be blocked
on writing to stdout).

Our local stress tests have confirmed that this patch indeed fixes the
hanging issues in stap and staprun.
---
 main.cxx           | 27 +++++++++++++++--
 staprun/mainloop.c | 88 ++++++++++++++++++++++++++++++++++++++++--------------
 staprun/relay.c    | 29 ++++++++++++++++++
 staprun/staprun.h  |  1 +
 4 files changed, 121 insertions(+), 24 deletions(-)

diff --git a/main.cxx b/main.cxx
index 423e2abb4..3d186b9ca 100644
--- a/main.cxx
+++ b/main.cxx
@@ -54,6 +54,7 @@ extern "C" {
 #include <unistd.h>
 #include <wordexp.h>
 #include <ftw.h>
+#include <fcntl.h>
 }
 
 using namespace std;
@@ -277,8 +278,30 @@ void handle_interrupt (int)
   if (pending_interrupts > 2) // XXX: should be configurable? time-based?
     {
       char msg[] = "Too many interrupts received, exiting.\n";
-      int rc = write (2, msg, sizeof(msg)-1);
-      if (rc) {/* Do nothing; we don't care if our last gasp went out. */ ;}
+      int fd = 2;
+
+      /* NB: writing to stderr blockingly in a signal handler is dangerous
+       * since it may prevent the stap process from quitting gracefully
+       * on receiving SIGTERM/etc signals when the stderr write buffer
+       * is full. PR23891 */
+      int flags = fcntl(fd, F_GETFL);
+      if (flags == -1)
+        _exit (1);
+
+      if (!(flags & O_NONBLOCK))
+        {
+          if (fcntl(fd, F_SETFL, flags | O_NONBLOCK) == 0) {
+            int rc = write (fd, msg, sizeof(msg)-1);
+            if (rc)
+              {
+                /* Do nothing; we don't care if our last gasp went out. */
+                ;
+              }
+          }
+        }  /* if ! O_NONBLOCK */
+
+      /* to avoid leaving any side-effects on the stderr device */
+      (void) fcntl(fd, F_SETFL, flags & ~O_NONBLOCK);
       _exit (1);
     }
 }
diff --git a/staprun/mainloop.c b/staprun/mainloop.c
index 0ae0aea47..1b949ed80 100644
--- a/staprun/mainloop.c
+++ b/staprun/mainloop.c
@@ -31,6 +31,51 @@ static int target_pid_failed_p = 0;
    main thread of interruptable events. */
 static pthread_t main_thread;
 
+static void set_nonblocking_std_fds(void)
+{
+  for (int fd = 1; fd < 3; fd++) {
+    /* NB: writing to stderr/stdout blockingly in signal handler is
+     * dangerous since it may prevent the stap process from quitting
+     * gracefully on receiving SIGTERM/etc signals when the stderr/stdout
+     * write buffer is full. PR23891 */
+    int flags = fcntl(fd, F_GETFL);
+    if (flags == -1)
+      continue;
+
+    if (flags & O_NONBLOCK)
+      continue;
+
+    (void) fcntl(fd, F_SETFL, flags | O_NONBLOCK);
+  }
+}
+
+static void set_blocking_std_fds(void)
+{
+  for (int fd = 1; fd < 3; fd++) {
+    /* NB: writing to stderr/stdout blockingly in signal handler is
+     * dangerous since it may prevent the stap process from quitting
+     * gracefully on receiving SIGTERM/etc signals when the stderr/stdout
+     * write buffer is full. PR23891 */
+    int flags = fcntl(fd, F_GETFL);
+    if (flags == -1)
+      continue;
+
+    if (!(flags & O_NONBLOCK))
+      continue;
+
+    (void) fcntl(fd, F_SETFL, flags & ~O_NONBLOCK);
+  }
+}
+
+static void my_exit(int rc)
+{
+  /* to avoid leaving any side-effects on the stdout/stderr devices */
+  if (pending_interrupts > 2)
+    set_blocking_std_fds();
+
+  _exit(rc);
+}
+
 static void *signal_thread(void *arg)
 {
   sigset_t *s = (sigset_t *) arg;
@@ -41,13 +86,19 @@ static void *signal_thread(void *arg)
       _perr("sigwait");
       continue;
     }
-    dbug(2, "sigproc %d (%s)\n", signum, strsignal(signum));
     if (signum == SIGQUIT) {
       load_only = 1; /* flag for stp_main_loop */
       pending_interrupts ++;
-    } else if (signum == SIGINT || signum == SIGHUP || signum == SIGTERM) {
+    } else if (signum == SIGINT || signum == SIGHUP || signum == SIGTERM
+               || signum == SIGPIPE)
+    {
       pending_interrupts ++;
     }
+    if (pending_interrupts > 2) {
+      set_nonblocking_std_fds();
+      pthread_kill (main_thread, SIGURG);
+    }
+    dbug(2, "sigproc %d (%s)\n", signum, strsignal(signum));
   }
   /* Notify main thread (interrupts select). */
   pthread_kill (main_thread, SIGURG);
@@ -153,6 +204,7 @@ static void setup_main_signals(void)
   sigaddset(s, SIGTERM);
   sigaddset(s, SIGHUP);
   sigaddset(s, SIGQUIT);
+  sigaddset(s, SIGPIPE);
   pthread_sigmask(SIG_SETMASK, s, NULL);
   if (pthread_create(&tid, NULL, signal_thread, s) < 0) {
     _perr(_("failed to create thread"));
@@ -533,10 +585,12 @@ void cleanup_and_exit(int detach, int rc)
   /* OTOH, it may be still be running - but there's no need for
      us to wait for it, considering that the script must have exited
      for another reason.  So, we no longer   while(...wait()...);  here.
-     XXX: we could consider killing it. */
+   */
 
   if (use_old_transport)
     close_oldrelayfs(detach);
+  else if (pending_interrupts > 2)
+    kill_relayfs();
   else
     close_relayfs();
 
@@ -545,7 +599,7 @@ void cleanup_and_exit(int detach, int rc)
 
   if (detach) {
     eprintf(_("\nDisconnecting from systemtap module.\n" "To reconnect, type \"staprun -A %s\"\n"), modname);
-    _exit(0);
+    my_exit(0);
   }
   else if (rename_mod)
     dbug(2, "\nRenamed module to: %s\n", modname);
@@ -568,7 +622,7 @@ void cleanup_and_exit(int detach, int rc)
   pid = fork();
   if (pid < 0) {
           _perr("fork");
-          _exit(-1);
+          my_exit(-1);
   }
 
   if (pid == 0) { /* child process */
@@ -584,27 +638,27 @@ void cleanup_and_exit(int detach, int rc)
                   execlp("sh", "sh", "-c", cmd, NULL);
                   /* should not return */
                   perror(staprun);
-                  _exit(-1);
+                  my_exit(-1);
           } else {
                   perror("asprintf");
-                  _exit(-1);
+                  my_exit(-1);
           }
   }
 
   /* parent process */
   if (waitpid(pid, &rstatus, 0) < 0) {
           _perr("waitpid");
-          _exit(-1);
+          my_exit(-1);
   }
 
   if (WIFEXITED(rstatus)) {
           if(rc || target_pid_failed_p || rstatus) // if we have an error
-            _exit(1);
+            my_exit(1);
           else
-            _exit(0); //success
+            my_exit(0); //success
   }
 
-  _exit(-1);
+  my_exit(-1);
 }
 
 
@@ -637,7 +691,6 @@ int stp_main_loop(void)
   struct timespec ts;
   struct timespec *timeout = NULL;
   fd_set fds;
-  sigset_t blockset, mainset;
 
 
   setvbuf(ofp, (char *)NULL, _IONBF, 0);
@@ -663,15 +716,6 @@ int stp_main_loop(void)
   res = select(control_channel + 1, NULL, NULL, &fds, &tv);
   select_supported = (res == 1 && FD_ISSET(control_channel, &fds));
   dbug(2, "select_supported: %d\n", select_supported);
-  if (select_supported) {
-    /* We block SIGURG to the main thread, except when we call
-       pselect(). This makes sure we won't miss any signals. All other
-       calls are non-blocking, so we defer till pselect() time, which
-       is when we are "sleeping". */
-    sigemptyset(&blockset);
-    sigaddset(&blockset, SIGURG);
-    pthread_sigmask(SIG_BLOCK, &blockset, &mainset);
-  }
 
   if (monitor)
       monitor_setup();
@@ -749,7 +793,7 @@ int stp_main_loop(void)
         // Immediately update screen on input
         if (monitor)
           FD_SET(STDIN_FILENO, &fds);
- res = pselect(maxfd + 1, &fds, NULL, NULL, timeout, &mainset);
+ res = pselect(maxfd + 1, &fds, NULL, NULL, timeout, NULL);
  if (res < 0 && errno != EINTR)
   {
     _perr(_("Unexpected error in select"));
diff --git a/staprun/relay.c b/staprun/relay.c
index c87bef4ca..77f881d3d 100644
--- a/staprun/relay.c
+++ b/staprun/relay.c
@@ -468,3 +468,32 @@ void close_relayfs(void)
  }
  dbug(2, "done\n");
 }
+
+void kill_relayfs(void)
+{
+ int i;
+ stop_threads = 1;
+ dbug(2, "killing\n");
+ for (i = 0; i < ncpus; i++) {
+ if (reader[avail_cpus[i]])
+ pthread_kill(reader[avail_cpus[i]], SIGUSR2);
+ else
+ break;
+ }
+ for (i = 0; i < ncpus; i++) {
+ if (reader[avail_cpus[i]])
+ pthread_cancel(reader[avail_cpus[i]]); /* no wait */
+ else
+ break;
+ }
+ for (i = 0; i < ncpus; i++) {
+ if (relay_fd[avail_cpus[i]] >= 0)
+ close(relay_fd[avail_cpus[i]]);
+ else
+ break;
+ }
+ for (i = 0; i < ncpus; i++) {
+ pthread_mutex_destroy(&mutex[avail_cpus[i]]);
+ }
+ dbug(2, "done\n");
+}
diff --git a/staprun/staprun.h b/staprun/staprun.h
index 7cebd36f6..2ad78da7c 100644
--- a/staprun/staprun.h
+++ b/staprun/staprun.h
@@ -164,6 +164,7 @@ int init_ctl_channel(const char *name, int verb);
 void close_ctl_channel(void);
 int init_relayfs(void);
 void close_relayfs(void);
+void kill_relayfs(void);
 int init_oldrelayfs(void);
 void close_oldrelayfs(int);
 int write_realtime_data(void *data, ssize_t nb);
--
2.11.0.295.gd7dffce

Reply | Threaded
Open this post in threaded view
|

Re: [PATCH v2] PR23891: Make sure stap and staprun respond to SIGTERM when stderr/stdout are blocked

Yichun Zhang
Hello!

On Fri, Nov 23, 2018 at 3:02 PM Yichun Zhang (agentzh) wrote:
>
> When stderr/stdout are blocked (the write buffers are full), write()
> syscalls in stap's signal handler and staprun's stp_main_loop() might
> prevent these processes from responding to signals like SIGTERM.
>

Committed.

Best,
Yichun