Don't stop on signals from zombie threads

classic Classic list List threaded Threaded
1 message Options
Reply | Threaded
Open this post in threaded view
|

Don't stop on signals from zombie threads

Kevin Buettner
I've committed the changes below.

We were seeing instances where zombie threads were reporting SIGTRAP
to the debugger.  The solution used in the patch below was to disable
these threads (from being continued) and then continue the process
with the zombies disabled.  

I also noticed that deleting LWPs from the hash table was done in such
a way that the order of the entries in the table can be perturbed.  This
is not a problem if done in isolation, but can be a problem when iterating
through the table as we do.  So I also rewrote the LWP hash table
deletion code so that the order is preserved.  Hash table slots of
deleted entries will eventually be reclaimed when the table is resized
or when new entries are added that fall upon a previously deleted
entry.

        * lwp-pool.c (struct lwp): Add new field `disabled'.
        (empty_lwp_slot): New static global.
        (hash_empty_slot, resize_hash, lwp_pool_stop_all)
        (lwp_pool_continue_all, clear_all_do_step_flags): Skip hash
        table slots marked as deleted.
        (hash_find): Rename to hash_find_1.  Add parameter `create_p'.
        Initialize `disabled' field.
        (hash_find, hash_find_no_create, lwp_pool_disable_lwp)
        (lwp_pool_enable_lwp): New functions.
        (hash_delete): Revise method used for deleting a slot from
        the hash table.
        (lwp_pool_continue_all): Don't continue disabled LWPs.
        (lwp_pool_continue_lwp): Print a warning instead of an error, and
        then only when LWP Pool diagnostics are enabled when attempting
        to continue a LWP with the state of
        lwp_state_stopped_stop_pending_interesting.
        * lwp-pool.h (lwp_pool_disable_lwp, lwp_pool_enable_lwp): Declare.
        * thread-db.c (find_new_threads_callback): Adjust thread
        deletion / reuse message.
        (update_thread_list): Disable continuation of zombie threads.
        (thread_db_break_program): Enable diagnostic message for "monitor
        thread-db-noisy".  Use kill_lwp() instead of kill().
        (thread_db_check_child_state): Don't stop on signals from zombie
        threads.

Index: unix/lwp-pool.c
===================================================================
RCS file: /cvs/src/src/rda/unix/lwp-pool.c,v
retrieving revision 1.7
diff -u -p -r1.7 lwp-pool.c
--- unix/lwp-pool.c 14 Oct 2010 18:08:13 -0000 1.7
+++ unix/lwp-pool.c 14 Jun 2012 20:16:07 -0000
@@ -326,6 +326,10 @@ struct lwp
      signals prior to a step actually occuring.  Receipt of a SIGTRAP is
      sufficient to clear this flag.  */
   int do_step;
+
+  /* Indicates whether the lwp should be considered when continuing or
+     stepping.  */
+  int disabled;
 };
 
   
@@ -359,6 +363,11 @@ struct lwp
 static size_t hash_size, hash_population;
 static struct lwp **hash;
 
+/* We put the address of empty_lwp_slot into the hash table to
+   represent deleted entries.  We do this so that we will not
+   perturb the hash table while iterating through it.  */
+static struct lwp empty_lwp_slot;
+
 /* The minimum size for the hash table.  Small for testing.  */
 enum { minimum_hash_size = 8 };
 
@@ -389,7 +398,7 @@ hash_empty_slot (pid_t pid)
 
   /* Since hash_next_slot will eventually visit every slot, and we
      know the table isn't full, this loop will terminate.  */
-  while (hash[slot])
+  while (hash[slot] && hash[slot] != &empty_lwp_slot)
     slot = hash_next_slot (slot, hash_size);
 
   return slot;
@@ -438,7 +447,7 @@ resize_hash (void)
 
   /* Re-insert all the old lwp's in the new table.  */
   for (i = 0; i < hash_size; i++)
-    if (hash[i])
+    if (hash[i] && hash[i] != &empty_lwp_slot)
       {
  struct lwp *l = hash[i];
  int new_slot = hash_slot (l->pid, new_hash_size);
@@ -464,7 +473,7 @@ resize_hash (void)
 /* Find an existing hash table entry for LWP.  If there is none,
    create one in state lwp_state_uninitialized.  */
 static struct lwp *
-hash_find (pid_t lwp)
+hash_find_1 (pid_t lwp, int create_p)
 {
   int slot;
   struct lwp *l;
@@ -483,6 +492,9 @@ hash_find (pid_t lwp)
     if (hash[slot]->pid == lwp)
       return hash[slot];
 
+  if (!create_p)
+    return NULL;
+
   /* There is no entry for this lwp.  Create one.  */
   l = malloc (sizeof (*l));
   l->pid = lwp;
@@ -490,7 +502,9 @@ hash_find (pid_t lwp)
   l->next = l->prev = NULL;
   l->status = 42;
   l->do_step = 0;
+  l->disabled = 0;
 
+  slot = hash_empty_slot (lwp);
   hash[slot] = l;
   hash_population++;
 
@@ -501,6 +515,17 @@ hash_find (pid_t lwp)
   return l;
 }
 
+static struct lwp *
+hash_find (pid_t lwp)
+{
+  return hash_find_1 (lwp, 1);
+}
+
+static struct lwp *
+hash_find_no_create (pid_t lwp)
+{
+  return hash_find_1 (lwp, 0);
+}
 
 /* Remove the LWP L from the pool.  This does not free L itself.  */
 static void
@@ -521,6 +546,7 @@ hash_delete (struct lwp *l)
   /* There should be only one 'struct lwp' with a given PID.  */
   assert (hash[slot] == l);
 
+#if 0
   /* Deleting from this kind of hash table is interesting, because of
      the way we handle collisions.
 
@@ -573,6 +599,21 @@ hash_delete (struct lwp *l)
  hash[slot] = NULL;
  hash[hash_empty_slot (refugee->pid)] = refugee;
       }
+#else
+  /* The problem with the (disabled) code above is that
+     we may perturb the order of the entries when we rehash.  This
+     is a serious problem when we're attempting to iterate through
+     the hash table at the same time.
+
+     The approach take here is to simply place the address of
+     `empty_lwp_slot' into the deleted slot.  This slot will
+     be reclaimed either when the hash table is resized or when
+     it is encountered on insertion by traversing the collision
+     chain.  */
+
+  hash[slot] = &empty_lwp_slot;
+  hash_population--;
+#endif
 }
 
 
@@ -1131,7 +1172,7 @@ lwp_pool_stop_all (struct gdbserv *serv)
     {
       struct lwp *l = hash[i];
 
-      if (l)
+      if (l && l != &empty_lwp_slot)
  {
   enum lwp_state old_state = l->state;
 
@@ -1205,7 +1246,7 @@ lwp_pool_stop_all (struct gdbserv *serv)
   for (i = 0; i < hash_size; i++)
     {
       struct lwp *l = hash[i];
-      if (l)
+      if (l && l != &empty_lwp_slot)
  switch (l->state)
   {
   case lwp_state_uninitialized:
@@ -1262,7 +1303,7 @@ lwp_pool_continue_all (struct gdbserv *s
     {
       struct lwp *l = hash[i];
 
-      if (l)
+      if (l && l != &empty_lwp_slot && !l->disabled)
  {
   enum lwp_state old_state = l->state;
 
@@ -1351,11 +1392,16 @@ lwp_pool_continue_lwp (struct gdbserv *s
     case lwp_state_stopped_interesting:
     case lwp_state_dead_interesting:
     case lwp_state_running_stop_pending:
-    case lwp_state_stopped_stop_pending_interesting:
       fprintf (stderr, "ERROR: continuing LWP %d in unwaited state: %s\n",
                (int) l->pid, lwp_state_str (l->state));
       break;
 
+    case lwp_state_stopped_stop_pending_interesting:
+      if (debug_lwp_pool)
+ fprintf (stderr, "WARNING: continuing LWP %d in unwaited state: %s\n",
+ (int) l->pid, lwp_state_str (l->state));
+      break;
+
     case lwp_state_stopped:
       result = continue_or_step_lwp (serv, l, signal);
       if (result == 0)
@@ -1407,7 +1453,7 @@ clear_all_do_step_flags (void)
     {
       struct lwp *l = hash[i];
 
-      if (l)
+      if (l && l != &empty_lwp_slot)
  l->do_step = 0;
     }
 }
@@ -1551,3 +1597,41 @@ lwp_pool_attach (struct gdbserv *serv, p
     
   return 0;
 }
+
+void
+lwp_pool_disable_lwp (pid_t pid)
+{
+  struct lwp *l = hash_find_no_create (pid);
+
+  if (l)
+    {
+      l->disabled = 1;
+
+      if (debug_lwp_pool)
+ fprintf (stderr, "lwp_pool_disable_lwp: disabling %d\n", pid);
+    }
+  else
+    {
+      if (debug_lwp_pool)
+ fprintf (stderr, "lwp_pool_disable_lwp: pid %d not in pool\n", pid);
+    }
+}
+
+void
+lwp_pool_enable_lwp (pid_t pid)
+{
+  struct lwp *l = hash_find_no_create (pid);
+
+  if (l)
+    {
+      l->disabled = 0;
+
+      if (debug_lwp_pool)
+ fprintf (stderr, "lwp_pool_enable_lwp: enabling %d\n", pid);
+    }
+  else
+    {
+      if (debug_lwp_pool)
+ fprintf (stderr, "lwp_pool_enable_lwp: pid %d not in pool\n", pid);
+    }
+}
Index: unix/lwp-pool.h
===================================================================
RCS file: /cvs/src/src/rda/unix/lwp-pool.h,v
retrieving revision 1.3
diff -u -p -r1.3 lwp-pool.h
--- unix/lwp-pool.h 8 Nov 2005 21:58:36 -0000 1.3
+++ unix/lwp-pool.h 14 Jun 2012 20:16:07 -0000
@@ -113,5 +113,11 @@ int lwp_pool_continue_lwp (struct gdbser
    been completed.  */
 int lwp_pool_singlestep_lwp (struct gdbserv *serv, pid_t pid, int signal);
 
+/* Disable the LWP denoted by PID.  */
+void lwp_pool_disable_lwp (pid_t pid);
+
+/* Enable the LWP denoted by PID.  */
+void lwp_pool_enable_lwp (pid_t pid);
+
 
 #endif /* RDA_UNIX_LWP_POOL_H */
Index: unix/thread-db.c
===================================================================
RCS file: /cvs/src/src/rda/unix/thread-db.c,v
retrieving revision 1.21
diff -u -p -r1.21 thread-db.c
--- unix/thread-db.c 30 Nov 2011 23:14:30 -0000 1.21
+++ unix/thread-db.c 14 Jun 2012 20:16:07 -0000
@@ -1441,7 +1441,7 @@ find_new_threads_callback (const td_thrh
          one thread has died and another was created using the
  same thread identifier.  */
       if (thread_db_noisy)
- fprintf (stderr, "(thread deletion / reuse: %s)\n", thread_debug_name (thread));
+ fprintf (stderr, "(thread deletion / reuse: %s; state: %d new lwp: %d)\n", thread_debug_name (thread), thread->ti.ti_state, ti.ti_lid);
       thread->ti = ti;
     }
   else
@@ -1522,6 +1522,16 @@ update_thread_list (struct child_process
     }
  }
     }
+
+  /* Disable zombie threads.  */
+  for (thread = first_thread_in_list ();
+       thread;
+       thread = next_thread_in_list (thread))
+    {
+      /* Don't allow zombie threads to continue.  */
+      if (thread->ti.ti_state == TD_THR_ZOMBIE)
+ lwp_pool_disable_lwp (thread->ti.ti_lid);
+    }
 }
 
 /* Function: thread_db_thread_next
@@ -2100,23 +2110,23 @@ thread_db_break_program (struct gdbserv
 
   if (process->interrupt_with_SIGSTOP)
     {
-      if (process->debug_backend)
+      if (process->debug_backend || thread_db_noisy)
  fprintf (stderr, " -- send SIGSTOP to child %d\n", proc_handle.pid);
 
       /* Tell the GDB user that SIGSTOP has been sent to the inferior.  */
       print_sigstop_message (serv);
 
-      kill (proc_handle.pid, SIGSTOP);
+      kill_lwp (proc_handle.pid, SIGSTOP);
     }
   else
     {
-      if (process->debug_backend)
+      if (process->debug_backend || thread_db_noisy)
  fprintf (stderr, " -- send SIGINT to child %d\n", proc_handle.pid);
 
       /* Tell the GDB user that SIGINT has been sent to the inferior.  */
       print_sigint_message (serv);
 
-      kill (proc_handle.pid, SIGINT);
+      kill_lwp (proc_handle.pid, SIGINT);
     }
 }
 
@@ -2262,6 +2272,26 @@ thread_db_check_child_state (struct chil
       return 0;
     }
 
+  /* Continue on if the thread in question is now a zombie.  */
+  if (process->event_thread
+              && process->event_thread->ti.ti_state == TD_THR_ZOMBIE)
+    {
+      if (thread_db_noisy)
+ fprintf (stderr,
+ "\n<check_child_state: Ignoring %d - it's a zombie.>\n",
+ process->pid);
+      /* Updating the thread list will disable this zombie, plus any others.  */
+      update_thread_list (process);
+      /* Continue the main thread; not the zombie.  */
+      process->event_thread = NULL;
+      process->pid = proc_handle.pid;
+      /* No signal.  */
+      process->signal_to_send = 0;
+      /* Continue.  */
+      currentvec->continue_program (serv);
+      return 0;
+    }
+
   process->running = 0;
 
   /* Pass this event back to GDB. */