fnmatch has exponential running time

classic Classic list List threaded Threaded
3 messages Options
Reply | Threaded
Open this post in threaded view
|

fnmatch has exponential running time

Bruno Haible
fnmatch() has a worst-case complexity O(m*n) where m is the size of the
pattern and n is the size of the sample string. Unfortunately glibc has
chosen an implementation with exponential running time.

$ mkdir foo
$ cd foo
$ touch aaaabbbbccccddddeeeeffffgggghhhhiiiijjjjkkkkllllmmmmnnnnooooppppqqqqrrrrssssttttuuuuvvvvwwwwxxxxyyyy
$ time find . -name 'a*b*d*e*f*g*h*i*j*z*'
real    0m0.028s
user    0m0.028s
sys     0m0.000s
$ time find . -name 'a*b*d*e*f*g*h*i*j*k*z*'
real    0m0.095s
user    0m0.092s
sys     0m0.000s
$ time find . -name 'a*b*d*e*f*g*h*i*j*k*l*z*'
real    0m0.377s
user    0m0.348s
sys     0m0.004s
$ time find . -name 'a*b*d*e*f*g*h*i*j*k*l*m*z*'
real    0m1.381s
user    0m1.336s
sys     0m0.008s
$ time find . -name 'a*b*d*e*f*g*h*i*j*k*l*m*n*z*'
real    0m5.108s
user    0m5.016s
sys     0m0.004s
$ time find . -name 'a*b*d*e*f*g*h*i*j*k*l*m*n*o*z*'
real    0m19.393s
user    0m18.913s
sys     0m0.008s
$ time find . -name 'a*b*d*e*f*g*h*i*j*k*l*m*n*o*p*z*'
real    1m13.241s
user    1m11.840s
sys     0m0.004s
$ time find . -name 'a*b*d*e*f*g*h*i*j*k*l*m*n*o*p*q*z*'
real    4m50.007s
user    4m44.002s
sys     0m0.056s

A gdb stack trace shows this:

#0  0xf7ed3150 in wmemchr () from /lib/libc.so.6
#1  0xf7ef67d1 in internal_fnwmatch () from /lib/libc.so.6
#2  0xf7ef68c9 in internal_fnwmatch () from /lib/libc.so.6
#3  0xf7ef68c9 in internal_fnwmatch () from /lib/libc.so.6
#4  0xf7ef68c9 in internal_fnwmatch () from /lib/libc.so.6
#5  0xf7ef68c9 in internal_fnwmatch () from /lib/libc.so.6
#6  0xf7ef68c9 in internal_fnwmatch () from /lib/libc.so.6
#7  0xf7ef68c9 in internal_fnwmatch () from /lib/libc.so.6
#8  0xf7ef68c9 in internal_fnwmatch () from /lib/libc.so.6
#9  0xf7ef68c9 in internal_fnwmatch () from /lib/libc.so.6
#10 0xf7ef68c9 in internal_fnwmatch () from /lib/libc.so.6
#11 0xf7ef68c9 in internal_fnwmatch () from /lib/libc.so.6
#12 0xf7ef68c9 in internal_fnwmatch () from /lib/libc.so.6
#13 0xf7ef68c9 in internal_fnwmatch () from /lib/libc.so.6
#14 0xf7ef68c9 in internal_fnwmatch () from /lib/libc.so.6
#15 0xf7ef68c9 in internal_fnwmatch () from /lib/libc.so.6
#16 0xf7ef68c9 in internal_fnwmatch () from /lib/libc.so.6
#17 0xf7ef68c9 in internal_fnwmatch () from /lib/libc.so.6
#18 0xf7ef68c9 in internal_fnwmatch () from /lib/libc.so.6
#19 0xf7ef68c9 in internal_fnwmatch () from /lib/libc.so.6
#20 0xf7ef68c9 in internal_fnwmatch () from /lib/libc.so.6
#21 0xf7ef68c9 in internal_fnwmatch () from /lib/libc.so.6
#22 0xf7ef68c9 in internal_fnwmatch () from /lib/libc.so.6
#23 0xf7ef68c9 in internal_fnwmatch () from /lib/libc.so.6
#24 0xf7ef7821 in fnmatch@GLIBC_2.0 () from /lib/libc.so.6
#25 0x00000004 in ?? ()

So, apparently fnmnatch() is implemented in a recursive way, and does
backtracking. Even though no backtracking is needed at all for patterns
like the above.

Bruno
Reply | Threaded
Open this post in threaded view
|

Re: fnmatch has exponential running time

James Youngman-5
On 3/22/07, Bruno Haible <[hidden email]> wrote:
> fnmatch() has a worst-case complexity O(m*n) where m is the size of the
> pattern and n is the size of the sample string. Unfortunately glibc has
> chosen an implementation with exponential running time.

Yes.  Oddly, per some testng I did about a year ago, "find -regex" is
much faster than "find -name".  The former uses the Gnulib regex
support and the latter uses fnmatch().

James.
Reply | Threaded
Open this post in threaded view
|

Re: fnmatch has exponential running time

Jakub Jelinek
In reply to this post by Bruno Haible
On Thu, Mar 22, 2007 at 09:21:37PM +0100, Bruno Haible wrote:

> fnmatch() has a worst-case complexity O(m*n) where m is the size of the
> pattern and n is the size of the sample string. Unfortunately glibc has
> chosen an implementation with exponential running time.
>
> $ mkdir foo
> $ cd foo
> $ touch aaaabbbbccccddddeeeeffffgggghhhhiiiijjjjkkkkllllmmmmnnnnooooppppqqqqrrrrssssttttuuuuvvvvwwwwxxxxyyyy
> $ time find . -name 'a*b*d*e*f*g*h*i*j*k*l*m*n*o*p*q*z*'
> real    4m50.007s
> user    4m44.002s
> sys     0m0.056s

The following patch ought to fix it.
When recursing for * handling it will stop when it hits next normal *,
tell the caller what the current pattern and string pointers were
and return to the caller which will restart from that place.
I have also removed no_leading_period2 variables, since there is no need
to preserve no_leading_period variable.  Either the code will always
return without ever looking at that var again (that was the only case
before this patch), or it will reinitialize it from what the recursive
call passed up.

2007-03-27  Jakub Jelinek  <[hidden email]>

        * posix/fnmatch.c (STRUCT): Define.
        (fnmatch): Pass NULL as last argument to internal_fn{,w}match.
        * posix/fnmatch_loop.c (struct STRUCT): New type.
        (FCT): Add ends argument.  If ends != NULL and normal * is
        seen in the pattern, store current pattern and string pointers
        and return.  Adjust recursive calls.
        (EXT): Adjust FCT callers.
        (STRUCT): Undef at the end of the file.
        * posix/Makefile (tests): Add tst-fnmatch2.
        * posix/tst-fnmatch2.c: New test.

--- libc/posix/fnmatch.c.jj 2005-03-30 06:17:47.000000000 +0200
+++ libc/posix/fnmatch.c 2007-03-27 10:31:25.000000000 +0200
@@ -1,4 +1,4 @@
-/* Copyright (C) 1991,1992,1993,1996,1997,1998,1999,2000,2001,2002,2003
+/* Copyright (C) 1991,1992,1993,1996,1997,1998,1999,2000,2001,2002,2003,2007
  Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -209,6 +209,7 @@ __wcschrnul (s, c)
 # define FCT internal_fnmatch
 # define EXT ext_match
 # define END end_pattern
+# define STRUCT fnmatch_struct
 # define L(CS) CS
 # ifdef _LIBC
 #  define BTOWC(C) __btowc (C)
@@ -235,7 +236,8 @@ __wcschrnul (s, c)
 #  define INT wint_t
 #  define FCT internal_fnwmatch
 #  define EXT ext_wmatch
-# define END end_wpattern
+#  define END end_wpattern
+#  define STRUCT fnwmatch_struct
 #  define L(CS) L##CS
 #  define BTOWC(C) (C)
 #  define STRLEN(S) __wcslen (S)
@@ -397,12 +399,12 @@ fnmatch (pattern, string, flags)
  }
 
       return internal_fnwmatch (wpattern, wstring, wstring + n,
- flags & FNM_PERIOD, flags);
+ flags & FNM_PERIOD, flags, NULL);
     }
 # endif  /* mbstate_t and mbsrtowcs or _LIBC.  */
 
   return internal_fnmatch (pattern, string, string + strlen (string),
-   flags & FNM_PERIOD, flags);
+   flags & FNM_PERIOD, flags, NULL);
 }
 
 # ifdef _LIBC
--- libc/posix/fnmatch_loop.c.jj 2005-10-15 00:44:42.000000000 +0200
+++ libc/posix/fnmatch_loop.c 2007-03-27 11:47:12.000000000 +0200
@@ -1,5 +1,5 @@
-/* Copyright (C) 1991,1992,1993,1996,1997,1998,1999,2000,2001,2003,2004,2005
-   Free Software Foundation, Inc.
+/* Copyright (C) 1991,1992,1993,1996,1997,1998,1999,2000,2001,2003,2004,2005,
+   2007 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -17,10 +17,18 @@
    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
    02111-1307 USA.  */
 
+struct STRUCT
+{
+  const CHAR *pattern;
+  const CHAR *string;
+  int no_leading_period;
+};
+
 /* Match STRING against the filename pattern PATTERN, returning zero if
    it matches, nonzero if not.  */
 static int FCT (const CHAR *pattern, const CHAR *string,
- const CHAR *string_end, int no_leading_period, int flags)
+ const CHAR *string_end, int no_leading_period, int flags,
+ struct STRUCT *ends)
      internal_function;
 static int EXT (INT opt, const CHAR *pattern, const CHAR *string,
  const CHAR *string_end, int no_leading_period, int flags)
@@ -29,12 +37,13 @@ static const CHAR *END (const CHAR *patt
 
 static int
 internal_function
-FCT (pattern, string, string_end, no_leading_period, flags)
+FCT (pattern, string, string_end, no_leading_period, flags, ends)
      const CHAR *pattern;
      const CHAR *string;
      const CHAR *string_end;
      int no_leading_period;
      int flags;
+     struct STRUCT *ends;
 {
   register const CHAR *p = pattern, *n = string;
   register UCHAR c;
@@ -97,6 +106,13 @@ FCT (pattern, string, string_end, no_lea
       if (res != -1)
  return res;
     }
+  else if (ends != NULL)
+    {
+      ends->pattern = p - 1;
+      ends->string = n;
+      ends->no_leading_period = no_leading_period;
+      return 0;
+    }
 
   if (n != string_end && *n == L('.') && no_leading_period)
     return FNM_NOMATCH;
@@ -157,7 +173,9 @@ FCT (pattern, string, string_end, no_lea
   else
     {
       const CHAR *endp;
+      struct STRUCT end;
 
+      end.pattern = NULL;
       endp = MEMCHR (n, (flags & FNM_FILE_NAME) ? L('/') : L('\0'),
      string_end - n);
       if (endp == NULL)
@@ -170,36 +188,46 @@ FCT (pattern, string, string_end, no_lea
  {
   int flags2 = ((flags & FNM_FILE_NAME)
  ? flags : (flags & ~FNM_PERIOD));
-  int no_leading_period2 = no_leading_period;
 
-  for (--p; n < endp; ++n, no_leading_period2 = 0)
-    if (FCT (p, n, string_end, no_leading_period2, flags2)
- == 0)
-      return 0;
+  for (--p; n < endp; ++n, no_leading_period = 0)
+    if (FCT (p, n, string_end, no_leading_period, flags2,
+     &end) == 0)
+      goto found;
  }
       else if (c == L('/') && (flags & FNM_FILE_NAME))
  {
   while (n < string_end && *n != L('/'))
     ++n;
   if (n < string_end && *n == L('/')
-      && (FCT (p, n + 1, string_end, flags & FNM_PERIOD, flags)
-  == 0))
+      && (FCT (p, n + 1, string_end, flags & FNM_PERIOD, flags,
+       NULL) == 0))
     return 0;
  }
       else
  {
   int flags2 = ((flags & FNM_FILE_NAME)
  ? flags : (flags & ~FNM_PERIOD));
-  int no_leading_period2 = no_leading_period;
 
   if (c == L('\\') && !(flags & FNM_NOESCAPE))
     c = *p;
   c = FOLD (c);
-  for (--p; n < endp; ++n, no_leading_period2 = 0)
+  for (--p; n < endp; ++n, no_leading_period = 0)
     if (FOLD ((UCHAR) *n) == c
- && (FCT (p, n, string_end, no_leading_period2, flags2)
-    == 0))
-      return 0;
+ && (FCT (p, n, string_end, no_leading_period, flags2,
+ &end) == 0))
+      {
+      found:
+ if (end.pattern == NULL)
+  return 0;
+ break;
+      }
+  if (end.pattern != NULL)
+    {
+      p = end.pattern;
+      n = end.string;
+      no_leading_period = end.no_leading_period;
+      continue;
+    }
  }
     }
 
@@ -1098,7 +1126,7 @@ EXT (INT opt, const CHAR *pattern, const
   switch (opt)
     {
     case L('*'):
-      if (FCT (p, string, string_end, no_leading_period, flags) == 0)
+      if (FCT (p, string, string_end, no_leading_period, flags, NULL) == 0)
  return 0;
       /* FALLTHROUGH */
 
@@ -1109,7 +1137,8 @@ EXT (INT opt, const CHAR *pattern, const
     /* First match the prefix with the current pattern with the
        current pattern.  */
     if (FCT (list->str, string, rs, no_leading_period,
-     flags & FNM_FILE_NAME ? flags : flags & ~FNM_PERIOD) == 0
+     flags & FNM_FILE_NAME ? flags : flags & ~FNM_PERIOD,
+     NULL) == 0
  /* This was successful.  Now match the rest with the rest
    of the pattern.  */
  && (FCT (p, rs, string_end,
@@ -1117,7 +1146,7 @@ EXT (INT opt, const CHAR *pattern, const
  ? no_leading_period
  : rs[-1] == '/' && NO_LEADING_PERIOD (flags) ? 1 : 0,
  flags & FNM_FILE_NAME
- ? flags : flags & ~FNM_PERIOD) == 0
+ ? flags : flags & ~FNM_PERIOD, NULL) == 0
     /* This didn't work.  Try the whole pattern.  */
     || (rs != string
  && FCT (pattern - 1, rs, string_end,
@@ -1126,7 +1155,7 @@ EXT (INT opt, const CHAR *pattern, const
  : (rs[-1] == '/' && NO_LEADING_PERIOD (flags)
    ? 1 : 0),
  flags & FNM_FILE_NAME
- ? flags : flags & ~FNM_PERIOD) == 0)))
+ ? flags : flags & ~FNM_PERIOD, NULL) == 0)))
       /* It worked.  Signal success.  */
       return 0;
  }
@@ -1136,7 +1165,7 @@ EXT (INT opt, const CHAR *pattern, const
       return FNM_NOMATCH;
 
     case L('?'):
-      if (FCT (p, string, string_end, no_leading_period, flags) == 0)
+      if (FCT (p, string, string_end, no_leading_period, flags, NULL) == 0)
  return 0;
       /* FALLTHROUGH */
 
@@ -1148,7 +1177,8 @@ EXT (INT opt, const CHAR *pattern, const
    pattern list.  */
  if (FCT (STRCAT (list->str, p), string, string_end,
  no_leading_period,
- flags & FNM_FILE_NAME ? flags : flags & ~FNM_PERIOD) == 0)
+ flags & FNM_FILE_NAME ? flags : flags & ~FNM_PERIOD,
+ NULL) == 0)
   /* It worked.  Signal success.  */
   return 0;
       while ((list = list->next) != NULL);
@@ -1163,7 +1193,8 @@ EXT (INT opt, const CHAR *pattern, const
 
   for (runp = list; runp != NULL; runp = runp->next)
     if (FCT (runp->str, string, rs,  no_leading_period,
-     flags & FNM_FILE_NAME ? flags : flags & ~FNM_PERIOD) == 0)
+     flags & FNM_FILE_NAME ? flags : flags & ~FNM_PERIOD,
+     NULL) == 0)
       break;
 
   /* If none of the patterns matched see whether the rest does.  */
@@ -1172,8 +1203,8 @@ EXT (INT opt, const CHAR *pattern, const
        rs == string
        ? no_leading_period
        : rs[-1] == '/' && NO_LEADING_PERIOD (flags) ? 1 : 0,
-       flags & FNM_FILE_NAME ? flags : flags & ~FNM_PERIOD)
-  == 0))
+       flags & FNM_FILE_NAME ? flags : flags & ~FNM_PERIOD,
+       NULL) == 0))
     /* This is successful.  */
     return 0;
  }
@@ -1198,6 +1229,7 @@ EXT (INT opt, const CHAR *pattern, const
 #undef FCT
 #undef EXT
 #undef END
+#undef STRUCT
 #undef MEMPCPY
 #undef MEMCHR
 #undef STRCOLL
--- libc/posix/Makefile.jj 2007-02-26 18:13:45.000000000 +0100
+++ libc/posix/Makefile 2007-03-27 11:07:05.000000000 +0200
@@ -90,7 +90,7 @@ tests := tstgetopt testfnm runtests run
    tst-execv1 tst-execv2 tst-execl1 tst-execl2 \
    tst-execve1 tst-execve2 tst-execle1 tst-execle2 \
    tst-execvp3 tst-execvp4 tst-rfc3484 tst-rfc3484-2 \
-   tst-getaddrinfo3
+   tst-getaddrinfo3 tst-fnmatch2
 xtests := bug-ga2
 ifeq (yes,$(build-shared))
 test-srcs := globtest
--- libc/posix/tst-fnmatch2.c.jj 2007-03-27 11:06:37.000000000 +0200
+++ libc/posix/tst-fnmatch2.c 2007-03-27 11:55:15.000000000 +0200
@@ -0,0 +1,35 @@
+#include <fnmatch.h>
+#include <stdio.h>
+
+int
+do_test (void)
+{
+  char pattern[] = "a*b*c*d*e*f*g*h*i*j*k*l*m*n*o*p*q*r*s*t*u*v*w*x*y*z*";
+  const char *string = "aaaabbbbccccddddeeeeffffgggghhhhiiiijjjjkkkkllllmmmm"
+       "nnnnooooppppqqqqrrrrssssttttuuuuvvvvwwwwxxxxyyyy";
+  if (fnmatch (pattern, string, 0) != FNM_NOMATCH)
+    {
+      puts ("First fnmatch didn't return FNM_NOMATCH");
+      return 1;
+    }
+  pattern[(sizeof pattern) - 3] = '*';
+  if (fnmatch (pattern, string, 0) != 0)
+    {
+      puts ("Second fnmatch didn't return 0");
+      return 1;
+    }
+  if (fnmatch ("a*b/*", "abbb/.x", FNM_PATHNAME | FNM_PERIOD) != FNM_NOMATCH)
+    {
+      puts ("Third fnmatch didn't return FNM_NOMATCH");
+      return 1;
+    }
+  if (fnmatch ("a*b/*", "abbb/xy", FNM_PATHNAME | FNM_PERIOD) != 0)
+    {
+      puts ("Fourth fnmatch didn't return 0");
+      return 1;
+    }
+  return 0;
+}
+
+#define TEST_FUNCTION do_test ()
+#include "../test-skeleton.c"

        Jakub