Reworked scanf() testing. General cleanups.
[fw/pdclib] / functions / _PDCLIB / scan.c
index 523b0c81449c5846f50868f09118fcb445c69dba..b346ffc2bb81dc7943d1db696e168241e2a77370 100644 (file)
 #include <stdbool.h>
 #include <stdlib.h>
 #include <stdarg.h>
+#include <stdint.h>
 #include <ctype.h>
+#include <string.h>
+#include <stddef.h>
+#include <limits.h>
 
 /* Using an integer's bits as flags for both the conversion flags and length
    modifiers.
 #define E_unsigned   1<<16
 
 
-#define MATCH_FAIL -1
-#define MATCH_ERROR -2
+/* Helper macro for a switch on status->flags (see E_* flags above): one case
+   storing (type)( value * sign ) through the next 'type *' from status->arg.
+   Needs 'status', 'value', 'sign' in scope; the caller supplies the last ';'.
+   case_cond: combination of the E_* flags above, used as the case label
+   type:      integer type, used for the va_arg() pointer type and the cast
+*/
+#define ASSIGN_VALUE_TO( case_cond, type ) \
+    case case_cond: \
+        *( va_arg( status->arg, type * ) ) = (type)( value * sign ); \
+        break
+
 
-static int MATCH( int c, struct _PDCLIB_status_t * status )
+/* Helper function to get the next input character: from status->stream if
+   that is non-NULL, otherwise from the string status->s. End-of-string is
+   reported as EOF so both sources can be handled uniformly by the caller.
+   Each character returned (but not EOF) is counted in ->i and ->current.
+*/
+static int GET( struct _PDCLIB_status_t * status )
 {
+    int rc;
     if ( status->stream != NULL )
     {
-        if ( ! _PDCLIB_prepread( status->stream ) )
-        {
-            return MATCH_ERROR;
-        }
-        if ( tolower( status->stream->buffer[ status->stream->bufidx ] ) == c )
-        {
-            /* recycling parameter */
-            c = getc( status->stream );
-        }
-        else
-        {
-            return MATCH_FAIL;
-        }
+        rc = getc( status->stream );
     }
     else
     {
-        if ( tolower( *(status->s) ) == c )
-        {
-            /* recycling parameter */
-            c = *((status->s)++); /* TODO: \0 */
-        }
-        else
-        {
-            return MATCH_FAIL;
-        }
+        rc = ( *status->s == '\0' ) ? EOF : (unsigned char)*((status->s)++); /* the '\0' itself is never stepped over */
     }
-    ++(status->i);
-    ++(status->this);
-    return c;
+    if ( rc != EOF )
+    {
+        ++(status->i);
+        ++(status->current);
+    }
+    return rc;
 }
 
 
+/* Helper function to put a read character back into the string or stream,
+   whatever is used for input.
+*/
 static void UNGET( int c, struct _PDCLIB_status_t * status )
 {
     if ( status->stream != NULL )
@@ -75,26 +80,80 @@ static void UNGET( int c, struct _PDCLIB_status_t * status )
     }
     else
     {
-        *(--(status->s)) = c;
+        --(status->s);
     }
     --(status->i);
-    --(status->this);
+    --(status->current);
+}
+
+
+/* Helper function to check if character rc is in the scanset [scanlist, end_scanlist) */
+static bool IN_SCANSET( const char * scanlist, const char * end_scanlist, int rc )
+{
+    // SOLAR
+    int previous = -1; /* last character checked verbatim; -1 = none, so a following '-' is literal */
+    while ( scanlist != end_scanlist )
+    {
+        if ( ( *scanlist == '-' ) && ( previous != -1 ) )
+        {
+            /* possible scangroup ("a-z") */
+            if ( ++scanlist == end_scanlist )
+            {
+                /* '-' at end of scanlist does not describe a scangroup */
+                return rc == '-';
+            }
+            while ( ++previous <= (unsigned char)*scanlist ) /* previous+1 up to and including the range end */
+            {
+                if ( previous == rc )
+                {
+                    return true;
+                }
+            }
+            previous = -1; /* scanlist still points at the range end, which the next pass re-checks verbatim */
+        }
+        else
+        {
+            /* not a scangroup, check verbatim */
+            if ( rc == (unsigned char)*scanlist )
+            {
+                return true;
+            }
+            previous = (unsigned char)(*scanlist++);
+        }
+    }
+    return false;
 }
 
 
 const char * _PDCLIB_scan( const char * spec, struct _PDCLIB_status_t * status )
 {
+    /* generic input character */
+    int rc;
     const char * orig_spec = spec;
     if ( *(++spec) == '%' )
     {
         /* %% -> match single '%' */
-        MATCH( *spec, status );
-        return ++spec;
+        rc = GET( status );
+        switch ( rc )
+        {
+            case EOF:
+                /* input error */
+                if ( status->n == 0 )
+                {
+                    status->n = -1;
+                }
+                return NULL;
+            case '%':
+                return ++spec;
+            default:
+                UNGET( rc, status );
+                break;
+        }
     }
     /* Initializing status structure */
     status->flags = 0;
     status->base = -1;
-    status->this = 0;
+    status->current = 0;
     status->width = 0;
     status->prec = 0;
 
@@ -109,7 +168,12 @@ const char * _PDCLIB_scan( const char * spec, struct _PDCLIB_status_t * status )
        strtol() will return zero. In both cases, endptr will point to the
        rest of the conversion specifier - just what we need.
     */
+    char const * prev_spec = spec;
     status->width = (int)strtol( spec, (char**)&spec, 10 );
+    if ( spec == prev_spec )
+    {
+        status->width = SIZE_MAX;
+    }
 
     /* Optional length modifier
        We step one character ahead in any case, and step back only if we find
@@ -166,6 +230,10 @@ const char * _PDCLIB_scan( const char * spec, struct _PDCLIB_status_t * status )
     }
 
     /* Conversion specifier */
+
+    /* whether any valid input has been parsed yet */
+    bool value_parsed = false;
+
     switch ( *spec )
     {
         case 'd':
@@ -196,13 +264,137 @@ const char * _PDCLIB_scan( const char * spec, struct _PDCLIB_status_t * status )
         case 'A':
             break;
         case 'c':
-            /* TODO */
-            break;
+        {
+            char * c = va_arg( status->arg, char * );
+            /* for %c, default width is one */
+            if ( status->width == SIZE_MAX )
+            {
+                status->width = 1;
+            }
+            /* reading until width reached or input exhausted */
+            while ( ( status->current < status->width ) &&
+                    ( ( rc = GET( status ) ) != EOF ) )
+            {
+                *(c++) = rc;
+                value_parsed = true;
+            }
+            /* width or input exhausted */
+            if ( value_parsed )
+            {
+                ++status->n;
+                return ++spec;
+            }
+            else
+            {
+                /* input error, no character read */
+                if ( status->n == 0 )
+                {
+                    status->n = -1;
+                }
+                return NULL;
+            }
+        }
         case 's':
-            /* TODO */
-            break;
+        {
+            char * c = va_arg( status->arg, char * );
+            while ( ( status->current < status->width ) && 
+                    ( ( rc = GET( status ) ) != EOF ) )
+            {
+                if ( isspace( rc ) )
+                {
+                    UNGET( rc, status );
+                    if ( value_parsed )
+                    {
+                        /* matching sequence terminated by whitespace */
+                        *c = '\0';
+                        ++status->n;
+                        return ++spec;
+                    }
+                    else
+                    {
+                        /* matching error */
+                        return NULL;
+                    }
+                }
+                else
+                {
+                    /* match */
+                    value_parsed = true;
+                    *(c++) = rc;
+                }
+            }
+            /* width or input exhausted */
+            if ( value_parsed )
+            {
+                *c = '\0';
+                ++status->n;
+                return ++spec;
+            }
+            else
+            {
+                /* input error, no character read */
+                if ( status->n == 0 )
+                {
+                    status->n = -1;
+                }
+                return NULL;
+            }
+        }
+        case '[':
+        {
+            const char * endspec = spec;
+            bool negative_scanlist = false;
+            if ( *(++endspec) == '^' )
+            {
+                negative_scanlist = true;
+                ++endspec;
+            }
+            spec = endspec;
+            do
+            {
+                // TODO: no check for '\0' here - a format string lacking the closing ']' makes this loop run past its end
+                ++endspec;
+            } while ( *endspec != ']' );
+            // read according to scanlist, equiv. to %s above
+            char * c = va_arg( status->arg, char * );
+            while ( ( status->current < status->width ) && 
+                    ( ( rc = GET( status ) ) != EOF ) )
+            {
+                if ( negative_scanlist )
+                {
+                    if ( IN_SCANSET( spec, endspec, rc ) )
+                    {
+                        UNGET( rc, status );
+                        break;
+                    }
+                }
+                else
+                {
+                    if ( ! IN_SCANSET( spec, endspec, rc ) )
+                    {
+                        UNGET( rc, status );
+                        break;
+                    }
+                }
+                value_parsed = true;
+                *(c++) = rc;
+            }
+            if ( value_parsed )
+            {
+                *c = '\0';
+                ++status->n;
+                return ++endspec;
+            }
+            else
+            {
+                if ( rc == EOF )
+                {
+                    status->n = -1;
+                }
+                return NULL;
+            }
+        }
         case 'p':
-            /* TODO */
             status->base = 16;
             status->flags |= E_unsigned;
             break;
@@ -216,67 +408,190 @@ const char * _PDCLIB_scan( const char * spec, struct _PDCLIB_status_t * status )
             /* No conversion specifier. Bad conversion. */
             return orig_spec;
     }
-    bool zero = false;
+
     if ( status->base != -1 )
     {
-        bool value = false;
-        int rc;
-        if ( ( rc = MATCH( '0', status ) ) >= 0 )
+        /* integer conversion */
+        uintmax_t value = 0;         /* absolute value read */
+        bool prefix_parsed = false;
+        int sign = 0;
+        while ( ( status->current < status->width ) &&
+                ( ( rc = GET( status ) ) != EOF ) )
         {
-            if ( ( rc = MATCH( 'x', status ) ) >= 0 )
+            if ( isspace( rc ) )
             {
-                if ( ( status->base == 0 ) || ( status->base == 16 ) )
+                if ( sign )
                 {
-                    status->base = 16;
+                    /* matching sequence terminated by whitespace */
+                    UNGET( rc, status );
+                    break;
                 }
                 else
                 {
-                    UNGET( rc, status );
-                    value = true;
+                    /* leading whitespace not counted against width */
+                    status->current--;
                 }
             }
-            else if ( rc == MATCH_FAIL )
+            else if ( ! sign )
             {
-                if ( status->base == 0 )
+                /* no sign parsed yet */
+                switch ( rc )
                 {
-                    status->base = 8;
+                    case '-':
+                        sign = -1;
+                        break;
+                    case '+':
+                        sign = 1;
+                        break;
+                    default:
+                        /* not a sign; put back character */
+                        sign = 1;
+                        UNGET( rc, status );
+                        break;
+                }
+            }
+            else if ( ! prefix_parsed )
+            {
+                /* no prefix (0x... for hex, 0... for octal) parsed yet */
+                prefix_parsed = true;
+                if ( rc != '0' )
+                {
+                    /* not a prefix; if base not yet set, set to decimal */
+                    if ( status->base == 0 )
+                    {
+                        status->base = 10;
+                    }
+                    UNGET( rc, status );
                 }
                 else
                 {
-                    value = true;
+                    /* starts with zero, so it might be a prefix. */
+                    /* check what follows next (might be 0x...) */
+                    if ( ( status->current < status->width ) &&
+                         ( ( rc = GET( status ) ) != EOF ) )
+                    {
+                        if ( tolower( rc ) == 'x' )
+                        {
+                            /* 0x... would be prefix for hex base... */
+                            if ( ( status->base == 0 ) ||
+                                 ( status->base == 16 ) )
+                            {
+                                status->base = 16;
+                            }
+                            else
+                            {
+                                /* ...unless already set to other value */
+                                UNGET( rc, status );
+                                value_parsed = true;
+                            }
+                        }
+                        else
+                        {
+                            /* 0... but not 0x.... would be octal prefix */
+                            UNGET( rc, status );
+                            if ( status->base == 0 )
+                            {
+                                status->base = 8;
+                            }
+                            /* in any case we have read a zero */
+                            value_parsed = true;
+                        }
+                    }
+                    else
+                    {
+                        /* failed to read beyond the initial zero */
+                        value_parsed = true;
+                        break;
+                    }
                 }
             }
             else
             {
-                /* TODO: MATCH_ERROR */
+                char * digitptr = memchr( _PDCLIB_digits, tolower( rc ), status->base );
+                if ( digitptr == NULL )
+                {
+                    /* end of input item */
+                    UNGET( rc, status );
+                    break;
+                }
+                value *= status->base;
+                value += digitptr - _PDCLIB_digits;
+                value_parsed = true;
             }
         }
-        else if ( rc == MATCH_FAIL )
+        /* width or input exhausted, or non-matching character */
+        if ( ! value_parsed )
         {
-            if ( status->base == 0 )
+            /* out of input before anything could be parsed - input error */
+            /* FIXME: if first character does not match, value_parsed is not set - but it is NOT an input error */
+            if ( ( status->n == 0 ) && ( rc == EOF ) )
             {
-                status->base = 10;
+                status->n = -1;
             }
+            return NULL;
         }
-        else
+        /* convert value to target type and assign to parameter */
+        switch ( status->flags & ( E_char | E_short | E_long | E_llong |
+                                   E_intmax | E_size | E_ptrdiff |
+                                   E_unsigned ) )
         {
-            /* TODO: MATCH_ERROR */
+            ASSIGN_VALUE_TO( E_char, char );
+            ASSIGN_VALUE_TO( E_char | E_unsigned, unsigned char );
+            ASSIGN_VALUE_TO( E_short, short );
+            ASSIGN_VALUE_TO( E_short | E_unsigned, unsigned short );
+            ASSIGN_VALUE_TO( 0, int );
+            ASSIGN_VALUE_TO( E_unsigned, unsigned int );
+            ASSIGN_VALUE_TO( E_long, long );
+            ASSIGN_VALUE_TO( E_long | E_unsigned, unsigned long );
+            ASSIGN_VALUE_TO( E_llong, long long );
+            ASSIGN_VALUE_TO( E_llong | E_unsigned, unsigned long long );
+            ASSIGN_VALUE_TO( E_intmax, intmax_t );
+            ASSIGN_VALUE_TO( E_intmax | E_unsigned, uintmax_t );
+            ASSIGN_VALUE_TO( E_size, size_t );
+            /* ASSIGN_VALUE_TO( E_size | E_unsigned, unsigned size_t ); */
+            ASSIGN_VALUE_TO( E_ptrdiff, ptrdiff_t );
+            /* ASSIGN_VALUE_TO( E_ptrdiff | E_unsigned, unsigned ptrdiff_t ); */
+            default:
+                puts( "UNSUPPORTED SCANF FLAG COMBINATION" );
+                return NULL; /* behaviour unspecified */
         }
-        /* TODO: Integer conversion */
-    }
-    else
-    {
-        /* TODO: Float conversions? */
+        ++(status->n);
+        return ++spec;
     }
+    /* TODO: Floats. */
     return NULL;
 }
 
+
 #ifdef TEST
+#define _PDCLIB_FILEID "_PDCLIB/scan.c"
+#define _PDCLIB_STRINGIO
+
 #include <_PDCLIB_test.h>
 
+/* Test driver: runs _PDCLIB_scan() once on 'format', reading input from the
+   string 's' (status.stream is NULL, so the string path is used) and taking
+   the destination pointers from the variadic arguments.
+   Reports a test failure if the scan succeeded but did not consume the whole
+   format. A NULL return (input or matching failure in _PDCLIB_scan()) is not
+   dereferenced and not reported here; it is visible to the caller through
+   the returned count.
+   Returns status.n as left behind by _PDCLIB_scan().
+*/
+static int testscanf( char const * s, char const * format, ... )
+{
+    struct _PDCLIB_status_t status;
+    char const * rest;
+    status.n = 0;
+    status.i = 0;
+    status.s = (char *)s;
+    status.stream = NULL;
+    va_start( status.arg, format );
+    rest = _PDCLIB_scan( format, &status );
+    /* _PDCLIB_scan() returns NULL on failure; only look at a valid pointer */
+    if ( ( rest != NULL ) && ( *rest != '\0' ) )
+    {
+        printf( "_PDCLIB_scan() did not return end-of-specifier on '%s'.\n", format );
+        ++TEST_RESULTS;
+    }
+    va_end( status.arg );
+    return status.n;
+}
+
+#define TEST_CONVERSION_ONLY
+
 int main( void )
 {
-    TESTCASE( NO_TESTDRIVER );
+    char source[100]; /* not used in this file; presumably the input buffer for scanf_testcases.h - confirm there */
+#include "scanf_testcases.h"
     return TEST_RESULTS;
 }