Using strtok() to parse a string into separate words.


Source listing
//=============================================================
// File: Code501_Goodies.cpp
// Purpose: String parser demonstration using both the standard
//        C function strtok() and a local function which does
//        the same thing.  The local function demonstrates how
//        strtok() works.
// Author: Dr. Rick Coleman
//=============================================================
#include <cstring>
#include <iostream>
#include <string>

using namespace std;

// Function prototype for the local strtok() work-alike defined below main().
// Pass the string to parse on the first call and NULL on later calls to
// continue with the same string; splitChars lists the separator characters.
char *myStrtok(char *parseStr, char *splitChars);


//-------------------------------------------------
// Function: main()
// Purpose: Tokenize the same sentence twice, first with
//        the standard strtok() and then with the local
//        myStrtok(), printing each token so the two
//        results can be compared.
// Args: argc, argv - Command line arguments (unused).
// Returns: 0 always
//-------------------------------------------------
int main(int argc, char **argv)
{
    // The sentence parsed by both demonstrations.  It is kept in one place
    // so both tokenizers are guaranteed to see identical input.
    const char sentence[] = "This is a string, with punctuation. It is used to demonstrate parsing.";

    char    str[128];    // Working copy of the sentence; tokenizers modify it in place
    // Characters used to split tokens on: space, period, question mark,
    // exclamation mark, comma, semicolon, and colon.  Letting the compiler
    // size the array avoids an exact-fit buffer that overflows if a
    // character is ever added.
    char    tokChars[] = " .?!,;:";
    char    *cptr;       // Pointer to string returned by tokenizers
    int     count;       // Counter for words in the string

    (void)argc; // Command line arguments are not used by this demo
    (void)argv;

    //---------------------------------------------
    //     Demonstrate string parsing with strtok()
    //---------------------------------------------
    cout << "Demonstrating parsing a string into tokens using strtok()\n";

    // Both tokenizers overwrite split characters with NULL terminators, so
    // each demonstration works on a fresh copy of the sentence.
    strcpy(str, sentence);

    // The first call passes the string to be parsed; every later call passes
    // NULL so strtok() continues with the last "non-NULL" string it was given.
    // It returns NULL when the entire string has been parsed.  Note that the
    // punctuation has been eliminated leaving just the words from the string.
    count = 1;
    for(cptr = strtok(str, tokChars); cptr != NULL; cptr = strtok(NULL, tokChars))
    {
        cout << "Token " << count << " -->" << cptr << "<--\n";
        count++;
    }

    cout << "\n----- End demonstration of strtok() -----\n";

    cout << "\nPress [Enter] for demonstration of next tokenizer.\n";
    cin.get();

    //-----------------------------------------------
    //     Demonstrate string parsing with myStrtok()
    //-----------------------------------------------
    cout << "Demonstrating parsing a string into tokens using myStrtok()\n";
    cout << "which duplicates the functionality of the standard strtok().\n";

    // Recreate the string to be parsed; the first pass chopped it up.
    // The same set of split characters is used.
    strcpy(str, sentence);

    // Same calling pattern as strtok(): the string on the first call, then
    // NULL until myStrtok() returns NULL.
    count = 1;
    for(cptr = myStrtok(str, tokChars); cptr != NULL; cptr = myStrtok(NULL, tokChars))
    {
        cout << "Token " << count << " -->" << cptr << "<--\n";
        count++;
    }

    cout << "\n----- End demonstration of myStrtok() -----\n";
    return 0;
}


//-------------------------------------------------
// Function: isSplitChar()
// Purpose: Report whether a character is one of the
//        characters in a NULL terminated set of split
//        characters.
// Args: ch - Character to test.
//       splitChars - NULL terminated array of split
//            characters.
// Returns: 1 if ch appears in splitChars, otherwise 0.
//        The NULL terminator itself never matches.
//-------------------------------------------------
static int isSplitChar(char ch, const char *splitChars)
{
    for(; *splitChars != '\0'; splitChars++)
    {
        if(*splitChars == ch) return 1;
    }
    return 0;
}

//-------------------------------------------------
// Function: myStrtok()
// Purpose: Split a string passed in into tokens
//        using a given string of characters as the
//        split characters.  Works the same way as
//        the standard strtok().
// Args: parseStr - String to be parsed or NULL if
//            parsing is to continue with the current
//            string.  The string is modified: the
//            split character ending each token is
//            overwritten with a NULL terminator.
//       splitChars - NULL terminated array of
//            characters to use as the split
//            characters.  Must not be NULL.
// Returns: Pointer to next token in the string, or
//        NULL when there are no more tokens (also
//        when NULL is passed before any string has
//        been supplied).
//--------------------------------------------------
char *myStrtok(char *parseStr, char *splitChars)
{
    // Note: This var is declared as "static" which means it retains its
    //    last value between calls to this function.  It remembers where
    //    parsing should resume when the caller passes NULL for parseStr.
    static char *pStr = NULL; // Current position in the string being tokenized.

    char *tok;  // Pointer to start of next token

    //----------------------------------------------------------------
    // Case 1: A new string to be parsed was passed in, so start over
    //            at the beginning of it.
    //----------------------------------------------------------------
    if(parseStr != NULL)
        pStr = parseStr;

    //----------------------------------------------------------------
    // Case 2: There is nothing to parse.  Either no string has ever
    //            been supplied (pStr is still NULL) or a previous call
    //            left pStr on the NULL terminator at the end of the
    //            string.
    //----------------------------------------------------------------
    if((pStr == NULL) || (*pStr == '\0')) return NULL;

    //----------------------------------------------------------------
    // Case 3: Find the next token.
    //        Step 1: Skip any leading split characters.  The scan stops
    //            at the first character that is not a split character,
    //            or at the NULL terminator at the end of the string.
    //----------------------------------------------------------------
    tok = pStr;
    while((*tok != '\0') && isSplitChar(*tok, splitChars)) tok++;

    // Only split characters were left.  Park pStr on the terminator so the
    //    next call exits through Case 2, and report "no more tokens" with
    //    NULL just as strtok() does (returning tok here would hand the
    //    caller an empty string instead).
    if(*tok == '\0')
    {
        pStr = tok;
        return NULL;
    }

    //----------------------------------------------------------------
    //        Step 2: Find the end of this token.  This is the next
    //            occurrence of one of the characters in splitChars or
    //            the NULL terminator marking the end of the string.
    //----------------------------------------------------------------
    pStr = tok;
    while((*pStr != '\0') && !isSplitChar(*pStr, splitChars)) pStr++;

    //----------------------------------------------------------------
    //        Step 3: Set up for the return and next call.
    //            If pStr stopped on a split character, overwrite it with
    //            a NULL terminator so the caller sees the token by itself
    //            and step past it so the next call resumes after it.  If
    //            pStr stopped on the end of the string it is left there
    //            so the next call returns NULL.
    //----------------------------------------------------------------
    if(*pStr != '\0')
    {
        *pStr = '\0';
        pStr++;
    }

    return tok; // Return the pointer to the next token
}