Tokenize.cc

Go to the documentation of this file.
00001 // Tekkodu Library
00002 #include "Kodu/Parsing/Parser.h"
00003 
00004 // C++ Library
00005 #include <cctype>
00006 
00007 namespace Kodu {
00008 
00009     bool Parser::TokenParser::tokenize(const std::string& kString, std::vector<TokenBase*>& tokens) {
00010         std::size_t pos = 0;
00011         const std::size_t kSize = kString.size();
00012         while (pos < kSize) {
00013             // get the current character
00014             char currChar = kString[pos];
00015 
00016             // ASSERTION: the current character is either a double quote, a whitespace, a digit, or a letter
00017             PARSER_ASSERT((currChar == '"' || currChar == ' ' || isalnum(currChar) || currChar == '<'
00018                 || currChar == '>' || currChar == ':'),
00019                 errorMessage << "The character '" << currChar << "' is unrecognized.");
00020 
00021             // check what the current character is
00022             // check if current character is a double quote (") identifying a literal string
00023             if (currChar == '"') {
00024                 // find the next occurence of the double quote (after the first)
00025                 std::size_t nextDqOccurence = kString.find('"', pos + 1);
00026 
00027                 // ASSERTION: the second double quote was found
00028                 PARSER_ASSERT((nextDqOccurence != std::string::npos),
00029                     errorMessage << "Could not find a closing double quote (first occurrence of (\") at Col "
00030                         << (pos + 1) << ".");
00031 
00032                 // capture the literal string without the double quotes
00033                 tokens.push_back(new StringToken(kString.substr(pos + 1, nextDqOccurence - pos - 1)));
00034 
00035                 // set the position to the index after the second double quote
00036                 pos = nextDqOccurence + 1;
00037                 continue;
00038             }
00039 
00040             if (currChar == ':') {
00041                 // ASSERTION: the character after the indentation marker is a whitespaces
00042                 PARSER_ASSERT((pos + 1 < kSize && kString[pos + 1] == ' '),
00043                     errorMessage << "(Col " << (pos + 2) << ") The character '"
00044                     << kString[pos + 1] << "' is an invalid character.");
00045 
00046                 // add indentation character to the list of tokens
00047                 tokens.push_back(new KeywordToken(":"));
00048 
00049                 // increment the position by 2
00050                 pos = pos + 2;
00051             }
00052 
00053             // check if current character is a letter
00054             if (isalpha(currChar)) {
00055                 // find the next occurence of whitespace
00056                 std::size_t nextWsOccurence = kString.find(' ', pos + 1);
00057 
00058                 // temporary string to hold the keyword
00059                 std::string keyword;
00060 
00061                 // test if this was the last token in the string
00062                 if (nextWsOccurence == std::string::npos)
00063                     keyword = kString.substr(pos);
00064                 else
00065                     keyword = kString.substr(pos, nextWsOccurence - pos);
00066 
00067                 // check if the string is a valid keyword
00068                 PARSER_ASSERT((koduKeywords.count(keyword) == 1),
00069                     errorMessage << "(Col " << (pos + 1) << ") The token \"" << keyword
00070                     << "\" is not a recognized Kodu keyword.");
00071 
00072                 // add the keyword to the token vector
00073                 tokens.push_back(new KeywordToken(keyword));
00074 
00075                 // set the position to the index after the whitespace or the size of the string is nothing is left
00076                 if (nextWsOccurence == std::string::npos)
00077                     pos = kSize;
00078                 else
00079                     pos = nextWsOccurence + 1;
00080                 continue;
00081             }
00082 
00083             // if check if current character is a digit
00084             if (isdigit(currChar)) {
00085                 // check if the entire "number" string can be converted into an actual floating-point number
00086                 std::size_t tempPos = pos + 1;
00087                 std::size_t stopPos = 0;
00088                 unsigned int dotCount = 0;
00089                 bool hasOnlyZeros = true;
00090 
00091                 // find the next occurence of whitespace
00092                 std::size_t nextWsOccurence = kString.find(' ', tempPos);
00093 
00094                 // test if this was the last token in the string
00095                 if (nextWsOccurence == std::string::npos)
00096                     stopPos = kString.size();
00097                 else
00098                     stopPos = nextWsOccurence;
00099 
00100                 // check each character up to, but not including 
00101                 while (tempPos < stopPos) {
00102                     // ASSERTION: the current character is a digit or a dot
00103                     PARSER_ASSERT((isdigit(kString[tempPos]) || (kString[tempPos] == '.')),
00104                         errorMessage << "(Col " << (tempPos + 1) << ") The character '"
00105                         << kString[tempPos] << "' in token \"" << kString.substr(pos, stopPos - pos)
00106                         << "\" is not a digit [0-9] or a decimal point (.).");
00107 
00108                     // keep track of the number of dots found (there should only be one!)
00109                     if (kString[tempPos] == '.') {
00110                         dotCount++;
00111                     }
00112 
00113                     // ASSERTION: the number of dots is 0 or 1
00114                     PARSER_ASSERT((dotCount == 0 || dotCount == 1),
00115                         errorMessage << "(Col " << (tempPos + 1)
00116                         << ") There is an additional decimal point (.) in the token \""
00117                         << kString.substr(pos, stopPos - pos) << "\".");
00118 
00119                     // check if it is a digit other than zero
00120                     if (isdigit(kString[tempPos]) && kString[tempPos] != '0') {
00121                         hasOnlyZeros = false;
00122                     }
00123 
00124                     // increase the position
00125                     tempPos++;
00126                 }
00127 
00128                 // ASSERTION: the decimal point is not the last character in the range
00129                 PARSER_ASSERT((kString[tempPos] != '.'),
00130                     errorMessage << "(Col " << (tempPos + 1)
00131                     << ") A decimal point cannot the last character of a number.");
00132 
00133                 // ASSERTION: the first character of the number sequence is not a zero (unless followed by a decimal)
00134                 if (kString[pos] == '0') {
00135                   PARSER_ASSERT( ((kString[pos + 1] == '.') || (pos+1 == tempPos)),
00136                         errorMessage << "(Col " << (pos + 2)
00137                         << ") A number can only begin with zero if the zero is followed by a decimal point.");
00138                 }
00139 
00140                 // convert the digits (and decimal point) to a floating-point number
00141                 float value = (float)strtod(kString.substr(pos, stopPos - pos).c_str(), NULL);
00142 
00143                 // I DON'T KNOW IF I HAVE TO DO THIS (TODO 16-JUL-13)
00144                 // make sure if all the chars were not zeros that the return value was greater than zero
00145                 if (!hasOnlyZeros) {
00146                     // ASSERTION: the value is greater than zero if the while loop found any other digit than zero
00147                     PARSER_ASSERT((value > 0.0f),
00148                         errorMessage << "(Col " << (pos + 1)
00149                         << ") There was an error converting \""
00150                         << kString.substr(pos, stopPos - pos) << "\" to a floating-point number. "
00151                         << "Numbers should only contain digits and one dot, if needed.");
00152                 }
00153 
00154                 // add the floating-point number to the token vector
00155                 tokens.push_back(new NumericToken(value));
00156 
00157                 // set the position to the index after the whitespace 
00158                 pos = stopPos + 1;
00159                 continue;
00160             }
00161 
00162             // check if current character is an inequality sign
00163             if (currChar == '<' || currChar == '>') {
00164                 // ASSERTION: there is at least 2 more characters in this string
00165                 PARSER_ASSERT((pos + 2 < kString.size()),
00166                     errorMessage << "(Col " << (pos + 1)
00167                     << ") Invalid use of the inequality signs. There must be a number after the signs.");
00168 
00169                 // ASSERTION: there is an equal sign in the next character position (pos + 1)
00170                 // and a space at position after that (pos + 2)
00171                 PARSER_ASSERT((kString[pos + 1] == '=' && kString[pos + 2] == ' '),
00172                     errorMessage << "(Col " << (pos + 1)
00173                     << ") Invalid use of inequality signs. Correct usage: [number] [space] <= [space] [number]."
00174                     << "(E.g. 10 <= 13).");
00175 
00176                 // add the inequality sign to the token vector
00177                 if (currChar == '<')
00178                     tokens.push_back(new KeywordToken("<="));
00179                 else
00180                     tokens.push_back(new KeywordToken(">="));
00181 
00182                 // set the position to the index after the whitespace
00183                 pos = pos + 3;
00184                 continue;
00185             }
00186 
00187             // check if current character is a whitespace (move unto to the next character)
00188             if (isspace(kString[pos])) {
00189                 pos++;
00190                 continue;
00191             }
00192         }
00193         return true;
00194     }
00195 }