{============================================================================} {= =} {= 'GetSym.Pas' =} {= =} {= Pascal compiler lexical analyzer. =} {= =} {= 12/14/86 - Bruce Tomlin =} {= =} {============================================================================} {$R+} { Turn on range checking in this module } UNIT GetSym; INTERFACE USES MacDefs, Files, SymTab, Error; (* TYPE { Token is defined in 'SymTab' } Token = (intConst,realConst,strConst,ident,noSym, becomes,range,eq,neq,lt,gt,leq,geq,comma,dot, lparen,rparen,lbrack,rbrack,colon,semicolon, plus,minus,times,divide,kARRAY,kBEGIN, kCONST,kDIV,kDO,kELSE,kEND,kFOR,kFUNCTION, kIF,kINTEGER,kOF,kPROCEDURE,kPROGRAM, kREAL,kRECORD,kTHEN,kTO,kTYPE,kVAR,kWHILE); *) VAR { Compiler option flags } symDumpFlag: BOOLEAN; { TRUE to enable listing symbol table } treeDumpFlag: BOOLEAN; { TRUE to enable listing parse tree } optConstExpr: BOOLEAN; { TRUE to enable optimization of constant exprs } { Variables returned from GetSym } sym: Token; { Last token read in } id: Str255; { Last identifier or strConst read in GetSym } idPtr: SymPtr; { Pointer to identifier in symbol table, or NIL } intNum: INTEGER; { Last intConst read in GetSym } realNum: REAL; { Last realConst read in GetSym } PROCEDURE GetSym; PROCEDURE InitGetSym; IMPLEMENTATION CONST newline = 13; { ASCII code for newline character } VAR { 'linePos' and 'readLine' are defined in 'Error' } (* linePos: INTEGER; { Current position in 'readLine' } readLine: Str255; { Current line being scanned } *) { Variables shared among GetCh and GetSym } ch: CHAR; { Last character read from 'readLine' } inComment: BOOLEAN; { TRUE if currently scanning a comment } { Static variables used by GetCh } eofHit: BOOLEAN; { Used to allow last token before eof to finish up } { Static variables used by GetSym } nextSym: Token; { next token if GetSym read too far, ex: in '1..2' } {============================================================================} {= =} {= GETCH =} {= =} {= Gets another character from the current input device. =} {= =} {============================================================================} PROCEDURE GetCh; BEGIN { If end of file has been reached unexpectedly, abort the compiler } IF eofHit AND inComment THEN Error([eLex,eFatal],'End of input encountered while reading a comment.'); IF eofHit THEN Error([eLex,eFatal],'End of input encountered before end of program.'); { If the current line is exhausted, we need another line } IF linePos>Length(readLine) THEN BEGIN IF Eof(inFile) THEN BEGIN { If end of file has been reached, } { delay it until the next call to GetCh } readLine := ''; eofHit := TRUE; END ELSE BEGIN { Get the next line from input, and echo it so that } { errors will show up under the line where they occurred. } ReadLn(inFile,readLine); IF NOT screenOut THEN Write(outFile,'; '); WriteLn(outFile,readLine); { Allow the user to pause the compiler here } Pause; END; { Start reading the new line at the first character } linePos := 1; { Pass a newline character as white space so that } { GetSym can detect a broken string constant. } ch := CHR(newline); { If the length of the line is 255 characters it could } { have been more, but we can't know, so warn the user. } IF Length(readLine)=255 THEN Error([eLex],'Source line exceeds 254 characters.'); END ELSE BEGIN { Get the next character from the line } ch := readLine[linePos]; linePos := linePos + 1; END; END; {============================================================================} {= =} {= FINDKEYWORD =} {= =} {= Searches the keyword table for the current identifier. =} {= =} {= Returns: =} {= If a keyword is found, the token for that keyword is returned, =} {= otherwise the token 'ident' is returned. =} {= If the identifier was found in the symbol table, a pointer to the =} {= identifier's symbol table entry is placed in the global variable =} {= 'idPtr', else 'idPtr' is set to NIL if the symbol was not found. =} {= =} {============================================================================} FUNCTION FindKeyword: Token; BEGIN { Find the identifier in the symbol table } idPtr := FindSym(id); { If the identifier was not found, it isn't a keyword, so return 'ident' } IF idPtr=NIL THEN FindKeyword := ident { If the identifier is in the symbol table and } { it is a keyword, return its token value. } ELSE IF idPtr^.typ=tKeyword THEN FindKeyword := idPtr^.kwSym { But if it isn't a keyword, just return 'ident' } ELSE FindKeyWord := ident; END; {============================================================================} {= =} {= TOUPPER =} {= =} {= Converts a character to uppercase. =} {= =} {============================================================================} FUNCTION ToUpper(ch: CHAR): CHAR; BEGIN IF ch in ['a'..'z'] THEN ch := CHR(ORD(ch)-32); ToUpper := ch; END; {============================================================================} {= =} {= TONUM =} {= =} {= Converts an ASCII digit to its numeric equivalent. =} {= The return value is undefined if 'ch' is not an ASCII digit. =} {= =} {============================================================================} FUNCTION ToNum(ch: CHAR): INTEGER; BEGIN ToNum := ORD(ch) - ORD('0'); END; {============================================================================} {= =} {= ALPHA =} {= =} {= Returns TRUE if the character is an alphabetic character. =} {= =} {============================================================================} FUNCTION Alpha(ch: CHAR): BOOLEAN; BEGIN Alpha := ch IN ['A'..'Z','a'..'z']; END; {============================================================================} {= =} {= NUMERIC =} {= =} {= Returns TRUE if the character is a numeric character. =} {= =} {============================================================================} FUNCTION Numeric(ch: CHAR): BOOLEAN; BEGIN Numeric := ch IN ['0'..'9']; END; {============================================================================} {= =} {= WHITE =} {= =} {= Returns TRUE if the character is a white space character. =} {= =} {============================================================================} FUNCTION White(ch: CHAR): BOOLEAN; BEGIN White := (ch<=' '); END; {============================================================================} {= =} {= CONCATCHAR =} {= =} {= Concatenates a character to the end of a string. =} {= =} {============================================================================} PROCEDURE ConcatChar(VAR str: Str255; ch: CHAR); BEGIN IF Length(str)=255 THEN { String already full } ELSE BEGIN str[0] := CHR(Length(str)+1); str[Length(str)] := ch; END; END; {============================================================================} {= =} {= SETOPTION =} {= =} {= Set a compiler option true if a '+' is encountered, false if =} {= a '-' is encountered, unchanged with an error message otherwise. =} {= =} {============================================================================} PROCEDURE SetOption(VAR flag: BOOLEAN); BEGIN GetCh; { Eat the option character } CASE ch OF '+': flag := TRUE; '-': flag := FALSE; OTHERWISE Error([eLex],'''+'' or ''-'' expected in compiler option.'); END; END; {============================================================================} {= =} {= DOOPTIONS =} {= =} {= Process a compiler option found inside of a '{$' comment. =} {= =} {= Valid options are: =} {= =} {= $DS+ $DS- Turn symbol table listing on/off. =} {= $DT+ $DT- Turn parse tree listing on/off. =} {= $OC+ $OC- Turn constant expression optimization on/off. =} {= =} {============================================================================} PROCEDURE DoOptions; BEGIN CASE ToUpper(ch) OF 'D': BEGIN GetCh; { Eat the 'D' character } CASE ToUpper(ch) OF 'S': SetOption(symDumpFlag); 'T': SetOption(treeDumpFlag); END; END; 'O': BEGIN GetCh; { Eat the 'O' character } CASE ToUpper(ch) OF 'C': SetOption(optConstExpr); END; END; END; END; {============================================================================} {= =} {= GETSYM =} {= =} {= Reads in the next token. =} {= =} {= Returns: =} {= =} {= SYM The token for the last symbol read in =} {= =} {= ID The last identifier or string constant read in =} {= Valid only if sym='ident' or 'strConst', =} {= or if sym is a keyword token. =} {= =} {= IDPTR The pointer to the symbol 'id' in the symbol table. =} {= If the identifier was not found, 'idPtr' is set to =} {= NIL. =} {= Valid only if sym='ident' or if 'sym' is a =} {= keyword token. =} {= =} {= INTNUM The last integer constant read in =} {= Valid only if sym='intConst'. =} {= =} {= REALNUM The last real number constant read in =} {= Valid only if sym='realConst'. =} {= =} {============================================================================} PROCEDURE GetSym; VAR frac: REAL; { Scaling factor for fractional part of a realConst } symCh: CHAR; { First character of a special-character symbol } done: BOOLEAN; { Used to indicate end of getting a strConst } scaleFact: INTEGER; { Scale factor of an 'E' format realConst } scaleSign: INTEGER; { Sign of the scale factor of an 'E' realConst } BEGIN { If there is an extra symbol to be returned by GetSym, return it now. } { This is currently only used in the case where an integer constant is } { followed by the '..' symbol, when GetSym has read too far and then } { finds out that the first period is not a decimal point after all. } IF nextSym<>noSym THEN BEGIN sym := nextSym; nextSym := noSym; END ELSE BEGIN { Not currently in a comment } inComment := FALSE; { Read through all white space characters } WHILE White(ch) DO GetCh; IF Alpha(ch) THEN BEGIN { If the character is a letter, build up an identifier into 'id' } id := ''; WHILE Alpha(ch) OR Numeric(ch) OR (ch='_') DO BEGIN ConcatChar(id,ToUpper(ch)); GetCh; END; { Find out if the identifier is really a keyword } sym := FindKeyword; END ELSE IF Numeric(ch) THEN BEGIN { If the character is a digit, build up a numeric constant } intNum := 0; { Initialize the numeric return values to zero } realNum := 0.0; sym := intConst; { The number starts out as an integer constant } { Read in the integer part of the number } WHILE Numeric(ch) DO BEGIN { If the number is going to go past 32767, make it real } IF (intNum>3276) OR ((intNum=3276) AND (ToNum(ch)>=8)) THEN sym := realConst; intNum := intNum * 10 + ToNum(ch); realNum := realNum * 10 + ToNum(ch); GetCh; END; { Check for a fractional part of a number } IF ch='.' THEN BEGIN { If two periods in a row are encountered, we've read too } { far and need to return 'range' from the next GetSym call } GetCh; IF ch='.' THEN BEGIN nextSym := range; GetCh; END ELSE BEGIN { The decimal point forces the number to be real } sym := realConst; { Read in the fractional part of the number } frac := 0.1; WHILE Numeric(ch) DO BEGIN realNum := realNum + ToNum(ch) * frac; frac := frac / 10; GetCh; END; END; END; { If the next character is 'E' and the '..' symbol } { wasn't accidentally read in, read in the scale factor } IF (ch IN ['E','e']) AND (nextSym=noSym) THEN BEGIN GetCh; { The 'E' character forces the number to be real } sym := realConst; { Read in the sign of the scale factor } scaleSign := 1; CASE ch OF '+': GetCh; '-': BEGIN scaleSign := -1; GetCh; END; END; { Read in the integer part of the scale factor } scaleFact := 0; WHILE Numeric(ch) DO BEGIN scaleFact := scaleFact * 10 + ToNum(ch); GetCh; END; { Multiply the number by ten to the power of the scale factor } scaleFact := scaleFact * scaleSign; realNum := realNum * XPwrI(10.0,scaleFact); END; END { If the character was not a number or a letter, } { check for a token made up of special characters } ELSE BEGIN symCh := ch; GetCh; CASE symCh OF { Check for ':' and ':=' } ':': IF ch='=' THEN BEGIN sym := becomes; GetCh; END ELSE sym := colon; { Check for '.' and '..' } '.': IF ch='.' THEN BEGIN sym := range; GetCh; END ELSE sym := dot; { Check for '<', '<=', and '<>' } '<': CASE ch OF '=': BEGIN sym := leq; GetCh; END; '>': BEGIN sym := neq; GetCh; END; OTHERWISE sym := lt; END; { Check for '=', '=<', and '=>' } '=': CASE ch OF '<': BEGIN sym := leq; GetCh; END; '>': BEGIN sym := geq; GetCh; END; OTHERWISE sym := eq; END; { Check for '>' and '>=' } '>': IF ch='=' THEN BEGIN sym := geq; GetCh; END ELSE sym := gt; { Check for a string constant } '''': BEGIN { Start with a null string constant } sym := strConst; id := ''; { Keep reading characters into the string until } { a close quote or broken string is encountered } done := FALSE; REPEAT { Read until a close quote or a broken string } WHILE (ch<>'''') AND NOT done DO BEGIN { If the character is a newline, } { we have a broken string, so exit } IF ch=CHR(newline) THEN BEGIN Error([eLex],'String split over end of line'); done := TRUE; END { Otherwise, add the character into } { the string and get another char } ELSE BEGIN ConcatChar(id,ch); GetCh; END; END; { If the next char is a quote, insert } { a quote into the string and continue, } { otherwise, the string has ended } GetCh; IF (ch='''') AND NOT done THEN BEGIN ConcatChar(id,ch); GetCh; END ELSE done := TRUE; UNTIL done; END; { Check for a '{' comment } '{': BEGIN inComment := TRUE; IF ch='$' THEN BEGIN GetCh; { Eat the '$' character } DoOptions; END; WHILE ch<>'}' DO GetCh; GetCh; GetSym; END; { Check for '(' or a '(*' comment } '(': IF ch<>'*' THEN sym := lparen ELSE BEGIN inComment := TRUE; GetCh; { Eat the '*' character } IF ch='$' THEN BEGIN GetCh; { Eat the '$' character } DoOptions; END; REPEAT WHILE ch<>'*' DO GetCh; GetCh; UNTIL ch=')'; GetCh; GetSym; END; { Check for unambiguous single-character tokens } ')': sym := rparen; '[': sym := lbrack; ']': sym := rbrack; ',': sym := comma; ';': sym := semicolon; '+': sym := plus; '-': sym := minus; '*': sym := times; '/': sym := divide; '­': sym := neq; '²': sym := leq; '³': sym := geq; { Character not recognized, just complain and ignore it } OTHERWISE BEGIN; Error([eLex],'Illegal character in input.'); GetSym; END; END; END; END; END; {============================================================================} {= =} {= ADDKEYWORD =} {= =} {= Add a keyword into the keyword list. =} {= =} {= Parameters: =} {= =} {= KWNAME The keyword's name =} {= KWSYM The keyword's token =} {= =} {============================================================================} PROCEDURE AddKeyword(kwName: Str255; kwSym: Token); VAR p: SymPtr; { Pointer to the new keyword element to add to the list } BEGIN { Allocate a new keyword element } p := AddSym(kwName,tKeyword); { Fill in the information } p^.kwSym := kwSym; END; {============================================================================} {= =} {= INITKEYWORDS =} {= =} {= Initialize the keyword list. =} {= =} {============================================================================} PROCEDURE InitKeywords; BEGIN { of MakeKeywords } AddKeyword('ARRAY', kARRAY); AddKeyword('BEGIN', kBEGIN); AddKeyword('CONST', kCONST); AddKeyword('DIV', kDIV); AddKeyword('DO', kDO); AddKeyword('ELSE', kELSE); AddKeyword('END', kEND); AddKeyword('FOR', kFOR); AddKeyword('FUNCTION', kFUNCTION); AddKeyword('IF', kIF); AddKeyword('OF', kOF); AddKeyword('PROCEDURE', kPROCEDURE); AddKeyword('PROGRAM', kPROGRAM); AddKeyword('RECORD', kRECORD); AddKeyword('THEN', kTHEN); AddKeyword('TO', kTO); AddKeyword('TYPE', kTYPE); AddKeyword('VAR', kVAR); AddKeyword('WHILE', kWHILE); AddKeyword('WRITELN', kWriteLn); END; {============================================================================} {= =} {= INITOPTIONS =} {= =} {= Initialize the compiler options. =} {= =} {============================================================================} PROCEDURE InitOptions; BEGIN symDumpFlag := FALSE; treeDumpFlag := FALSE; optConstExpr := FALSE; END; {============================================================================} {= =} {= INITGETSYM =} {= =} {= Initialize the lexical analyzer =} {= =} {============================================================================} PROCEDURE InitGetSym; BEGIN InitOptions; { Initialize the compiler options } InitKeywords; { Initialize the keywords } eofHit := FALSE; { End of file has not been reached yet } readLine := ''; { Force an immediate read in GetCh } linePos := 1; ch := ' '; { White space forces GetSym to immediately call GetCh } nextSym := noSym; { No extra symbol to be returned by GetSym } GetSym; { Start the pipeline } END; END.