--[[--------------------------------------------------------------------

  llex.lua: Lua 5.1 lexical analyzer in Lua
  This file is part of LuaSrcDiet, based on Yueliang material.

  Copyright (c) 2008 Kein-Hong Man
  The COPYRIGHT file describes the conditions
  under which this software may be distributed.

  See the ChangeLog for more information.

------------------------------------------------------------------------]]

--[[--------------------------------------------------------------------
-- NOTES:
-- * This is a version of the native 5.1.x lexer from Yueliang 0.4.0,
--   with significant modifications to handle LuaSrcDiet's needs:
--   (1) llex.error is an optional error function handler
--   (2) seminfo for strings include their delimiters and no
--       translation operations are performed on them
-- * ADDED shbang handling has been added to support executable scripts
-- * NO localized decimal point replacement magic
-- * NO limit to number of lines
-- * NO support for compatible long strings (LUA_COMPAT_LSTR)
-- * Please read technotes.txt for more technical details.
------------------------------------------------------------------------]]

-- module() replaces this chunk's environment with the module table, so the
-- original globals are kept reachable through 'base'.
local base = _G
local string = require "string"
module "llex"

local find = string.find
local match = string.match
local sub = string.sub

----------------------------------------------------------------------
-- initialize keyword list, variables
----------------------------------------------------------------------

-- set of reserved words: kw[word] == true
local kw = {}
for v in string.gmatch([[
and break do else elseif end false for function if in
local nil not or repeat return then true until while]], "%S+") do
  kw[v] = true
end

-- NOTE: see init() for module variables (externally visible):
--       tok, seminfo, tokln

local z,                -- source stream
      sourceid,         -- name of source
      I,                -- position of lexer
      buff,             -- start position of the token being read
      ln                -- line number

----------------------------------------------------------------------
-- add information to token listing
-- * token: token type name, e.g. "TK_NAME"
-- * info: semantic information (the token's source text)
-- * appends to the module tables tok, seminfo and tokln; the line
--   recorded is the value of ln at the time of the call
----------------------------------------------------------------------

local function addtoken(token, info)
  local i = #tok + 1
  tok[i] = token
  seminfo[i] = info
  tokln[i] = ln
end

----------------------------------------------------------------------
-- handles line number incrementation and end-of-line characters
-- * i: position of a '\n' or '\r' in the source
-- * is_tok: if true, a TK_EOL token holding the EOL text is added
-- * a two-character "\r\n" or "\n\r" pair counts as one line ending
-- * updates I and returns the position just past the line ending
----------------------------------------------------------------------

local function inclinenumber(i, is_tok)
  local sub = sub
  local old = sub(z, i, i)
  i = i + 1  -- skip '\n' or '\r'
  local c = sub(z, i, i)
  if (c == "\n" or c == "\r") and (c ~= old) then
    i = i + 1  -- skip '\n\r' or '\r\n'
    old = old..c
  end
  if is_tok then addtoken("TK_EOL", old) end
  ln = ln + 1
  I = i
  return i
end

----------------------------------------------------------------------
-- initialize lexer for given source _z and source name _sourceid
-- * resets the position, line number and the token tables
-- * a leading '#' line (shbang) is emitted as a TK_COMMENT, followed
--   by a TK_EOL if the line has a line ending
----------------------------------------------------------------------

function init(_z, _sourceid)
  z = _z                        -- source
  sourceid = _sourceid          -- name of source
  I = 1                         -- lexer's position in source
  ln = 1                        -- line number
  tok = {}                      -- lexed token list*
  seminfo = {}                  -- lexed semantic information list*
  tokln = {}                    -- line numbers for messages*
                                -- (*) externally visible thru' module
  --------------------------------------------------------------------
  -- initial processing (shbang handling)
  --------------------------------------------------------------------
  -- q is the '#...' text, r is the (possibly empty) line ending
  local p, _, q, r = find(z, "^(#[^\r\n]*)(\r?\n?)")
  if p then                             -- skip first line
    I = I + #q
    addtoken("TK_COMMENT", q)
    if #r > 0 then inclinenumber(I, true) end
  end
end

----------------------------------------------------------------------
-- returns a chunk name or id, no truncation for long names
-- * a source name starting with '=' or '@' is returned without that
--   first character; anything else (or no name) gives "[string]"
----------------------------------------------------------------------

function chunkid()
  if sourceid and match(sourceid, "^[=@]") then
    return sub(sourceid, 2)  -- remove first char
  end
  return "[string]"
end

----------------------------------------------------------------------
-- formats error message and throws error
-- * a simplified version, does not report what token was responsible
-- * s: message text; line: optional line number, defaults to the
--   current line
-- * 'error' is looked up in the module table first (the optional
--   llex.error handler), falling back to the standard error function
----------------------------------------------------------------------

function errorline(s, line)
  local e = error or base.error
  e(string.format("%s:%d: %s", chunkid(), line or ln, s))
end

------------------------------------------------------------------------
-- count separators ("=") in a long string delimiter
-- * i: position of the first '[' or ']' of a candidate delimiter
-- * leaves I at the character following the run of '='
-- * returns the count (>= 0) if that character equals the first one,
--   otherwise (-count)-1, i.e. a negative number
------------------------------------------------------------------------

local function skip_sep(i)
  local sub = sub
  local s = sub(z, i, i)
  i = i + 1
  local count = #match(z, "=*", i)  -- note, take the length
  i = i + count
  I = i
  return (sub(z, i, i) == s) and count or (-count) - 1
end

----------------------------------------------------------------------
-- reads a long string or long comment
-- * is_str: true for a long string, false for a long comment (only
--   selects the error message)
-- * sep: number of '=' in the opening delimiter
-- * on entry I is at the 2nd '[' and buff holds the start position of
--   the whole token; returns the token text including delimiters
----------------------------------------------------------------------

local function read_long_string(is_str, sep)
  local i = I + 1  -- skip 2nd '['
  local sub = sub
  local c = sub(z, i, i)
  if c == "\r" or c == "\n" then  -- string starts with a newline?
    i = inclinenumber(i)  -- skip it
  end
  while true do
    local p, _, r = find(z, "([\r\n%]])", i) -- (long range)
    if not p then
      errorline(is_str and "unfinished long string" or
                "unfinished long comment")
    end
    i = p
    if r == "]" then                    -- delimiter test
      if skip_sep(i) == sep then
        buff = sub(z, buff, I)
        I = I + 1  -- skip 2nd ']'
        return buff
      end
      i = I
    else                                -- newline
      -- buff is a numeric source offset at this point and must stay one;
      -- only the line counter and the position are advanced here
      i = inclinenumber(i)
    end
  end--while
end

----------------------------------------------------------------------
-- reads a string
-- * del: the opening quote character
-- * on entry I is just past the opening quote and buff holds the
--   position of the opening quote
-- * escapes are validated but not translated; the returned text
--   includes both delimiters
----------------------------------------------------------------------

local function read_string(del)
  local i = I
  local find = find
  local sub = sub
  while true do
    local p, _, r = find(z, "([\n\r\\\"\'])", i) -- (long range)
    if p then
      if r == "\n" or r == "\r" then
        errorline("unfinished string")
      end
      i = p
      if r == "\\" then                         -- handle escapes
        i = i + 1
        r = sub(z, i, i)
        if r == "" then break end -- (EOZ error)
        p = find("abfnrtv\n\r", r, 1, true)
        ------------------------------------------------------
        if p then                               -- special escapes
          if p > 7 then                         -- escaped '\n' or '\r'
            i = inclinenumber(i)
          else
            i = i + 1
          end
        ------------------------------------------------------
        elseif find(r, "%D") then               -- other non-digits
          i = i + 1
        ------------------------------------------------------
        else                                    -- \xxx sequence
          local _, q, s = find(z, "^(%d%d?%d?)", i)
          i = q + 1
          if s + 1 > 256 then -- UCHAR_MAX
            errorline("escape sequence too large")
          end
        ------------------------------------------------------
        end--if p
      else
        i = i + 1
        if r == del then                        -- ending delimiter
          I = i
          return sub(z, buff, i - 1)            -- return string
        end
      end--if r
    else
      break -- (error)
    end--if p
  end--while
  errorline("unfinished string")
end

------------------------------------------------------------------------
-- main lexer function
-- * lexes the whole source given to init(), appending every token to
--   tok/seminfo/tokln, and returns after adding the final TK_EOS
------------------------------------------------------------------------

function llex()
  local find = find
  local match = match
  while true do--outer
    local i = I
    -- inner loop allows break to be used to nicely section tests
    while true do--inner
      ----------------------------------------------------------------
      local p, _, r = find(z, "^([_%a][_%w]*)", i)
      if p then
        I = i + #r
        if kw[r] then
          addtoken("TK_KEYWORD", r)             -- reserved word (keyword)
        else
          addtoken("TK_NAME", r)                -- identifier
        end
        break -- (continue)
      end
      ----------------------------------------------------------------
      local p, _, r = find(z, "^(%.?)%d", i)
      if p then                                 -- numeral
        if r == "." then i = i + 1 end
        local _, q, r = find(z, "^%d*[%.%d]*([eE]?)", i)
        i = q + 1
        if #r == 1 then                         -- optional exponent
          if match(z, "^[%+%-]", i) then        -- optional sign
            i = i + 1
          end
        end
        local _, q = find(z, "^[_%w]*", i)
        I = q + 1
        local v = sub(z, p, q)                  -- string equivalent
        if not base.tonumber(v) then            -- handles hex test also
          errorline("malformed number")
        end
        addtoken("TK_NUMBER", v)
        break -- (continue)
      end
      ----------------------------------------------------------------
      -- r is the whole whitespace run, t is its first character
      local p, q, r, t = find(z, "^((%s)[ \t\v\f]*)", i)
      if p then
        if t == "\n" or t == "\r" then          -- newline
          inclinenumber(i, true)
        else
          I = q + 1                             -- whitespace
          addtoken("TK_SPACE", r)
        end
        break -- (continue)
      end
      ----------------------------------------------------------------
      local r = match(z, "^%p", i)
      if r then
        buff = i                                -- token start position
        local p = find("-[\"\'.=<>~", r, 1, true)
        if p then
          -- two-level if block for punctuation/symbols
          --------------------------------------------------------
          if p <= 2 then
            if p == 1 then                      -- minus
              local c = match(z, "^%-%-(%[?)", i)
              if c then
                i = i + 2
                local sep = -1
                if c == "[" then
                  sep = skip_sep(i)
                end
                if sep >= 0 then                -- long comment
                  addtoken("TK_LCOMMENT", read_long_string(false, sep))
                else                            -- short comment
                  I = find(z, "[\n\r]", i) or (#z + 1)
                  addtoken("TK_COMMENT", sub(z, buff, I - 1))
                end
                break -- (continue)
              end
              -- (fall through for "-")
            else                                -- [ or long string
              local sep = skip_sep(i)
              if sep >= 0 then
                addtoken("TK_LSTRING", read_long_string(true, sep))
              elseif sep == -1 then
                addtoken("TK_OP", "[")
              else
                errorline("invalid long string delimiter")
              end
              break -- (continue)
            end
          --------------------------------------------------------
          elseif p <= 5 then
            if p < 5 then                       -- strings
              I = i + 1
              addtoken("TK_STRING", read_string(r))
              break -- (continue)
            end
            r = match(z, "^%.%.?%.?", i)        -- .|..|... dots
            -- (fall through)
          --------------------------------------------------------
          else                                  -- relational
            r = match(z, "^%p=?", i)
            -- (fall through)
          end
        end
        I = i + #r
        addtoken("TK_OP", r)  -- for other symbols, fall through
        break -- (continue)
      end
      ----------------------------------------------------------------
      local r = sub(z, i, i)
      if r ~= "" then
        I = i + 1
        addtoken("TK_OP", r)                    -- other single-char tokens
        break
      end
      addtoken("TK_EOS", "")                    -- end of stream,
      return                                    -- exit here
      ----------------------------------------------------------------
    end--while inner
  end--while outer
end

return base.getfenv()