// boost/spirit/home/lex/lexer/lexertl/lexer.hpp
// Copyright (c) 2001-2011 Hartmut Kaiser
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#if !defined(BOOST_SPIRIT_LEX_LEXER_MAR_17_2007_0139PM)
#define BOOST_SPIRIT_LEX_LEXER_MAR_17_2007_0139PM
#if defined(_MSC_VER)
#pragma once
#endif
#include <iosfwd>
#include <boost/spirit/home/support/detail/lexer/generator.hpp>
#include <boost/spirit/home/support/detail/lexer/rules.hpp>
#include <boost/spirit/home/support/detail/lexer/consts.hpp>
#include <boost/spirit/home/support/unused.hpp>
#include <boost/spirit/home/lex/lexer/lexertl/token.hpp>
#include <boost/spirit/home/lex/lexer/lexertl/functor.hpp>
#include <boost/spirit/home/lex/lexer/lexertl/functor_data.hpp>
#include <boost/spirit/home/lex/lexer/lexertl/iterator.hpp>
#if defined(BOOST_SPIRIT_LEXERTL_DEBUG)
#include <boost/spirit/home/support/detail/lexer/debug.hpp>
#endif
#include <iterator> // for std::iterator_traits
namespace boost { namespace spirit { namespace lex { namespace lexertl
{
///////////////////////////////////////////////////////////////////////////
namespace detail
{
///////////////////////////////////////////////////////////////////////
// The must_escape function checks if the given character value needs
// to be preceded by a backslash character to disable its special
// meaning in the context of a regular expression
///////////////////////////////////////////////////////////////////////
template <typename Char>
inline bool must_escape(Char c)
{
    // Table of the characters that are treated as regular expression
    // meta characters and therefore need a leading backslash when they
    // are meant literally.
    // FIXME: more needed?
    static char const meta_chars[] = {
        '+', '/', '*', '?',
        '|',
        '(', ')',
        '[', ']',
        '{', '}',
        '.',
        '^', '$',
        '\\',
        '"'
    };

    // The array holds plain chars without a terminator, so sizeof yields
    // the exact number of entries.
    char const* const table_end = meta_chars + sizeof(meta_chars);
    for (char const* entry = meta_chars; entry != table_end; ++entry)
    {
        // Widen the table entry to the caller's character type so that
        // the comparison never truncates c.
        if (c == static_cast<Char>(*entry))
            return true;
    }
    return false;
}
///////////////////////////////////////////////////////////////////////
// The escape function returns the string representation of the given
// character value, possibly escaped with a backslash character, to
// allow it being safely used in a regular expression definition.
///////////////////////////////////////////////////////////////////////
template <typename Char>
inline std::basic_string<Char> escape(Char ch)
{
    typedef std::basic_string<Char> string_type;

    // Build the result front to back: an optional backslash followed by
    // the character itself.
    string_type escaped;
    if (detail::must_escape(ch))
        escaped.push_back('\\');
    escaped.push_back(ch);
    return escaped;
}
///////////////////////////////////////////////////////////////////////
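// The map_flags function translates the Spirit.Lex match_flags bits
// (match_not_dot_newline, match_icase) into the corresponding
// boost::lexer::regex_flags value.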
///////////////////////////////////////////////////////////////////////
inline boost::lexer::regex_flags map_flags(unsigned int flags)
{
    // Each recognized match flag contributes its boost::lexer counterpart;
    // a flag that is not set contributes nothing. All other bits of
    // 'flags' are ignored.
    unsigned int const dot_bits =
        (flags & match_flags::match_not_dot_newline)
            ? static_cast<unsigned int>(boost::lexer::dot_not_newline)
            : 0u;
    unsigned int const icase_bits =
        (flags & match_flags::match_icase)
            ? static_cast<unsigned int>(boost::lexer::icase)
            : 0u;

    unsigned int mapped = boost::lexer::none;
    mapped |= dot_bits;
    mapped |= icase_bits;
    return boost::lexer::regex_flags(mapped);
}
}
///////////////////////////////////////////////////////////////////////////
// Forward declaration of the generate_static function template. The lexer
// class template defined in this header declares it a friend so that it
// can access the lexer's private members directly; the definition is not
// part of this header.
// NOTE(review): the parameters are unnamed here. Presumably they are the
// lexer to generate code for, the output stream to write to, a name
// (suffix) for the generated entities and a filter functor - confirm
// against the definition.
template <typename Lexer, typename F>
bool generate_static(Lexer const&
, std::basic_ostream<typename Lexer::char_type>&
, typename Lexer::char_type const*, F);
///////////////////////////////////////////////////////////////////////////
//
// Every lexer type to be used as a lexer for Spirit has to conform to
// the following public interface:
//
// typedefs:
// iterator_type The type of the iterator exposed by this lexer.
// token_type The type of the tokens returned from the exposed
// iterators.
//
// functions:
// default constructor
// Since lexers are instantiated as base classes
// only it might be a good idea to make this
// constructor protected.
// begin, end Return a pair of iterators, when dereferenced
// returning the sequence of tokens recognized in
// the input stream given as the parameters to the
// begin() function.
// add_token Should add the definition of a token to be
// recognized by this lexer.
// clear Should delete all current token definitions
// associated with the given state of this lexer
// object.
//
// template parameters:
// Iterator The type of the iterator used to access the
// underlying character stream.
// Token The type of the tokens to be returned from the
// exposed token iterator.
// Functor The type of the InputPolicy to use to instantiate
// the multi_pass iterator type to be used as the
// token iterator (returned from begin()/end()).
//
///////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////
//
// The lexer class is an implementation of a Spirit.Lex lexer on
// top of Ben Hanson's lexertl library as outlined above (For more
// information about lexertl go here: http://www.benhanson.net/lexertl.html).
//
// This class is supposed to be used as the first and only template
// parameter while instantiating instances of a lex::lexer class.
//
///////////////////////////////////////////////////////////////////////////
template <typename Token = token<>
  , typename Iterator = typename Token::iterator_type
  , typename Functor = functor<Token, lexertl::detail::data, Iterator> >
class lexer
{
private:
    // Safe-bool idiom: the conversion operator below returns a pointer to
    // member function instead of a plain bool.
    struct dummy { void true_() {} };
    typedef void (dummy::*safe_bool)();

    // Pseudo state id returned by add_state() for the 'all states' name
    // and recognized by add_action().
    static std::size_t const all_states_id = static_cast<std::size_t>(-2);

public:
    // Evaluates to true if the DFA is currently built and up to date,
    // i.e. init_dfa() has run since the last modification.
    operator safe_bool() const
        { return initialized_dfa_ ? &dummy::true_ : 0; }

    typedef typename std::iterator_traits<Iterator>::value_type char_type;
    typedef std::basic_string<char_type> string_type;
    typedef boost::lexer::basic_rules<char_type> basic_rules_type;

    // Every lexer type to be used as a lexer for Spirit has to conform to
    // a public interface.
    typedef Token token_type;
    typedef typename Token::id_type id_type;
    typedef iterator<Functor> iterator_type;

private:
    // This type is purely used for the iterator_type construction below.
    // It only bundles references to the state machine, the rules and the
    // semantic actions owned by the lexer.
    struct iterator_data_type
    {
        typedef typename Functor::semantic_actions_type semantic_actions_type;

        iterator_data_type(
                boost::lexer::basic_state_machine<char_type> const& sm
              , boost::lexer::basic_rules<char_type> const& rules
              , semantic_actions_type const& actions)
          : state_machine_(sm), rules_(rules), actions_(actions)
        {}

        boost::lexer::basic_state_machine<char_type> const& state_machine_;
        boost::lexer::basic_rules<char_type> const& rules_;
        semantic_actions_type const& actions_;

        // silence MSVC warning C4512: assignment operator could not be generated
        BOOST_DELETED_FUNCTION(iterator_data_type& operator= (iterator_data_type const&))
    };

public:
    // Return the start iterator usable for iterating over the generated
    // tokens. The DFA is (re)built first if it is not up to date. The
    // arguments first, last and initial_state are handed to the
    // iterator_type constructor unchanged.
    iterator_type begin(Iterator& first, Iterator const& last
      , char_type const* initial_state = 0) const
    {
        if (!init_dfa())    // never minimize DFA for dynamic lexers
            return iterator_type();

        iterator_data_type iterator_data(state_machine_, rules_, actions_);
        return iterator_type(iterator_data, first, last, initial_state);
    }

    // Return the end iterator usable to stop iterating over the generated
    // tokens (a default constructed iterator_type).
    iterator_type end() const
    {
        return iterator_type();
    }

protected:
    // Lexer instances can be created by means of a derived class only.
    // The given match flags are translated by detail::map_flags() and
    // passed on to the rules object.
    lexer(unsigned int flags)
      : flags_(detail::map_flags(flags))
      , rules_(flags_)
      , initialized_dfa_(false)
    {}

public:
    // interface for token definition management

    // Add a token definition consisting of a single character. The
    // character is escaped (see detail::escape) before it is added as a
    // rule. The state is registered if needed and the DFA is marked as
    // out of date.
    //
    // If 'state' names all states the rule is added with rules_.dot() as
    // its target state. Otherwise a null 'targetstate' means the target
    // is 'state' itself; a non-null 'targetstate' is registered as well.
    //
    // Returns the value returned from rules_.add().
    std::size_t add_token(char_type const* state, char_type tokendef,
        std::size_t token_id, char_type const* targetstate)
    {
        add_state(state);
        initialized_dfa_ = false;
        if (state == all_states())
            return rules_.add(state, detail::escape(tokendef), token_id, rules_.dot());

        if (0 == targetstate)
            targetstate = state;
        else
            add_state(targetstate);
        return rules_.add(state, detail::escape(tokendef), token_id, targetstate);
    }

    // Add a token definition given as a string. The handling of 'state'
    // and 'targetstate' is the same as for the single character overload
    // above, except that 'tokendef' is passed to the rules unescaped.
    std::size_t add_token(char_type const* state, string_type const& tokendef,
        std::size_t token_id, char_type const* targetstate)
    {
        add_state(state);
        initialized_dfa_ = false;
        if (state == all_states())
            return rules_.add(state, tokendef, token_id, rules_.dot());

        if (0 == targetstate)
            targetstate = state;
        else
            add_state(targetstate);
        return rules_.add(state, tokendef, token_id, targetstate);
    }

    // interface for pattern definition management

    // Register the pattern 'patterndef' as a macro called 'name' with the
    // rules object. The state is registered if needed and the DFA is
    // marked as out of date.
    void add_pattern (char_type const* state, string_type const& name,
        string_type const& patterndef)
    {
        add_state(state);
        rules_.add_macro(name.c_str(), patterndef);
        initialized_dfa_ = false;
    }

    // Give read-only access to the rules collected so far. The return
    // type follows char_type (it is the type of the rules_ member) rather
    // than being fixed to the narrow character boost::lexer::rules, which
    // allows this accessor to be used with lexers over other character
    // types as well.
    basic_rules_type const& get_rules() const { return rules_; }

    // Remove the rules of the given state, if this state is known. The
    // DFA is marked as out of date in any case.
    void clear(char_type const* state)
    {
        std::size_t s = rules_.state(state);
        if (boost::lexer::npos != s)
            rules_.clear(state);
        initialized_dfa_ = false;
    }

    // Return the id of the given state, adding the state first if it is
    // not known yet (which marks the DFA as out of date). The name
    // referring to all states is never added, all_states_id is returned
    // for it instead.
    std::size_t add_state(char_type const* state)
    {
        if (state == all_states())
            return all_states_id;

        std::size_t stateid = rules_.state(state);
        if (boost::lexer::npos == stateid) {
            stateid = rules_.add_state(state);
            initialized_dfa_ = false;
        }
        return stateid;
    }

    // Name of the initial lexer state as reported by the rules object.
    string_type initial_state() const
    {
        return string_type(rules_.initial());
    }

    // Name referring to all lexer states as reported by the rules object.
    string_type all_states() const
    {
        return string_type(rules_.all_states());
    }

    // Register a semantic action with the given id
    template <typename F>
    void add_action(std::size_t unique_id, std::size_t state, F act)
    {
        // If you see an error here stating add_action is not a member of
        // fusion::unused_type then you are probably having semantic actions
        // attached to at least one token in the lexer definition without
        // using the lex::lexertl::actor_lexer<> as its base class.
        typedef typename Functor::wrap_action_type wrapper_type;
        if (state == all_states_id) {
            // add the action to all known states
            typedef typename
                basic_rules_type::string_size_t_map::const_iterator
            state_iterator;

            // For every known state the action is registered under each
            // of the ids unique_id ... unique_id + (number of states - 1).
            std::size_t states = rules_.statemap().size();
            for (state_iterator it = rules_.statemap().begin(),
                                end = rules_.statemap().end(); it != end; ++it) {
                for (std::size_t j = 0; j < states; ++j)
                    actions_.add_action(unique_id + j, it->second, wrapper_type::call(act));
            }
        }
        else {
            actions_.add_action(unique_id, state, wrapper_type::call(act));
        }
    }

//     template <typename F>
//     void add_action(std::size_t unique_id, char_type const* state, F act)
//     {
//         typedef typename Functor::wrap_action_type wrapper_type;
//         actions_.add_action(unique_id, add_state(state), wrapper_type::call(act));
//     }

    // We do not minimize the state machine by default anymore because
    // Ben said: "If you can afford to generate a lexer at runtime, there
    // is little point in calling minimise."
    // Go figure.
    //
    // (Re)build the state machine from the rules if it is not up to date,
    // optionally minimizing it. This is a const function modifying the
    // mutable members state_machine_ and initialized_dfa_ only. It always
    // returns true.
    bool init_dfa(bool minimize = false) const
    {
        if (!initialized_dfa_) {
            state_machine_.clear();
            typedef boost::lexer::basic_generator<char_type> generator;
            generator::build (rules_, state_machine_);
            if (minimize)
                generator::minimise (state_machine_);

#if defined(BOOST_SPIRIT_LEXERTL_DEBUG)
            boost::lexer::debug::dump(state_machine_, std::cerr);
#endif
            initialized_dfa_ = true;

//             // release memory held by rules description
//             basic_rules_type rules;
//             rules.init_state_info(rules_);        // preserve states
//             std::swap(rules, rules_);
        }
        return true;
    }

private:
    // lexertl specific data
    mutable boost::lexer::basic_state_machine<char_type> state_machine_;
    boost::lexer::regex_flags flags_;
    /*mutable*/ basic_rules_type rules_;

    typename Functor::semantic_actions_type actions_;
    // true while state_machine_ reflects the current content of rules_
    mutable bool initialized_dfa_;

    // generator functions must be able to access members directly
    template <typename Lexer, typename F>
    friend bool generate_static(Lexer const&
      , std::basic_ostream<typename Lexer::char_type>&
      , typename Lexer::char_type const*, F);
};
///////////////////////////////////////////////////////////////////////////
//
// The actor_lexer class is another implementation of a Spirit.Lex
// lexer on top of Ben Hanson's lexertl library as outlined above (For
// more information about lexertl go here:
// http://www.benhanson.net/lexertl.html).
//
// The only difference to the lexer class above is that token_def
// definitions may have semantic (lexer) actions attached while being
// defined:
//
// int w;
// token_def word = "[^ \t\n]+";
// self = word[++ref(w)]; // see example: word_count_lexer
//
// This class is supposed to be used as the first and only template
// parameter while instantiating instances of a lex::lexer class.
//
///////////////////////////////////////////////////////////////////////////
// The default Functor differs from the one used by the lexer class in
// passing mpl::true_ as an additional template argument to functor<>.
template <typename Token = token<>
, typename Iterator = typename Token::iterator_type
, typename Functor = functor<Token, lexertl::detail::data, Iterator, mpl::true_> >
class actor_lexer : public lexer<Token, Iterator, Functor>
{
protected:
// Lexer instances can be created by means of a derived class only.
// The constructor adds no state of its own; it just forwards the match
// flags to the lexer base class.
actor_lexer(unsigned int flags)
: lexer<Token, Iterator, Functor>(flags) {}
};
}}}}
#endif