I use libclang
to create a syntax-colorizing tool for C/C++ files. Everything works fine, but I have a problem with handling the init statement
in an if statement
(a C++17 feature). For example:
// Empty type; it only gives the init-statement below something to declare.
struct alpha {};
int main() {
// C++17 if statement with an init-statement: declares `a` of type `alpha`,
// then evaluates the condition `1 < 10`.
if (alpha a; 1 < 10) {
}
}
So when I tokenize the file (with clang_annotateTokens
), I expect that alpha a
in the if statement produces two CXToken
and two CXCursor
: one for the type alpha
and one for the variable a
. But I get two CXToken
and one CXCursor
for both tokens. This CXCursor
has IfStmt
kind and Unexposed
type, so I don't know how to get additional information about these tokens (namely, that the first token is a type and the second is a variable).
Here is a minimal example:
// CppTokenizer.cpp
#include <algorithm>
#include <array>
#include <cstdlib>
#include <exception>
#include <fstream>
#include <iostream>
#include <iterator>
#include <list>
#include <memory>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

#include <clang-c/Index.h>
// Alias for a linked list of strings.
// NOTE(review): not referenced anywhere in the visible code — confirm whether
// it is still needed.
using StringList = std::list<std::string>;
/// Scope guard that invokes a stored callable when it is destroyed.
///
/// The guard is move-only. Moving it hands responsibility for the call over
/// to the new object and disarms the source, so the callable runs exactly
/// once even if the guard is moved (for example when it is returned from a
/// factory function without copy elision). Copying is disabled because a copy
/// would run the cleanup a second time.
///
/// @tparam T  Type of a callable that can be invoked with no arguments.
template <typename T>
class Finalizer {
public:
  /// Stores @p cb; it is invoked from the destructor.
  explicit Finalizer(T cb) : callback(std::move(cb)) {}

  /// Takes over the pending call from @p other, which is disarmed.
  Finalizer(Finalizer &&other)
      : callback(std::move(other.callback)), armed(other.armed) {
    other.armed = false;
  }

  Finalizer(const Finalizer &) = delete;
  Finalizer &operator=(const Finalizer &) = delete;
  Finalizer &operator=(Finalizer &&) = delete;

  /// Runs the callable unless this guard has been moved from.
  ~Finalizer() {
    if (armed) {
      callback();
    }
  }

  /// The callable executed on destruction.
  T callback;

private:
  /// False once the pending call has been transferred to another guard.
  bool armed = true;
};
/// Builds a Finalizer for @p callback, deducing the callable type.
///
/// The callable is moved (not copied) into the guard, so callables that are
/// expensive to copy or move-only can be used as well.
///
/// @param callback  Callable invoked with no arguments when the returned
///                  guard is destroyed.
/// @return A guard owning @p callback.
template <typename T>
Finalizer<T> make_finalizer(T callback) {
  return Finalizer<T>{std::move(callback)};
}
// Position of a token as three numbers: {line, column, length}, where length
// is the end offset minus the begin offset of the token extent (this is the
// layout getTokenLocation fills in).
using TokenLocation = std::array<unsigned int, 3>;
// One classified identifier token.
struct Token {
// Group name for the token, as produced by getTokenGroup.
std::string group;
// Where the token is located in the file (see TokenLocation).
TokenLocation location;
};
// Sequence of tokens, as returned by tokenize.
using TokenList = std::list<Token>;
// Forward declarations of the helpers defined further down in this file.

// Maps a cursor kind (and, for variable declarations and declaration
// references, the type kind) to a group name.
static std::string mapTokenKind(CXCursorKind const kind,
CXTypeKind const type) noexcept;
// Returns {line, column, length} for the given token.
static TokenLocation getTokenLocation(CXTranslationUnit trUnit,
CXToken token) noexcept;
// Returns the group name for the cursor annotated onto a token.
static std::string getTokenGroup(CXCursor cursor) noexcept;
// Parses the given file with libclang and returns its identifier tokens.
static TokenList tokenize(const std::string &compileFile);
int main(int argc, char *argv[]) {
if (argc != 2) {
std::cerr << "set file for analizing as parameter" << std::endl;
return EXIT_FAILURE;
}
std::string fileName = argv[1];
TokenList tokenList = tokenize(fileName);
for (const Token &token : tokenList) {
std::cout << token.group << ": " << token.location[0] << ' '
<< token.location[1] << ' ' << token.location[2] << std::endl;
}
return EXIT_SUCCESS;
}
/// Parses @p compileFile as C++17 with libclang and classifies its
/// identifier tokens.
///
/// The whole file is tokenized, every token is annotated with a cursor, and
/// for each identifier token a Token holding its group name and location is
/// appended to the result. Non-identifier tokens are skipped.
///
/// @param compileFile  Path of the source file to analyze.
/// @return The identifier tokens in the order libclang reports them.
/// @throws std::runtime_error  if libclang fails to parse the file.
TokenList tokenize(const std::string &compileFile) {
  const std::vector<const char *> flags{"-x", "c++", "-std=gnu++17", "-fPIC"};

  // Parse the file with clang.
  CXIndex index = clang_createIndex(0, 0);
  // Initialised so the finalizer below never captures an indeterminate value.
  CXTranslationUnit translationUnit = nullptr;
  const CXErrorCode parseError =
      clang_parseTranslationUnit2(index,
                                  compileFile.c_str(),
                                  flags.data(),
                                  static_cast<int>(flags.size()),
                                  nullptr,
                                  0,
                                  CXTranslationUnit_DetailedPreprocessingRecord,
                                  &translationUnit);

  // Finalizers run in reverse declaration order: tokens, translation unit,
  // then the index.
  auto indexFinalizer = make_finalizer([index]() {
    clang_disposeIndex(index);
  });
  auto trUnitFinalizer = make_finalizer([translationUnit]() {
    clang_disposeTranslationUnit(translationUnit);
  });

  if (parseError != CXError_Success) {
    throw std::runtime_error{"parse error"};
  }

  // Build a source range spanning the whole file.
  CXFile trUFile = clang_getFile(translationUnit, compileFile.c_str());
  size_t endOffset = 0;
  clang_getFileContents(translationUnit, trUFile, &endOffset);

  const CXSourceLocation beginLoc =
      clang_getLocationForOffset(translationUnit, trUFile, 0);
  const CXSourceLocation endLoc = clang_getLocationForOffset(
      translationUnit, trUFile, static_cast<unsigned int>(endOffset));
  const CXSourceRange range = clang_getRange(beginLoc, endLoc);

  unsigned int numTokens = 0;
  CXToken *cxTokens = nullptr;
  clang_tokenize(translationUnit, range, &cxTokens, &numTokens);
  // Release the token buffer on every exit path, including an exception
  // thrown while the result list is being built.
  auto tokensFinalizer =
      make_finalizer([translationUnit, cxTokens, numTokens]() {
        clang_disposeTokens(translationUnit, cxTokens, numTokens);
      });

  std::vector<CXCursor> cursors(numTokens);
  clang_annotateTokens(translationUnit, cxTokens, numTokens, cursors.data());

  TokenList tokens;
  for (unsigned int i = 0; i < numTokens; ++i) {
    // Only identifiers are of interest.
    if (clang_getTokenKind(cxTokens[i]) != CXToken_Identifier) {
      continue;
    }
    tokens.push_back(Token{getTokenGroup(cursors[i]),
                           getTokenLocation(translationUnit, cxTokens[i])});
  }
  return tokens;
}
/// Translates a libclang type kind into a group name.
///
/// Function kinds give "Function", enums give "EnumConstant", member pointers
/// give "Member", and the remaining listed kinds give "Variable". Any kind
/// that is not listed yields libclang's own spelling of the kind.
///
/// @param typeKind  Type kind to translate.
/// @return The group name for @p typeKind.
static std::string mapTypeKind(CXTypeKind const typeKind) noexcept {
  switch (typeKind) {
  case CXType_FunctionProto:
  case CXType_FunctionNoProto:
    return "Function";

  case CXType_Enum:
    return "EnumConstant";

  case CXType_MemberPointer:
    return "Member";

  // void / bool / nullptr
  case CXType_Void:
  case CXType_Bool:
  case CXType_NullPtr:
  // character kinds
  case CXType_Char_U:
  case CXType_UChar:
  case CXType_Char16:
  case CXType_Char32:
  case CXType_Char_S:
  case CXType_SChar:
  case CXType_WChar:
  // unsigned integer kinds
  case CXType_UShort:
  case CXType_UInt:
  case CXType_ULong:
  case CXType_ULongLong:
  case CXType_UInt128:
  // signed integer kinds
  case CXType_Short:
  case CXType_Int:
  case CXType_Long:
  case CXType_LongLong:
  case CXType_Int128:
  // floating-point and complex kinds
  case CXType_Float:
  case CXType_Double:
  case CXType_LongDouble:
  case CXType_Float128:
  case CXType_Half:
  case CXType_Float16:
  case CXType_Complex:
  // Accum kinds
  case CXType_ShortAccum:
  case CXType_Accum:
  case CXType_LongAccum:
  case CXType_UShortAccum:
  case CXType_UAccum:
  case CXType_ULongAccum:
  // overload / dependent placeholders
  case CXType_Overload:
  case CXType_Dependent:
  // Objective-C kinds
  case CXType_ObjCId:
  case CXType_ObjCClass:
  case CXType_ObjCSel:
  case CXType_ObjCInterface:
  case CXType_ObjCObjectPointer:
  // pointers and references
  case CXType_Pointer:
  case CXType_BlockPointer:
  case CXType_LValueReference:
  case CXType_RValueReference:
  // records, typedefs, auto, elaborated
  case CXType_Record:
  case CXType_Typedef:
  case CXType_Auto:
  case CXType_Elaborated:
  // arrays and vectors
  case CXType_ConstantArray:
  case CXType_Vector:
  case CXType_IncompleteArray:
  case CXType_VariableArray:
  case CXType_DependentSizedArray:
    return "Variable";

  default: {
    // Unlisted kind: use the name libclang itself gives it.
    CXString spelling = clang_getTypeKindSpelling(typeKind);
    std::string name = clang_getCString(spelling);
    clang_disposeString(spelling);
    return name;
  }
  }
}
/// Picks the group name for a token from its cursor kind and type kind.
///
/// For variable declarations and declaration references the name comes from
/// the type kind (via mapTypeKind); for every other cursor kind it is
/// libclang's spelling of the cursor kind.
///
/// @param cursorKind  Kind of the cursor annotated onto the token.
/// @param typeKind    Kind of that cursor's type.
/// @return The group name.
static std::string mapTokenKind(CXCursorKind const cursorKind,
                                CXTypeKind const typeKind) noexcept {
  const bool usesTypeKind = cursorKind == CXCursor_VarDecl ||
                            cursorKind == CXCursor_DeclRefExpr;
  if (usesTypeKind) {
    return mapTypeKind(typeKind);
  }

  CXString spelling = clang_getCursorKindSpelling(cursorKind);
  std::string label = clang_getCString(spelling);
  clang_disposeString(spelling);
  return label;
}
/// Computes where a token is located and how long it is.
///
/// @param trUnit  Translation unit the token belongs to.
/// @param token   Token to locate.
/// @return {line, column, length}: line and column of the start of the token
///         extent, and the difference between the extent's end and start
///         offsets.
TokenLocation getTokenLocation(CXTranslationUnit trUnit,
                               CXToken token) noexcept {
  const CXSourceRange extent = clang_getTokenExtent(trUnit, token);

  // Start of the extent: line, column and offset are all needed.
  unsigned int startLine;
  unsigned int startColumn;
  unsigned int startOffset;
  clang_getFileLocation(clang_getRangeStart(extent), nullptr, &startLine,
                        &startColumn, &startOffset);

  // End of the extent: only the offset matters, to derive the length.
  unsigned int stopOffset;
  clang_getFileLocation(clang_getRangeEnd(extent), nullptr, nullptr, nullptr,
                        &stopOffset);

  const unsigned int length = stopOffset - startOffset;
  return TokenLocation{startLine, startColumn, length};
}
/// Returns the group name for a cursor, derived from the cursor's kind and
/// the kind of its type (see mapTokenKind).
///
/// @param cursor  Cursor annotated onto a token.
/// @return The group name.
std::string getTokenGroup(CXCursor cursor) noexcept {
  const CXType cursorType = clang_getCursorType(cursor);
  return mapTokenKind(clang_getCursorKind(cursor), cursorType.kind);
}
/// Gives a short textual name for a libclang error code.
///
/// @param code  Error code to describe.
/// @return The name of the code, or an empty string for a value that is not
///         one of the handled enumerators.
std::string clangErrorToString(CXErrorCode code) noexcept {
  switch (code) {
  case CXError_Success:
    return "Success";
  case CXError_Failure:
    return "Failure";
  case CXError_Crashed:
    return "Crashed";
  case CXError_InvalidArguments:
    return "InvalidArgument";
  case CXError_ASTReadError:
    return "ASTReadError";
  }
  // Value outside the handled enumerators.
  return std::string{};
}