0

I use libclang to build a syntax-highlighting tool for C/C++ files. Everything works fine, but I have a problem handling the init-statement of an if statement (a C++17 feature). For example:

// Empty type, used only so there is something to declare in the init-statement.
struct alpha {};

int main() {
  // C++17 if-with-initializer: `alpha a;` is the init-statement and
  // `1 < 10` is the condition.
  if (alpha a; 1 < 10) {
  }
}

So when I tokenize the file (with clang_annotateTokens), I expect alpha a in the if statement to produce two CXTokens and two CXCursors: one for the type alpha and one for the variable a. Instead, I get two CXTokens that are both annotated with the same CXCursor. This CXCursor has the IfStmt kind and an Unexposed type, so I don't know how to get additional information about these tokens (namely, that the first token is a type and the second is a variable).

Here is a minimal example:

// CppTokenizer.cpp

#include <algorithm>
#include <array>
#include <clang-c/Index.h>
#include <cstdlib>
#include <exception>
#include <fstream>
#include <iostream>
#include <iterator>
#include <list>
#include <memory>
#include <stdexcept>
#include <string>
#include <type_traits>
#include <utility>
#include <vector>

using StringList = std::list<std::string>;

/// Scope guard: invokes `callback` once when the guard is destroyed.
///
/// The guard is move-only. A copy would run the cleanup a second time, so
/// copying is disabled; moving transfers the responsibility and disarms the
/// moved-from guard.
///
/// @tparam T  Callable type invocable with no arguments.
template <typename T>
class Finalizer {
public:
  /// Takes ownership of the callable to run on destruction.
  explicit Finalizer(T cb) : callback(std::move(cb)) {}

  /// Moves the callable out of `other` and disarms `other`.
  /// `callback` is initialized first, so `other` stays armed if that move
  /// throws.
  Finalizer(Finalizer &&other) noexcept(
      std::is_nothrow_move_constructible<T>::value)
      : callback(std::move(other.callback)),
        armed(std::exchange(other.armed, false)) {}

  Finalizer(const Finalizer &)            = delete;
  Finalizer &operator=(const Finalizer &) = delete;
  Finalizer &operator=(Finalizer &&)      = delete;

  ~Finalizer() {
    if (armed) {
      callback();
    }
  }

  T callback;

private:
  // False only for a guard that has been moved from.
  bool armed = true;
};

/// Builds a Finalizer for `callback`, deducing the callable type.
///
/// The callable is moved into the guard, so move-only callables (for example
/// lambdas capturing a std::unique_ptr) are accepted.
///
/// @param callback  Callable to invoke when the returned guard is destroyed.
/// @return A guard owning `callback`.
template <typename T>
Finalizer<T> make_finalizer(T callback) {
  return Finalizer<T>{std::move(callback)};
}

using TokenLocation = std::array<unsigned int, 3>;

// One identifier token reported by tokenize().
struct Token {
  // Group name computed by getTokenGroup() from the token's cursor.
  std::string   group;
  // {line, column, end offset - begin offset}, as built by getTokenLocation().
  TokenLocation location;
};

using TokenList = std::list<Token>;

// Returns the group name for a cursor kind; for DeclRefExpr and VarDecl
// cursors the name is derived from the type kind instead.
static std::string mapTokenKind(CXCursorKind const kind,
                                CXTypeKind const   type) noexcept;

// Returns {line, column, end offset - begin offset} for the token's extent.
static TokenLocation getTokenLocation(CXTranslationUnit trUnit,
                                      CXToken           token) noexcept;

// Returns the group name for the given cursor (see mapTokenKind).
static std::string getTokenGroup(CXCursor cursor) noexcept;

// Parses compileFile with libclang and returns one Token per identifier.
// Throws std::runtime_error when parsing fails.
static TokenList tokenize(const std::string &compileFile);

int main(int argc, char *argv[]) {
  if (argc != 2) {
    std::cerr << "set file for analizing as parameter" << std::endl;
    return EXIT_FAILURE;
  }

  std::string fileName = argv[1];

  TokenList tokenList = tokenize(fileName);

  for (const Token &token : tokenList) {
    std::cout << token.group << ": " << token.location[0] << ' '
              << token.location[1] << ' ' << token.location[2] << std::endl;
  }

  return EXIT_SUCCESS;
}

/// Parses `compileFile` as C++ with libclang and classifies its identifiers.
///
/// The whole file is tokenized, every token is annotated with a cursor, and
/// each identifier token is turned into a Token holding its group name and
/// location. Non-identifier tokens are skipped.
///
/// NOTE(review): the cursor comes straight from clang_annotateTokens; when
/// libclang annotates several tokens with the same enclosing statement cursor,
/// those tokens all get that statement's group.
///
/// @param compileFile  Path of the source file to parse.
/// @return Identifier tokens in source order.
/// @throws std::runtime_error if parsing fails or the file cannot be looked up
///         in the resulting translation unit.
TokenList tokenize(const std::string &compileFile) {
  const std::vector<const char *> flags{"-x", "c++", "-std=gnu++17", "-fPIC"};

  CXIndex index          = clang_createIndex(0, 0);
  auto    indexFinalizer = make_finalizer([index]() {
    clang_disposeIndex(index);
  });

  CXTranslationUnit translationUnit = nullptr;
  const CXErrorCode parseError =
      clang_parseTranslationUnit2(index,
                                  compileFile.c_str(),
                                  flags.data(),
                                  static_cast<int>(flags.size()),
                                  nullptr,
                                  0,
                                  CXTranslationUnit_DetailedPreprocessingRecord,
                                  &translationUnit);

  // Declared after indexFinalizer so the translation unit is disposed before
  // the index that owns it.
  auto trUnitFinalizer = make_finalizer([translationUnit]() {
    clang_disposeTranslationUnit(translationUnit);
  });

  if (parseError != CXError_Success) {
    throw std::runtime_error{"parse error"};
  }

  CXFile trUFile = clang_getFile(translationUnit, compileFile.c_str());
  if (trUFile == nullptr) {
    throw std::runtime_error{"file is not part of the translation unit"};
  }

  // Only the size is needed here: it gives the offset of the end of the file.
  size_t endOffset = 0;
  clang_getFileContents(translationUnit, trUFile, &endOffset);

  const CXSourceLocation beginLoc =
      clang_getLocationForOffset(translationUnit, trUFile, 0);
  const CXSourceLocation endLoc =
      clang_getLocationForOffset(translationUnit,
                                 trUFile,
                                 static_cast<unsigned int>(endOffset));
  const CXSourceRange range = clang_getRange(beginLoc, endLoc);

  unsigned int numTokens = 0;
  CXToken *    cxTokens  = nullptr;
  clang_tokenize(translationUnit, range, &cxTokens, &numTokens);

  // Release the token buffer on every exit path, including exceptions thrown
  // while the result list is being built.
  auto tokensFinalizer = make_finalizer([translationUnit, cxTokens, numTokens]() {
    clang_disposeTokens(translationUnit, cxTokens, numTokens);
  });

  std::vector<CXCursor> cursors(numTokens);
  clang_annotateTokens(translationUnit, cxTokens, numTokens, cursors.data());

  TokenList tokens;
  for (unsigned int i = 0; i < numTokens; ++i) {
    const CXToken &cxToken = cxTokens[i];

    // handle only identifiers
    if (clang_getTokenKind(cxToken) != CXToken_Identifier) {
      continue;
    }

    std::string         group    = getTokenGroup(cursors[i]);
    const TokenLocation location = getTokenLocation(translationUnit, cxToken);

    tokens.emplace_back(Token{std::move(group), location});
  }

  return tokens;
}

// Translates a libclang type kind into the group name used for a token.
// Kinds that are not listed explicitly fall back to libclang's own spelling
// of the type kind.
static std::string mapTypeKind(CXTypeKind const typeKind) noexcept {
  switch (typeKind) {
  case CXType_FunctionNoProto:
  case CXType_FunctionProto:
    return "Function";

  case CXType_Enum:
    return "EnumConstant";

  case CXType_MemberPointer:
    return "Member";

  // Every remaining explicitly listed kind is reported as a plain variable.
  case CXType_Void:
  case CXType_Bool:
  case CXType_Char_U:
  case CXType_UChar:
  case CXType_Char16:
  case CXType_Char32:
  case CXType_UShort:
  case CXType_UInt:
  case CXType_ULong:
  case CXType_ULongLong:
  case CXType_UInt128:
  case CXType_Char_S:
  case CXType_SChar:
  case CXType_WChar:
  case CXType_Short:
  case CXType_Int:
  case CXType_Long:
  case CXType_LongLong:
  case CXType_Int128:
  case CXType_Float:
  case CXType_Double:
  case CXType_LongDouble:
  case CXType_NullPtr:
  case CXType_Overload:
  case CXType_Dependent:
  case CXType_ObjCId:
  case CXType_ObjCClass:
  case CXType_ObjCSel:
  case CXType_Float128:
  case CXType_Half:
  case CXType_Float16:
  case CXType_ShortAccum:
  case CXType_Accum:
  case CXType_LongAccum:
  case CXType_UShortAccum:
  case CXType_UAccum:
  case CXType_ULongAccum:
  case CXType_Complex:
  case CXType_Pointer:
  case CXType_BlockPointer:
  case CXType_LValueReference:
  case CXType_RValueReference:
  case CXType_Record:
  case CXType_Typedef:
  case CXType_ObjCInterface:
  case CXType_ObjCObjectPointer:
  case CXType_ConstantArray:
  case CXType_Vector:
  case CXType_IncompleteArray:
  case CXType_VariableArray:
  case CXType_DependentSizedArray:
  case CXType_Auto:
  case CXType_Elaborated:
    return "Variable";

  default: {
    // Copy the spelling out before the CXString is released.
    CXString    spelling = clang_getTypeKindSpelling(typeKind);
    std::string fallback(clang_getCString(spelling));
    clang_disposeString(spelling);
    return fallback;
  }
  }
}

// Picks the group name for a token from its cursor kind. Variable
// declarations and references to declarations are grouped by their type kind;
// every other cursor kind is named by libclang's spelling of that kind.
static std::string mapTokenKind(CXCursorKind const cursorKind,
                                CXTypeKind const   typeKind) noexcept {
  const bool groupedByType =
      cursorKind == CXCursor_DeclRefExpr || cursorKind == CXCursor_VarDecl;
  if (groupedByType) {
    return mapTypeKind(typeKind);
  }

  // Copy the spelling out before the CXString is released.
  CXString    spelling = clang_getCursorKindSpelling(cursorKind);
  std::string name(clang_getCString(spelling));
  clang_disposeString(spelling);
  return name;
}

// Computes where a token sits in its file: the line and column of its first
// character, plus the difference between the end and begin offsets of its
// extent.
TokenLocation getTokenLocation(CXTranslationUnit trUnit,
                               CXToken           token) noexcept {
  const CXSourceRange extent = clang_getTokenExtent(trUnit, token);

  unsigned int line        = 0;
  unsigned int column      = 0;
  unsigned int startOffset = 0;
  clang_getFileLocation(
      clang_getRangeStart(extent), nullptr, &line, &column, &startOffset);

  // Only the offset of the end position is needed.
  unsigned int stopOffset = 0;
  clang_getFileLocation(
      clang_getRangeEnd(extent), nullptr, nullptr, nullptr, &stopOffset);

  return TokenLocation{line, column, stopOffset - startOffset};
}

// Returns the group name for a cursor, based on the cursor's kind and the
// kind of the cursor's type.
std::string getTokenGroup(CXCursor cursor) noexcept {
  const CXTypeKind kindOfType = clang_getCursorType(cursor).kind;
  return mapTokenKind(clang_getCursorKind(cursor), kindOfType);
}

// Returns a short name for a libclang error code. Codes without a case below
// yield an empty string.
std::string clangErrorToString(CXErrorCode code) noexcept {
  const char *name = "";
  switch (code) {
  case CXError_Success:
    name = "Success";
    break;
  case CXError_Failure:
    name = "Failure";
    break;
  case CXError_Crashed:
    name = "Crashed";
    break;
  case CXError_InvalidArguments:
    name = "InvalidArgument";
    break;
  case CXError_ASTReadError:
    name = "ASTReadError";
    break;
  }

  return std::string(name);
}
  • So, basically you think you found a software bug? I don't think SO is the right place to report that. – Ulrich Eckhardt May 21 '20 at 07:08
  • @UlrichEckhardt no, I don't think this is a software bug. I think there must be a way to get the needed information about these tokens, but I don't know how to do it – Andrej Levkovitch May 21 '20 at 07:11

0 Answers0