How to get instruction information from libopcodes?

Question

I am writing a tool which uses libbfd and libopcodes in x86-32 and x86-64 Linux to perform disassembly. The problem is that whilst I am able to get libopcodes to disassemble, I am unable to get any instruction information. For the purposes of demonstration, I have made a minimal example which reproduces my issue. The program should disassemble itself from entry point to the first RET/RETQ.

The code is a bit hacked up (it uses globals, and error checking has been omitted for brevity), but it should illustrate the issue clearly.

#include <bfd.h>
#include <dis-asm.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <string.h>
#include <ctype.h>
#include <limits.h>
#include <libiberty.h>

/*
 * Holds state for BFD and libopcodes.
 *
 * abfd is opened in main() and also read by breaks_control_flow();
 * dinfo is set up by init_disasm() and inspected by custom_fprintf().
 */
bfd *        abfd  = NULL;
disassemble_info dinfo = {0};

/*
 * Temporary hack to signal when disassembling should stop.
 * Set by custom_fprintf() and polled by the loop in disassemble_entry().
 */
static bool stop_disassembling = FALSE;

/*
 * Gets path to currently running executable.
 *
 * Resolves the /proc/self/exe symlink into target_path, a caller-supplied
 * buffer of `size` bytes. readlink() does not NUL-terminate its output and
 * silently truncates, so one byte is reserved for the terminator and a
 * result that fills all the remaining space is rejected as (possibly)
 * truncated.
 *
 * Returns true and a NUL-terminated path on success. Returns false if the
 * buffer is missing or too small, the link cannot be read, or the path did
 * not fit; in that case target_path (if usable) holds an empty string.
 */
bool get_target_path(char * target_path, size_t size)
{
    ssize_t len;

    if(target_path == NULL || size == 0) {
        return false;
    }

    /* Start from a well-defined empty result for every failure path */
    target_path[0] = '\0';

    if(size < 2) {
        return false;
    }

    len = readlink("/proc/self/exe", target_path, size - 1);

    /* -1 means the link could not be read; never use it as an index */
    if(len < 0 || (size_t)len >= size - 1) {
        target_path[0] = '\0';
        return false;
    }

    target_path[len] = '\0';
    return true;
}

/*
 * libopcodes appends spaces on the end of some instructions so for
 * comparisons, we want to strip those first.
 *
 * Cuts str in place at the first character for which isgraph() is false
 * (a space, a control character or the existing terminator), looking at no
 * more than `size` bytes. If every examined byte is graphical the buffer is
 * left untouched. A NULL str is ignored.
 */
void strip_tail(char * str, unsigned int size)
{
    unsigned int i;

    if(str == NULL) {
        return;
    }

    for(i = 0; i < size; i++) {
        /*
         * <ctype.h> functions take an int that must be representable as
         * unsigned char (or EOF); a negative plain char is undefined.
         */
        if(!isgraph((unsigned char)str[i])) {
            str[i] = '\0';
            break;
        }
    }
}

/*
 * Tells whether the given (already stripped) mnemonic ends linear control
 * flow. Only the return instruction is recognised so far: it is compared
 * against "retq" when the global abfd describes a 64-bit address space and
 * against "ret" otherwise.
 */
bool breaks_control_flow(char * str)
{
    const char * ret_mnemonic;

    ret_mnemonic = (abfd->arch_info->bits_per_address == 64) ? "retq" : "ret";

    return strcmp(str, ret_mnemonic) == 0;
}

/*
 * fprintf-style callback handed to libopcodes in place of a real stream
 * writer. Each call formats one fragment into a local buffer, echoes it to
 * stdout, checks whether it is a mnemonic that should stop disassembly, and
 * then reports what the global dinfo currently says about the instruction.
 *
 * The stream argument is not used. Returns whatever vsnprintf() returned.
 */
int custom_fprintf(void * stream, const char * format, ...)
{
    /* silly amount */
    char         text[128] = {0};
    const char * verdict;
    va_list      ap;
    int          written;

    /* The last byte is never written, so text stays NUL-terminated */
    va_start(ap, format);
    written = vsnprintf(text, ARRAY_SIZE(text) - 1, format, ap);
    va_end(ap);

    /* Echo the raw fragment first, then trim it for comparison */
    puts(text);
    strip_tail(text, ARRAY_SIZE(text));

    if(breaks_control_flow(text)) {
        puts("Stopped disassembly");
        stop_disassembling = TRUE;
    }

    /* Pick the message describing the instruction metadata, if any */
    if(!dinfo.insn_info_valid) {
        verdict = "insn_info not valid\n";
    } else {
        switch(dinfo.insn_type) {
            case dis_noninsn:
                verdict = "not an instruction\n";
                break;
            case dis_nonbranch:
                verdict = "not a branch\n";
                break;
            case dis_branch:
                verdict = "is a branch\n";
                break;
            case dis_condbranch:
                verdict = "is a conditional branch\n";
                break;
            case dis_jsr:
                verdict = "jump to subroutine\n";
                break;
            case dis_condjsr:
                verdict = "conditional jump to subroutine\n";
                break;
            case dis_dref:
                verdict = "data reference in instruction\n";
                break;
            case dis_dref2:
                verdict = "two data references in instruction\n";
                break;
            default:
                verdict = "not enumerated\n";
                break;
        }
    }

    fputs(verdict, stdout);

    return written;
}

/*
 * Prepares dinfo for disassembling the target described by abfd and returns
 * the libopcodes disassembler function for that target.
 */
disassembler_ftype init_disasm(bfd * abfd, disassemble_info * dinfo)
{
    disassembler_ftype print_insn;

    /*
     * Route all disassembler output through custom_fprintf; it ignores its
     * stream argument, so NULL is passed for the stream.
     */
    init_disassemble_info(dinfo, NULL, custom_fprintf);

    /* Describe the target: byte order, machine, architecture, flavour */
    dinfo->endian  = abfd->xvec->byteorder;
    dinfo->mach    = bfd_get_mach(abfd);
    dinfo->arch    = bfd_get_arch(abfd);
    dinfo->flavour = bfd_get_flavour(abfd);

    disassemble_init_for_target(dinfo);

    print_insn = disassembler(abfd);
    return print_insn;
}

/*
 * Method of locating section from VMA taken from opdis.
 *
 * Request/result record passed through bfd_map_over_sections() to
 * vma_in_section(): the caller fills in vma and clears sec; the callback
 * stores the matching section in sec.
 */
typedef struct {
    bfd_vma    vma;   /* address to look up */
    asection * sec;   /* section containing vma, or NULL if none matched */
} BFD_VMA_SECTION;

/*
 * Loads section and fills in dinfo accordingly. Since this function allocates
 * memory in dinfo->buffer, callers need to call free once they are finished.
 *
 * The section contents are read through s->owner; the abfd parameter is not
 * used. On success dinfo->section, buffer, buffer_length and buffer_vma
 * describe the loaded bytes and true is returned. If the contents cannot be
 * read, the temporary buffer is released, dinfo is left untouched and false
 * is returned.
 */
bool load_section(bfd * abfd, disassemble_info * dinfo, asection * s)
{
    /* Keep the size in BFD's own unsigned type rather than narrowing to int */
    bfd_size_type   size = bfd_section_size(s->owner, s);
    unsigned char * buf  = xmalloc(size);

    if(!bfd_get_section_contents(s->owner, s, buf, 0, size)) {
        free(buf);
        return false;
    }

    dinfo->section       = s;
    dinfo->buffer        = buf;
    dinfo->buffer_length = size;
    dinfo->buffer_vma    = bfd_section_vma(s->owner, s);

    /*
     * One complete line per section. The casts make the arguments match
     * the conversion specifiers whatever width bfd_size_type/bfd_vma have.
     */
    printf("Allocated %lu bytes for %s section: 0x%llX\n",
            (unsigned long)size, s->name,
            (unsigned long long)dinfo->buffer_vma);
    return true;
}

/*
 * bfd_map_over_sections() callback: if the address in the BFD_VMA_SECTION
 * request passed as data falls inside section s, record s in the request.
 * A NULL request is ignored.
 */
void vma_in_section(bfd * abfd, asection * s, void * data)
{
    BFD_VMA_SECTION * request = data;

    if(!request) {
        return;
    }

    /* Address lies below the start of this section */
    if(request->vma < s->vma) {
        return;
    }

    /* Address lies at or beyond the end of this section */
    if(request->vma >= (s->vma + bfd_section_size(abfd, s))) {
        return;
    }

    request->sec = s;
}

/*
 * Finds the section of abfd that contains vma and loads it into dinfo.
 * Returns FALSE when no section contains the address; otherwise returns
 * the result of load_section().
 */
bool load_section_for_vma(bfd * abfd, disassemble_info * dinfo,
        bfd_vma vma)
{
    BFD_VMA_SECTION lookup;

    lookup.vma = vma;
    lookup.sec = NULL;

    /* vma_in_section fills in lookup.sec when a section matches */
    bfd_map_over_sections(abfd, vma_in_section, &lookup);

    if(lookup.sec) {
        return load_section(abfd, dinfo, lookup.sec);
    }

    return FALSE;
}

/*
 * Disassembles linearly from the entry point of abfd, one instruction per
 * call to the supplied disassembler, until the disassembler reports 0 or -1
 * bytes or custom_fprintf() raises stop_disassembling.
 *
 * Returns FALSE if the section holding the entry point could not be loaded,
 * TRUE otherwise. The section buffer stored in dinfo is freed before
 * returning.
 */
bool disassemble_entry(bfd * abfd, disassemble_info * dinfo,
        disassembler_ftype disassembler)
{
    bfd_vma pc = bfd_get_start_address(abfd);
    int     insn_len;

    /* Nothing to do unless the section containing the entry point loads */
    if(!load_section_for_vma(abfd, dinfo, pc)) {
        return FALSE;
    }

    for(;;) {
        /* Reset so stale metadata from the previous instruction is not reported */
        dinfo->insn_info_valid = 0;
        insn_len = disassembler(pc, dinfo);
        printf("Disassembled %d bytes at 0x%lX\n", insn_len, pc);

        if(stop_disassembling || insn_len == 0 || insn_len == -1) {
            break;
        }

        pc += insn_len;
    }

    free(dinfo->buffer);
    return TRUE;
}

/*
 * Opens the running executable with BFD and disassembles it from its entry
 * point. Exits with EXIT_SUCCESS only if every step succeeded; each failure
 * is reported on stderr and the BFD handle is closed on all paths that
 * opened it.
 */
int main(void)
{
    char  target_path[PATH_MAX] = {0};
    int   status = EXIT_FAILURE;

    bfd_init();

    /* Get path for the running instance of this program */
    if(!get_target_path(target_path, ARRAY_SIZE(target_path))) {
        fprintf(stderr, "Could not determine path of running executable\n");
        return EXIT_FAILURE;
    }

    abfd = bfd_openr(target_path, NULL);

    if(abfd == NULL) {
        fprintf(stderr, "Could not open %s\n", target_path);
        return EXIT_FAILURE;
    }

    if(bfd_check_format(abfd, bfd_object)) {
        disassembler_ftype disassembler = init_disasm(abfd, &dinfo);

        /* Guard against a missing disassembler before calling through it */
        if(disassembler == NULL) {
            fprintf(stderr, "No disassembler available for %s\n", target_path);
        } else if(disassemble_entry(abfd, &dinfo, disassembler)) {
            status = EXIT_SUCCESS;
        } else {
            fprintf(stderr, "Could not load the section containing the entry point\n");
        }
    } else {
        fprintf(stderr, "%s is not a recognised object file\n", target_path);
    }

    /* Close the handle even when the format check failed */
    bfd_close(abfd);
    abfd = NULL;

    return status;
}

This source can be built with the following makefile. To perform a successful link, the binutils-dev package needs to be installed on the local machine:

# Build the disasm example from disasm.c, linking against libbfd and libopcodes.
all:
    gcc -Wall disasm.c -o disasm -lbfd -lopcodes

# Remove the built binary.
clean:
    rm -f disasm

When run, the output is this:

Allocated 2216 bytes for .text section
: 0x400BF0xor    
insn_info not valid
%ebp
insn_info not valid
,
insn_info not valid
%ebp
insn_info not valid
Disassembled 2 bytes at 0x400BF0
mov    
insn_info not valid
%rdx
insn_info not valid
,
insn_info not valid
%r9
insn_info not valid
Disassembled 3 bytes at 0x400BF2
pop    
insn_info not valid
%rsi
insn_info not valid
Disassembled 1 bytes at 0x400BF5
mov    
insn_info not valid
%rsp
insn_info not valid
,
insn_info not valid
%rdx
insn_info not valid
Disassembled 3 bytes at 0x400BF6
and    
insn_info not valid
$0xfffffffffffffff0
insn_info not valid
,
insn_info not valid
%rsp
insn_info not valid
Disassembled 4 bytes at 0x400BF9
push   
insn_info not valid
%rax
insn_info not valid
Disassembled 1 bytes at 0x400BFD
push   
insn_info not valid
%rsp
insn_info not valid
Disassembled 1 bytes at 0x400BFE
mov    
insn_info not valid
$0x401450
insn_info not valid
,
insn_info not valid
%r8
insn_info not valid
Disassembled 7 bytes at 0x400BFF
mov    
insn_info not valid
$0x4013c0
insn_info not valid
,
insn_info not valid
%rcx
insn_info not valid
Disassembled 7 bytes at 0x400C06
mov    
insn_info not valid
$0x4012ce
insn_info not valid
,
insn_info not valid
%rdi
insn_info not valid
Disassembled 7 bytes at 0x400C0D
callq  
insn_info not valid
0x0000000000400ad8
insn_info not valid
Disassembled 5 bytes at 0x400C14
hlt    
insn_info not valid
Disassembled 1 bytes at 0x400C19
nop
insn_info not valid
Disassembled 1 bytes at 0x400C1A
nop
insn_info not valid
Disassembled 1 bytes at 0x400C1B
sub    
insn_info not valid
$0x8
insn_info not valid
,
insn_info not valid
%rsp
insn_info not valid
Disassembled 4 bytes at 0x400C1C
mov    
insn_info not valid
0x2013b9(%rip)
insn_info not valid
,
insn_info not valid
%rax
insn_info not valid
        # 
insn_info not valid
0x0000000000601fe0
insn_info not valid
Disassembled 7 bytes at 0x400C20
test   
insn_info not valid
%rax
insn_info not valid
,
insn_info not valid
%rax
insn_info not valid
Disassembled 3 bytes at 0x400C27
je     
insn_info not valid
0x0000000000400c2e
insn_info not valid
Disassembled 2 bytes at 0x400C2A
callq  
insn_info not valid
*%rax
insn_info not valid
Disassembled 2 bytes at 0x400C2C
add    
insn_info not valid
$0x8
insn_info not valid
,
insn_info not valid
%rsp
insn_info not valid
Disassembled 4 bytes at 0x400C2E
retq   
Stopped disassembly
insn_info not valid
Disassembled 1 bytes at 0x400C32

What I am expecting is to be able to read instruction information for each instruction through the dinfo->insn_type, target, etc. The behaviour is exhibited on both x86-32 and x86-64. If I can at least get confirmation that this is unimplemented on these two architectures then I can go about filling in this information myself.

you might just find it easier to use a multi-platform disassembler like beaengine and skip all the headache: http://www.beaengine.org/ — Necrolis, Feb 06 '12 at 09:44
Unfortunately, these are requirements for the project I'm working on. Something interesting is that opdis uses the instruction information, or at least copies it to a buffer, suggesting that information is accessible. I'm having trouble seeing what opdis is doing that I am not though. — Mike Kwan, Feb 06 '12 at 09:49
by the way, there is a problem with your source code: `readlink` does not append a trailing `\0` to the string. — sam hocevar, Feb 06 '12 at 22:44

sam hocevar · Accepted Answer · 2012-02-07T13:13:18.697

10

Unfortunately, as of binutils libopcodes 2.22, insn_type is not filled in on either i386 or x86_64. The only widespread supported architectures are MIPS, Sparc, and the Cell’s SPU. This is still true as of current CVS HEAD.

It's hard to prove that something does not exist, but in the Sparc disassembler source, for instance, you can see several occurrences of insn_type being set (e.g. info->insn_type = dis_branch), whereas in the i386 disassembler source there are no occurrences of insn_type, nor of any of the values it would be expected to have (dis_branch, dis_nonbranch, etc.).

Checking for all the libopcodes files that support insn_type you get:

opcodes/mips-dis.c
opcodes/spu-dis.c
opcodes/microblaze-dis.c
opcodes/cris-dis.c
opcodes/sparc-dis.c
opcodes/mmix-dis.c

edited Feb 07 '12 at 13:13

answered Feb 06 '12 at 22:53

sam hocevar

11,853
5
49
68

This is exactly the answer I was looking for! Is there some citation or documentation for this information however? – Mike Kwan Feb 07 '12 at 10:11
@MikeKwan: I have added as much information to the answer as I could gather; there does not seem to be official documentation about what is supported or not. But the `dis-asm.h` header explicitly says _Not all decoders yet support this information_. – sam hocevar Feb 07 '12 at 13:14
Thanks for looking that up for me. I have also been looking at i386-dis.c, which supports what you're saying. I've now awarded the bounty. – Mike Kwan Feb 07 '12 at 14:01

score 3 · Answer 2 · answered Feb 07 '12 at 08:13

Doing this with just those libraries is going to be an extremely painful and arduous process. I think you should listen to Necrolis and use a library that already does this. I've used Dyninst in the past (namely, the InstructionAPI + ParseAPI). They're very well documented, and will do exactly what you're trying to do. At the very least, spending an hour with this library and compiling the examples in their manuals will give you an application that will let you examine things like the opcodes of each instruction, length of each instruction, number of arguments to each instruction, etc. These are things that libopcodes does not tell you nor handle (it decodes one address at a time, and those addresses aren't guaranteed to be instructions).

Here's a snippet from the developers of Opdis that I took from their manual (which I would suggest reading if you haven't, lots of good stuff in there about libopcodes):

The libopcodes library is a very serviceable disassembler, but it has three shortcomings:

it is under-documented, making it difficult for new users to understand

its feature set is limited to the disassembly of a single address

it is designed mainly to print disassembled instructions to a stream

Among other things, I think you might be getting stung by the second item in that list. Namely, the fact that most (all?) opcodes would fit into a single address and would agree with the observed output (e.g., you're getting the mov and pop and some register arguments). But what about tricky things like variable length instructions or instructions that aren't lining up exactly at the 4-byte boundaries? You're not doing anything to handle those.

The disassembly generated by libopcodes is a sequence of strings intended for writing to a stream. There is no metadata, so the strings must be examined to determine which are mnemonics and which are operands, and which of these are branch/jump/return instructions and what their targets are.

I'm guessing that Opdis is smarter than your program -- it knows how and what to look for in the stream. Perhaps sometimes it knows that it needs to read two addresses instead of just one before disassembling. From your code, and the description of libopcodes, neither is doing this.

Good luck! Remember to read that manual, and perhaps consider using libopdis instead!

I agree that `libopcodes` is a pain to use. The main reason I am using it is because of a requirement to work on top of the BFD abstraction. In fact, the disassembly of a target is only a small part of the project. The final vision is to be able to provide arbitrary executable editing. BFD provides convenient features for the instrumentation of extra code. — Mike Kwan, Feb 07 '12 at 10:15
In regards to the issues you describe with `libopcodes`, those are certainly genuine concerns but can all be worked around. Specifically, it is possible to tell the length of an instruction by seeing how many bytes are disassembled. Naturally, a prerequisite is that you are starting at an instruction boundary. I ensure this by starting control flow analysis of disassembly from the entry point of the target. — Mike Kwan, Feb 07 '12 at 10:17
The problems listed by `Opdis` are also legitimate concerns and can be worked around as follows. 1) yes... this is a problem and this question demonstrates it :p 2) with the control flow analysis approach I am taking, we don't need to worry about this; we just disassemble the next instruction and stop or branch appropriately at jmps/calls/rets, etc. 3) it is possible to redirect and override the print stream from the built-in fprintf to a custom function (which is what opdis itself does). — Mike Kwan, Feb 07 '12 at 10:19
I think you may have slightly misunderstood the second shortcoming pointed out by the opdis authors. What it is saying is that the library provides disassembly of a single address **at a time**, not that it provides disassembly for fixed-length instructions only. — Mike Kwan, Feb 07 '12 at 10:21
Much of the code I am using is similar to that in `libopdis`. The library is something I researched heavily as part of my initial studies and am now using as reference. However, Opdis provides only disassembly whereas I need to provide static executable editing. It also uses BFD but attempts to abstract this away. To change this is necessary but would result in a huge mess. Anyway, these comments are surely verbose but my hope is they may serve as some small form of documentation for some of the problems/workarounds I hit with `libopcodes` for future users. — Mike Kwan, Feb 07 '12 at 10:27

score 0 · Answer 3 · edited Oct 27 '12 at 03:26

0

Libopcodes prints disassembled instructions into the stream, which is intercepted by your custom_fprintf function. Your mistake is that you assume that custom_fprintf is called once each time a single instruction is disassembled; however, it is called more often than that: in particular, it is called separately to print each mnemonic, operand, address or separator.

So, resulting disassembly of your binary is

xor %ebp, %ebp

mov %rdx, %r9

pop %rsi

mov %rsp, %rdx

and $0xfffffffffffffff0, %rsp

push %rax

push %rsp

mov $0x401450,%r8

...

edited Oct 27 '12 at 03:26

Kjuly

34,476
22
104
118

answered Apr 30 '12 at 21:21

Alexandra Dmitrienko

1

Hi Alexandra. Thanks for the reply. I think you misunderstood my question though. I am aware that is how libopcodes works. Cheers! – Mike Kwan Apr 30 '12 at 22:39

How to get instruction information from libopcodes?

3 Answers

Linked