I am tracing some processes and their children using ptrace. I am trying to print specific system call (using Seccomp filter that notifies ptrace, see this blogpost).
In most cases my code (see below) is working fine. However, when I am tracing a java program (from the default-jre package), the latter clones using the CLONE_THREAD
flag. And for some reason, my tracer hangs (I believe) because I can't receive signals from the cloned process. I think the reason is that (according to this discussion) the child process in fact becomes a child of the original process' parent, instead of becoming the original process' child.
I reproduced this issue by using a simple program that simply calls clone()
with flags and perform actions. When I used the when I use CLONE_THREAD | CLONE_SIGHAND | CLONE_VM
flags (as clone() documentation specifies they should come together since Linux 2.6.0), at least I am able to trace everything correctly until one of the two thread finishes.
I would like to trace both thread independently. Is it possible?
More importantly, I need to trace a Java program, and I cannot change it. Here a strace of the Java program clone call:
[...]
4665 clone(child_stack=0x7fb166e95fb0, flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM|CLONE_SETTLS|CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTID, parent_tid=[4666], tls=0x7fb166e96700, child_tidptr=0x7fb166e969d0) = 4666
[...]
So Java seems to respect the rules. I wanted to experiment to understand: I ruled out any flags unrelated to thread (i.e., `CLONE_FS | CLONE_FILES | CLONE_SYSVSEM).
Here are the results of running my test program with different combination of flags (I know, I am really desperate):
CLONE_VM|CLONE_SIGHAND|CLONE_THREAD|CLONE_SETTLS
: only gets trace from parentCLONE_VM|CLONE_SIGHAND|CLONE_THREAD|CLONE_PARENT_SETTID
: inconsistent; gets trace from both until the parent finishesCLONE_VM|CLONE_SIGHAND|CLONE_THREAD|CLONE_CHILD_CLEARTID
: inconsistent; gets trace from both until the child finishesCLONE_VM|CLONE_SIGHAND|CLONE_THREAD|CLONE_SETTLS|CLONE_PARENT_SETTID
: only gets trace from parentCLONE_VM|CLONE_SIGHAND|CLONE_THREAD|CLONE_SETTLS|CLONE_CHILD_CLEARTID
: only gets trace from parentCLONE_VM|CLONE_SIGHAND|CLONE_THREAD|CLONE_PARENT_SETTID|CLONE_SETTLS
: only gets trace from parentCLONE_VM|CLONE_SIGHAND|CLONE_THREAD|CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTID
: inconsistent; gets trace from both until the child finishesCLONE_VM|CLONE_SIGHAND|CLONE_THREAD|CLONE_CHILD_CLEARTID|CLONE_SETTLS
: only gets trace from parentCLONE_VM|CLONE_SIGHAND|CLONE_THREAD|CLONE_CHILD_CLEARTID|CLONE_PARENT_SETTID
: inconsistent; gets trace from both until the child finishesCLONE_VM|CLONE_SIGHAND|CLONE_THREAD|CLONE_SETTLS|CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTID
: only gets trace from parent
So at least I get the same behaviour from my program and the Java program: it does not work.
How can I make it work? For instance, how does strace
successfully traces any kind of clone? I tried to dig into its code but I can't find how they are doing it.
Any help might appreciated! Best regards,
The tracer code (compile with g++ tracer.cpp -o tracer -g -lseccomp -lexplain
):
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <errno.h>
#include <stddef.h>
#include <sys/ptrace.h>
#include <sys/reg.h>
#include <signal.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#include <sys/user.h>
#include <sys/prctl.h>
#include <fcntl.h>
#include <linux/limits.h>
#include <linux/filter.h>
#include <linux/seccomp.h>
#include <linux/unistd.h>
#include <libexplain/waitpid.h>
#include <tuple>
#include <vector>
#define DEFAULT_SIZE 1000
#define MAX_SIZE 1000
int process_signals();
int inspect(pid_t);
void read_string_into_buff(const pid_t, unsigned long long, char *, unsigned int);
int main(int argc, char **argv){
pid_t pid;
int status;
if (argc < 2) {
fprintf(stderr, "Usage: %s <prog> <arg1> ... <argN>\n", argv[0]);
return 1;
}
if ((pid = fork()) == 0) {
/* If execve syscall, trace */
struct sock_filter filter[] = {
BPF_STMT(BPF_LD+BPF_W+BPF_ABS, offsetof(struct seccomp_data, nr)),
BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_getpid, 0, 1),
BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_TRACE),
BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
};
struct sock_fprog prog = {
.len = (unsigned short) (sizeof(filter)/sizeof(filter[0])),
.filter = filter,
};
ptrace(PTRACE_TRACEME, 0, 0, 0);
if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) == -1) {
perror("prctl(PR_SET_NO_NEW_PRIVS)");
return 1;
}
if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) == -1) {
perror("when setting seccomp filter");
return 1;
}
kill(getpid(), SIGSTOP);
return execvp(argv[1], argv + 1);
} else {
waitpid(pid, &status, 0);
ptrace(PTRACE_SETOPTIONS, pid, 0, PTRACE_O_TRACESECCOMP | PTRACE_O_TRACEFORK | PTRACE_O_TRACECLONE | PTRACE_O_TRACEVFORK );
ptrace(PTRACE_CONT, pid, 0, 0);
process_signals();
return 0;
}
}
int process_signals(){
int status;
while (1){
pid_t child_pid;
// When child status changes
if ((child_pid = waitpid(-1, &status, 0)) < 0){
fprintf(stderr, "%s\n", explain_waitpid(child_pid, &status, 0));
exit(EXIT_FAILURE);
}
//printf("Sigtrap received\n");
// Checking if it is thanks to seccomp
if (status >> 8 == (SIGTRAP | (PTRACE_EVENT_SECCOMP << 8))){
// Perform argument inspection with ptrace
int syscall = inspect(child_pid);
}
// Resume no matter what
ptrace(PTRACE_CONT, child_pid, 0, 0);
}
}
int inspect(pid_t pid){
printf("From PID: %d\n", pid);
struct user_regs_struct regs;
ptrace(PTRACE_GETREGS, pid, 0, ®s);
// Get syscall number
int syscall = regs.orig_rax;
printf("------\nCaught syscall: %d\n", syscall);
if (syscall == __NR_getpid){
printf("Getpid detected\n");
}
return syscall;
}
void read_string_into_buff(const pid_t pid, unsigned long long addr, char * buff, unsigned int max_len){
/* Are we aligned on the "start" front? */
unsigned int offset=((unsigned long)addr)%sizeof(long);
addr-=offset;
unsigned int i=0;
int done=0;
int word_offset=0;
while( !done ) {
unsigned long word=ptrace( PTRACE_PEEKDATA, pid, addr+(word_offset++)*sizeof(long), 0 );
// While loop to stop at the first '\0' char indicating end of string
while( !done && offset<sizeof(long) && i<max_len ) {
buff[i]=((char *)&word)[offset]; /* Endianity neutral copy */
done=buff[i]=='\0';
++i;
++offset;
}
offset=0;
done=done || i>=max_len;
}
}
The sample program (compile with gcc sample.c -o sample
):
#define _GNU_SOURCE
#include <stdio.h>
#include <sched.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>
#include <signal.h>
#define FLAGS CLONE_VM|CLONE_SIGHAND|CLONE_THREAD|CLONE_SETTLS|CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTID
int fn(void *arg)
{
printf("\nINFO: This code is running under child process.\n");
int i = 0;
int n = atoi(arg);
for ( i = 1 ; i <= 10 ; i++ )
printf("[%d] %d * %d = %d\n", getpid(), n, i, (n*i));
printf("\n");
return 0;
}
void main(int argc, char *argv[])
{
printf("[%d] Hello, World!\n", getpid());
void *pchild_stack = malloc(1024 * 1024);
if ( pchild_stack == NULL ) {
printf("ERROR: Unable to allocate memory.\n");
exit(EXIT_FAILURE);
}
int pid = clone(fn, pchild_stack + (1024 * 1024), FLAGS, argv[1]);
if ( pid < 0 ) {
printf("ERROR: Unable to create the child process.\n");
exit(EXIT_FAILURE);
}
fn(argv[1]);
wait(NULL);
free(pchild_stack);
printf("INFO: Child process terminated.\n");
}
You can test what you want by running ./tracer ./sample
. You can also test the original test case ./tracer java
and observe that both the tracer and java hangs.
ANSWER: As pointed it out in the comment, I had issues in that example that were preventing me from handling signals from the child.
In my original code (not listed here because too complex), I was only attaching ptrace AFTER the processes started... and I was only attaching to PID listed by pstree. My mistake was that I omitted the threads (and java is one program that does create threads), explaining why I had issue tracing java only.
I modified the code to attach to all the children process and thread (ps -L -g <Main_PID> -o tid=
) and everything works again.