I tested your observation and, at least on recent kernels, it's false. I wrote this code to check:
#define _GNU_SOURCE
#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <assert.h>
#include <err.h>
#include <pthread.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/mman.h>
#include <sys/time.h>
#include <sys/resource.h>
#define TIMEOUT 4
/*
 * Print one report line to stdout with the CPU time (user + system) and the
 * total number of context switches (voluntary + involuntary) that
 * getrusage(RUSAGE_THREAD) reports for the caller.
 *
 * type: label printed in the "Type:" field of the report line.
 *
 * If getrusage() fails a warning is written to stderr and nothing is
 * reported.
 */
void print_usage(
    char *type)
{
    struct rusage use;

    /* On failure `use` is not filled in; do not print garbage from it. */
    if (getrusage(RUSAGE_THREAD, &use) != 0) {
        warn("getrusage failed");
        return;
    }

    /* Seconds plus microseconds, summed for user and system time. */
    double total_time = 0;
    total_time += use.ru_utime.tv_sec + ((double)use.ru_utime.tv_usec / 1000000);
    total_time += use.ru_stime.tv_sec + ((double)use.ru_stime.tv_usec / 1000000);

    long total_sw = use.ru_nvcsw + use.ru_nivcsw;

    /* total_sw is a long, so the conversion must be %ld rather than %d. */
    printf("Type: %s, CPU Time: %.3f seconds, Total context switches: %ld\n",
           type, total_time, total_sw);
}
/*
 * Synchronisation state handed to every worker (thread or process).
 * Workers block in pthread_spin_lock() on `spin` until whoever spawned
 * them releases it.
 */
struct worksync {
/* The lock the workers contend on. */
pthread_spinlock_t spin;
};
/*
 * Thread start routine for the threaded test.
 *
 * data: pointer to the struct worksync shared with the creating thread.
 *
 * Acquires the shared spinlock, reports this thread's resource usage while
 * holding it, releases it again and terminates the thread.
 */
void *spinner_thread(
    void *data)
{
    pthread_spinlock_t *lock = &((struct worksync *)data)->spin;

    pthread_spin_lock(lock);
    /* Report under the lock so only one worker prints at a time. */
    print_usage("Thread");
    pthread_spin_unlock(lock);

    pthread_exit(NULL);
}
/*
 * Fork a child process that runs the threaded variant of the test.
 *
 * ncpu:    number of spinner threads to create in the child.
 * timeout: number of seconds the child keeps the spinlock held.
 *
 * The parent returns immediately after a successful fork and does not wait
 * for the child here. The child never returns from this function: it takes
 * the spinlock, starts the threads, sleeps, releases the lock, joins all
 * threads and exits.
 */
void spawn_threaded_worker(
    int ncpu,
    int timeout)
{
    pid_t pid = fork();

    if (pid < 0)
        err(EXIT_FAILURE, "fork failed");
    if (pid != 0)
        return;

    /* Child process from here on. */
    pthread_t *threads = alloca(sizeof(pthread_t) * ncpu);
    struct worksync sync;
    int rc;
    int i;

    /* pthread functions return the error number instead of setting errno. */
    rc = pthread_spin_init(&sync.spin, PTHREAD_PROCESS_PRIVATE);
    if (rc != 0)
        errx(EXIT_FAILURE, "pthread_spin_init failed: %s", strerror(rc));

    /*
     * Take the lock BEFORE any worker exists. If the threads were created
     * first, one of them could win the lock and report its usage without
     * having spun at all.
     */
    pthread_spin_lock(&sync.spin);

    for (i = 0; i < ncpu; i++) {
        rc = pthread_create(&threads[i], NULL, spinner_thread, (void *)&sync);
        if (rc != 0)
            errx(EXIT_FAILURE, "pthread_create failed: %s", strerror(rc));
    }

    /* The workers burn CPU on the lock for the whole sleep. */
    sleep(timeout);
    pthread_spin_unlock(&sync.spin);

    for (i = 0; i < ncpu; i++)
        pthread_join(threads[i], NULL);

    exit(0);
}
/*
 * Body of one worker process in the forked test. Does not return.
 *
 * sync: synchronisation state shared with the process that forked us.
 *
 * Acquires the spinlock, reports this process's resource usage while
 * holding it, releases it and exits with status 0.
 */
void spinner_process(
    struct worksync *sync)
{
    pthread_spinlock_t *lock = &sync->spin;

    pthread_spin_lock(lock);
    /* Report under the lock so only one worker prints at a time. */
    print_usage("Process");
    pthread_spin_unlock(lock);

    exit(0);
}
/*
 * Fork a child process that runs the multi-process variant of the test.
 *
 * ncpu:    number of spinner processes the child forks.
 * timeout: number of seconds the child keeps the spinlock held.
 *
 * The parent returns immediately after a successful fork and does not wait
 * for the child here. The child never returns from this function: it puts
 * a spinlock in shared anonymous memory, takes it, forks the workers,
 * sleeps, releases the lock, reaps the workers and exits.
 */
void spawn_forked_worker(
    int ncpu,
    int timeout)
{
    int i;
    int status;
    pid_t pid = fork();

    if (pid < 0)
        err(EXIT_FAILURE, "fork failed");
    if (pid != 0)
        return;

    /* Child process from here on. */

    /* MAP_SHARED so the lock is the same object in every forked worker. */
    struct worksync *sync = mmap(NULL, sizeof *sync,
                                 PROT_READ|PROT_WRITE,
                                 MAP_ANONYMOUS|MAP_SHARED, -1, 0);
    /* A real check rather than assert(), which vanishes under NDEBUG. */
    if (sync == MAP_FAILED)
        err(EXIT_FAILURE, "mmap failed");

    /* pthread functions return the error number instead of setting errno. */
    int rc = pthread_spin_init(&sync->spin, PTHREAD_PROCESS_SHARED);
    if (rc != 0)
        errx(EXIT_FAILURE, "pthread_spin_init failed: %s", strerror(rc));

    /* Hold the lock before any worker exists so every worker has to spin. */
    pthread_spin_lock(&sync->spin);

    for (i = 0; i < ncpu; i++) {
        pid_t child = fork();

        if (child < 0) {
            /* Report first, while errno still describes the fork failure. */
            warn("fork failed");
            /*
             * Release the lock and reap the workers already started;
             * bailing out with the lock held would leave them spinning
             * forever.
             */
            pthread_spin_unlock(&sync->spin);
            while (i-- > 0)
                wait(&status);
            exit(EXIT_FAILURE);
        }
        if (child == 0)
            spinner_process(sync);   /* does not return */
    }

    /* The workers burn CPU on the lock for the whole sleep. */
    sleep(timeout);
    pthread_spin_unlock(&sync->spin);

    for (i = 0; i < ncpu; i++)
        wait(&status);

    exit(0);
}
/*
 * Run the threaded and the forked spinner tests concurrently, one worker of
 * each kind per online CPU, and wait for both test processes to finish.
 */
int main(
    void)
{
    int ncpu;
    int status;
    int i;

    ncpu = sysconf(_SC_NPROCESSORS_ONLN);
    /* A real check rather than assert(), which vanishes under NDEBUG. */
    if (ncpu <= 0)
        errx(EXIT_FAILURE, "could not determine the number of online CPUs");

    printf("Running %d threads and %d processes for %d seconds\n", ncpu, ncpu, TIMEOUT);
    /*
     * Flush before forking: when stdout is a pipe or a file the banner is
     * still sitting in the stdio buffer, and every child that inherits the
     * buffer would write it out again on exit.
     */
    fflush(stdout);

    spawn_threaded_worker(ncpu, TIMEOUT);
    spawn_forked_worker(ncpu, TIMEOUT);

    /* One wait per test process spawned above. */
    for (i = 0; i < 2; i++) {
        if (wait(&status) < 0)
            warn("wait failed");
    }

    exit(0);
}
It measures the CPU time spent performing a CPU-intensive piece of work (spinning on a spinlock) in both a threaded model and a forked model, both at the same time, using all of the system's CPUs. It then reports the CPU statistics.
My results show on a 4 CPU box:
With autogroup DISABLED
$ ./schedtest
Running 4 threads and 4 processes for 4 seconds
Type: Thread, CPU Time: 1.754 seconds, Total context switches: 213
Type: Thread, CPU Time: 1.758 seconds, Total context switches: 208
Type: Thread, CPU Time: 1.755 seconds, Total context switches: 217
Type: Process, CPU Time: 1.768 seconds, Total context switches: 251
Type: Process, CPU Time: 1.759 seconds, Total context switches: 209
Type: Thread, CPU Time: 1.772 seconds, Total context switches: 258
Type: Process, CPU Time: 1.752 seconds, Total context switches: 215
Type: Process, CPU Time: 1.756 seconds, Total context switches: 225
With autogroup ENABLED
$ ./schedtest
Running 4 threads and 4 processes for 4 seconds
Type: Thread, CPU Time: 0.495 seconds, Total context switches: 167
Type: Thread, CPU Time: 0.496 seconds, Total context switches: 167
Type: Thread, CPU Time: 0.430 seconds, Total context switches: 145
Type: Process, CPU Time: 0.430 seconds, Total context switches: 148
Type: Process, CPU Time: 0.440 seconds, Total context switches: 149
Type: Process, CPU Time: 0.440 seconds, Total context switches: 150
Type: Thread, CPU Time: 0.457 seconds, Total context switches: 153
Type: Process, CPU Time: 0.430 seconds, Total context switches: 144
You can clearly see that there is no kernel distinction between threads and processes.
I've no idea what you're doing but whatever it is doesn't conform to the way Linux works, at least for me.