/*
 * Copyright (c) Facebook, Inc.
 * Licensed under the Apache License, Version 2.0 (the "License")
 */
#include <string>

namespace ebpf {
namespace pyperf {

extern const std::string PYPERF_BPF_PROGRAM = R"(
#include <linux/sched.h>
#include <uapi/linux/ptrace.h>

#define PYTHON_STACK_FRAMES_PER_PROG 25
#define PYTHON_STACK_PROG_CNT 3
#define STACK_MAX_LEN (PYTHON_STACK_FRAMES_PER_PROG * PYTHON_STACK_PROG_CNT)
#define CLASS_NAME_LEN 32
#define FUNCTION_NAME_LEN 64
#define FILE_NAME_LEN 128
#define TASK_COMM_LEN 16

enum {
  STACK_STATUS_COMPLETE = 0,
  STACK_STATUS_ERROR = 1,
  STACK_STATUS_TRUNCATED = 2,
};

enum {
  GIL_STATE_NO_INFO = 0,
  GIL_STATE_ERROR = 1,
  GIL_STATE_UNINITIALIZED = 2,
  GIL_STATE_NOT_LOCKED = 3,
  GIL_STATE_THIS_THREAD = 4,
  GIL_STATE_GLOBAL_CURRENT_THREAD = 5,
  GIL_STATE_OTHER_THREAD = 6,
  GIL_STATE_NULL = 7,
};

enum {
  THREAD_STATE_UNKNOWN = 0,
  THREAD_STATE_MATCH = 1,
  THREAD_STATE_MISMATCH = 2,
  THREAD_STATE_THIS_THREAD_NULL = 3,
  THREAD_STATE_GLOBAL_CURRENT_THREAD_NULL = 4,
  THREAD_STATE_BOTH_NULL = 5,
};

enum {
  PTHREAD_ID_UNKNOWN = 0,
  PTHREAD_ID_MATCH = 1,
  PTHREAD_ID_MISMATCH = 2,
  PTHREAD_ID_THREAD_STATE_NULL = 3,
  PTHREAD_ID_NULL = 4,
  PTHREAD_ID_ERROR = 5,
};

typedef struct {
  int64_t PyObject_type;
  int64_t PyTypeObject_name;
  int64_t PyThreadState_frame;
  int64_t PyThreadState_thread;
  int64_t PyFrameObject_back;
  int64_t PyFrameObject_code;
  int64_t PyFrameObject_lineno;
  int64_t PyFrameObject_localsplus;
  int64_t PyCodeObject_filename;
  int64_t PyCodeObject_name;
  int64_t PyCodeObject_varnames;
  int64_t PyTupleObject_item;
  int64_t String_data;
  int64_t String_size;
} OffsetConfig;

typedef struct {
  uintptr_t current_state_addr;   // virtual address of _PyThreadState_Current
  uintptr_t tls_key_addr;         // virtual address of autoTLSkey for pthreads TLS
  uintptr_t gil_locked_addr;      // virtual address of gil_locked
  uintptr_t gil_last_holder_addr; // virtual address of gil_last_holder
  OffsetConfig offsets;
} PidData;

typedef struct {
  char classname[CLASS_NAME_LEN];
  char name[FUNCTION_NAME_LEN];
  char file[FILE_NAME_LEN];
  // NOTE: PyFrameObject also has a line number, but it is typically just the
  // first line of that function; PyCode_Addr2Line needs to be called
  // to get the actual line
} Symbol;

typedef struct {
  uint32_t pid;
  uint32_t tid;
  char comm[TASK_COMM_LEN];
  uint8_t thread_state_match;
  uint8_t gil_state;
  uint8_t pthread_id_match;
  uint8_t stack_status;
  // instead of storing the symbol name here directly, we add it to another
  // hashmap with Symbols and only store the ids here
  int64_t stack_len;
  int32_t stack[STACK_MAX_LEN];
} Event;

#define _STR_CONCAT(str1, str2) str1##str2
#define STR_CONCAT(str1, str2) _STR_CONCAT(str1, str2)
#define FAIL_COMPILATION_IF(condition)            \
  typedef struct {                                \
    char _condition_check[1 - 2 * !!(condition)]; \
  } STR_CONCAT(compile_time_condition_check, __COUNTER__);

// See comments in get_frame_data
FAIL_COMPILATION_IF(sizeof(Symbol) == sizeof(struct bpf_perf_event_value))

typedef struct {
  OffsetConfig offsets;
  uint64_t cur_cpu;
  int64_t symbol_counter;
  void* frame_ptr;
  int64_t python_stack_prog_call_cnt;
  Event event;
} sample_state_t;

BPF_PERCPU_ARRAY(state_heap, sample_state_t, 1);
BPF_HASH(symbols, Symbol, int32_t, __SYMBOLS_SIZE__);
BPF_HASH(pid_config, pid_t, PidData);
BPF_PROG_ARRAY(progs, 1);
BPF_PERF_OUTPUT(events);
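
// NOTE: __SYMBOLS_SIZE__ (capacity of the symbols map above), NUM_CPUS (used
// in get_symbol_id below) and PYTHON_STACK_PROG_IDX (index into the progs
// tail-call array) are not defined in this program. NUM_CPUS is documented
// below as coming from the PyPerf backend via a CFlag, and the other two are
// presumably supplied the same way, e.g. (illustrative values only):
//   -D__SYMBOLS_SIZE__=16384 -DNUM_CPUS=8 -DPYTHON_STACK_PROG_IDX=0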

static inline __attribute__((__always_inline__)) void* get_thread_state(
    void* tls_base,
    PidData* pid_data) {
  // Python sets the thread_state using pthread_setspecific with the key
  // stored in the global variable autoTLSkey.
  // We read the value of the key from the global variable and then read
  // the value in thread-local storage. This relies on the pthread
  // implementation. It is basically the same as running the following in GDB:
  //   p *(PyThreadState*)((struct pthread*)pthread_self())->
  //       specific_1stblock[autoTLSkey]->data
  int key;
  bpf_probe_read_user(&key, sizeof(key), (void*)pid_data->tls_key_addr);
  // This assumes autoTLSkey < 32, which means that the TLS is stored in
  //   pthread->specific_1stblock[autoTLSkey]
  // 0x310 is offsetof(struct pthread, specific_1stblock),
  // 0x10 is sizeof(pthread_key_data),
  // 0x8 is offsetof(struct pthread_key_data, data)
  // 'struct pthread' is not in the public API, so we have to hardcode
  // the offsets here
  void* thread_state;
  bpf_probe_read_user(
      &thread_state,
      sizeof(thread_state),
      tls_base + 0x310 + key * 0x10 + 0x08);
  return thread_state;
}

static inline __attribute__((__always_inline__)) int submit_sample(
    struct pt_regs* ctx,
    sample_state_t* state) {
  events.perf_submit(ctx, &state->event, sizeof(Event));
  return 0;
}

// This function is trivial, but we need to do the map lookup in a separate
// function because BCC doesn't allow direct map calls (including lookups)
// from inside a macro (which we want to do in the GET_STATE() macro below).
static inline __attribute__((__always_inline__)) sample_state_t* get_state() {
  int zero = 0;
  return state_heap.lookup(&zero);
}

#define GET_STATE()                     \
  sample_state_t* state = get_state();  \
  if (!state) {                         \
    return 0; /* should never happen */ \
  }

static inline __attribute__((__always_inline__)) int get_thread_state_match(
    void* this_thread_state,
    void* global_thread_state) {
  if (this_thread_state == 0 && global_thread_state == 0) {
    return THREAD_STATE_BOTH_NULL;
  }
  if (this_thread_state == 0) {
    return THREAD_STATE_THIS_THREAD_NULL;
  }
  if (global_thread_state == 0) {
    return THREAD_STATE_GLOBAL_CURRENT_THREAD_NULL;
  }
  if (this_thread_state == global_thread_state) {
    return THREAD_STATE_MATCH;
  } else {
    return THREAD_STATE_MISMATCH;
  }
}

static inline __attribute__((__always_inline__)) int get_gil_state(
    void* this_thread_state,
    void* global_thread_state,
    PidData* pid_data) {
  // Get information about the GIL state
  if (pid_data->gil_locked_addr == 0 || pid_data->gil_last_holder_addr == 0) {
    return GIL_STATE_NO_INFO;
  }

  int gil_locked = 0;
  void* gil_thread_state = 0;
  if (bpf_probe_read_user(
          &gil_locked, sizeof(gil_locked), (void*)pid_data->gil_locked_addr)) {
    return GIL_STATE_ERROR;
  }

  switch (gil_locked) {
    case -1:
      return GIL_STATE_UNINITIALIZED;
    case 0:
      return GIL_STATE_NOT_LOCKED;
    case 1:
      // GIL is held by some thread
      bpf_probe_read_user(
          &gil_thread_state,
          sizeof(void*),
          (void*)pid_data->gil_last_holder_addr);
      if (gil_thread_state == this_thread_state) {
        return GIL_STATE_THIS_THREAD;
      } else if (gil_thread_state == global_thread_state) {
        return GIL_STATE_GLOBAL_CURRENT_THREAD;
      } else if (gil_thread_state == 0) {
        return GIL_STATE_NULL;
      } else {
        return GIL_STATE_OTHER_THREAD;
      }
    default:
      return GIL_STATE_ERROR;
  }
}

static inline __attribute__((__always_inline__)) int
get_pthread_id_match(void* thread_state, void* tls_base, PidData* pid_data) {
  if (thread_state == 0) {
    return PTHREAD_ID_THREAD_STATE_NULL;
  }

  uint64_t pthread_self, pthread_created;

  bpf_probe_read_user(
      &pthread_created,
      sizeof(pthread_created),
      thread_state + pid_data->offsets.PyThreadState_thread);
  if (pthread_created == 0) {
    return PTHREAD_ID_NULL;
  }

  // 0x10 = offsetof(struct pthread, header.self)
  bpf_probe_read_user(&pthread_self, sizeof(pthread_self), tls_base + 0x10);
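  // In glibc/NPTL, pthread_t is the address of 'struct pthread' itself and
  // header.self points back at that descriptor, so the value read above
  // should equal what pthread_self() returns in userspace for this thread.
  // This is an assumption about the glibc layout, consistent with the
  // hardcoded 0x10 offset used here.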
  if (pthread_self == 0) {
    return PTHREAD_ID_ERROR;
  }

  if (pthread_self == pthread_created) {
    return PTHREAD_ID_MATCH;
  } else {
    return PTHREAD_ID_MISMATCH;
  }
}

int on_event(struct pt_regs* ctx) {
  uint64_t pid_tgid = bpf_get_current_pid_tgid();
  pid_t pid = (pid_t)(pid_tgid >> 32);
  PidData* pid_data = pid_config.lookup(&pid);
  if (!pid_data) {
    return 0;
  }

  GET_STATE();

  state->offsets = pid_data->offsets;
  state->cur_cpu = bpf_get_smp_processor_id();
  state->python_stack_prog_call_cnt = 0;

  Event* event = &state->event;
  event->pid = pid;
  event->tid = (pid_t)pid_tgid;
  bpf_get_current_comm(&event->comm, sizeof(event->comm));

  // Get the pointer to the global PyThreadState, which should belong to the
  // thread that currently holds the GIL
  void* global_current_thread = (void*)0;
  bpf_probe_read_user(
      &global_current_thread,
      sizeof(global_current_thread),
      (void*)pid_data->current_state_addr);

  struct task_struct* task = (struct task_struct*)bpf_get_current_task();
#if __x86_64__
// thread_struct->fs was renamed to fsbase in
// https://github.com/torvalds/linux/commit/296f781a4b7801ad9c1c0219f9e87b6c25e196fe
// so depending on the kernel version, we need to account for that
#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 7, 0)
  void* tls_base = (void*)task->thread.fs;
#else
  void* tls_base = (void*)task->thread.fsbase;
#endif
#elif __aarch64__
  void* tls_base = (void*)task->thread.tp_value;
#else
#error "Unsupported platform"
#endif

  // Read the PyThreadState of this thread from TLS
  void* thread_state = get_thread_state(tls_base, pid_data);

  // Check for a match between the TLS PyThreadState and
  // the global _PyThreadState_Current
  event->thread_state_match =
      get_thread_state_match(thread_state, global_current_thread);

  // Read the GIL state
  event->gil_state =
      get_gil_state(thread_state, global_current_thread, pid_data);

  // Check for a match between the pthread that created the current
  // PyThreadState and the actual current pthread
  event->pthread_id_match =
      get_pthread_id_match(thread_state, tls_base, pid_data);

  // pre-initialize the event struct in case any subprogram below fails
  event->stack_status = STACK_STATUS_COMPLETE;
  event->stack_len = 0;

  if (thread_state != 0) {
    // Get the pointer to the top frame from PyThreadState
    bpf_probe_read_user(
        &state->frame_ptr,
        sizeof(void*),
        thread_state + pid_data->offsets.PyThreadState_frame);
    // jump to reading the first set of Python frames
    progs.call(ctx, PYTHON_STACK_PROG_IDX);
    // we won't ever get here
  }

  return submit_sample(ctx, state);
}

static inline __attribute__((__always_inline__)) void get_names(
    void* cur_frame,
    void* code_ptr,
    OffsetConfig* offsets,
    Symbol* symbol,
    void* ctx) {
  // Figure out if we want to parse the class name, basically by checking the
  // name of the first argument,
  //   ((PyTupleObject*)$frame->f_code->co_varnames)->ob_item[0]
  // If it's 'self', we get the type and its name; if it's 'cls', we just get
  // the name. This is not perfect, but there is no better way to figure this
  // out from the code object.
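  // For example, for a method defined as
  //   def run(self, n): ...
  // co_varnames is ('self', 'n'), so ob_item[0] is the string 'self' and we
  // will also try to resolve the enclosing class name below.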
  void* args_ptr;
  bpf_probe_read_user(
      &args_ptr, sizeof(void*), code_ptr + offsets->PyCodeObject_varnames);
  bpf_probe_read_user(
      &args_ptr, sizeof(void*), args_ptr + offsets->PyTupleObject_item);
  bpf_probe_read_user_str(
      &symbol->name, sizeof(symbol->name), args_ptr + offsets->String_data);

  // compare strings as ints to save instructions
  char self_str[4] = {'s', 'e', 'l', 'f'};
  char cls_str[4] = {'c', 'l', 's', '\0'};
  bool first_self = *(int32_t*)symbol->name == *(int32_t*)self_str;
  bool first_cls = *(int32_t*)symbol->name == *(int32_t*)cls_str;

  // We re-use the same Symbol instance across loop iterations, which means
  // we will have left-over data in the struct. Although this won't affect
  // the correctness of the result, because we have '\0' at the end of the
  // strings read, it would affect the effectiveness of the deduplication.
  // The helper bpf_perf_prog_read_value clears the buffer on error, so here
  // we (ab)use this behavior to clear the memory. It requires the size of
  // Symbol to be different from struct bpf_perf_event_value, which we check
  // at compilation time using the FAIL_COMPILATION_IF macro.
  bpf_perf_prog_read_value(ctx, symbol, sizeof(Symbol));

  // Read class name from $frame->f_localsplus[0]->ob_type->tp_name.
  if (first_self || first_cls) {
    void* ptr;
    bpf_probe_read_user(
        &ptr, sizeof(void*), cur_frame + offsets->PyFrameObject_localsplus);
    if (first_self) {
      // we are working with an instance; first we need to get its type
      bpf_probe_read_user(&ptr, sizeof(void*), ptr + offsets->PyObject_type);
    }
    bpf_probe_read_user(&ptr, sizeof(void*), ptr + offsets->PyTypeObject_name);
    bpf_probe_read_user_str(
        &symbol->classname, sizeof(symbol->classname), ptr);
  }

  void* pystr_ptr;
  // read PyCodeObject's filename into the symbol
  bpf_probe_read_user(
      &pystr_ptr, sizeof(void*), code_ptr + offsets->PyCodeObject_filename);
  bpf_probe_read_user_str(
      &symbol->file, sizeof(symbol->file), pystr_ptr + offsets->String_data);
  // read PyCodeObject's name into the symbol
  bpf_probe_read_user(
      &pystr_ptr, sizeof(void*), code_ptr + offsets->PyCodeObject_name);
  bpf_probe_read_user_str(
      &symbol->name, sizeof(symbol->name), pystr_ptr + offsets->String_data);
}

// get_frame_data reads the current PyFrameObject's filename/name and updates
// frame_ptr in place with a pointer to the next PyFrameObject
static inline __attribute__((__always_inline__)) bool get_frame_data(
    void** frame_ptr,
    OffsetConfig* offsets,
    Symbol* symbol,
    // ctx is only used to call the helper that clears the symbol,
    // see the comment in get_names
    void* ctx) {
  void* cur_frame = *frame_ptr;
  if (!cur_frame) {
    return false;
  }
  void* code_ptr;
  // read the PyCodeObject first; if that fails, there is no point reading
  // the next frame
  bpf_probe_read_user(
      &code_ptr, sizeof(void*), cur_frame + offsets->PyFrameObject_code);
  if (!code_ptr) {
    return false;
  }

  get_names(cur_frame, code_ptr, offsets, symbol, ctx);

  // read the next PyFrameObject pointer, update in place
  bpf_probe_read_user(
      frame_ptr, sizeof(void*), cur_frame + offsets->PyFrameObject_back);

  return true;
}

// To avoid duplicate ids, every CPU needs to use different ids when inserting
// into the hashmap. NUM_CPUS is defined on the PyPerf backend side and passed
// through a CFlag.
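// For example, with NUM_CPUS == 8, CPU 3 hands out ids 3, 11, 19, ...
// (symbol_counter * NUM_CPUS + cur_cpu), so ids generated on different CPUs
// never collide.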
static inline __attribute__((__always_inline__)) int64_t get_symbol_id(
    sample_state_t* state,
    Symbol* sym) {
  int32_t* symbol_id_ptr = symbols.lookup(sym);
  if (symbol_id_ptr) {
    return *symbol_id_ptr;
  }
  // the symbol is new, bump the counter
  int32_t symbol_id = state->symbol_counter * NUM_CPUS + state->cur_cpu;
  state->symbol_counter++;
  symbols.update(sym, &symbol_id);
  return symbol_id;
}

int read_python_stack(struct pt_regs* ctx) {
  GET_STATE();

  state->python_stack_prog_call_cnt++;
  Event* sample = &state->event;

  Symbol sym = {};
  bool last_res = false;
#pragma unroll
  for (int i = 0; i < PYTHON_STACK_FRAMES_PER_PROG; i++) {
    last_res = get_frame_data(&state->frame_ptr, &state->offsets, &sym, ctx);
    if (last_res) {
      uint32_t symbol_id = get_symbol_id(state, &sym);
      int64_t cur_len = sample->stack_len;
      if (cur_len >= 0 && cur_len < STACK_MAX_LEN) {
        sample->stack[cur_len] = symbol_id;
        sample->stack_len++;
      }
    }
  }

  if (!state->frame_ptr) {
    sample->stack_status = STACK_STATUS_COMPLETE;
  } else {
    if (!last_res) {
      sample->stack_status = STACK_STATUS_ERROR;
    } else {
      sample->stack_status = STACK_STATUS_TRUNCATED;
    }
  }

  if (sample->stack_status == STACK_STATUS_TRUNCATED &&
      state->python_stack_prog_call_cnt < PYTHON_STACK_PROG_CNT) {
    // read the next batch of frames
    progs.call(ctx, PYTHON_STACK_PROG_IDX);
  }

  return submit_sample(ctx, state);
}
)";
} // namespace pyperf
} // namespace ebpf
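
// Usage sketch (illustration only, not part of this file): loading this
// source with BCC's C++ API. The cflag values and table size below are
// assumptions, not the actual PyPerf backend configuration.
//
//   #include <bcc/BPF.h>
//
//   ebpf::BPF bpf;
//   std::vector<std::string> cflags = {
//       "-DNUM_CPUS=" + std::to_string(sysconf(_SC_NPROCESSORS_ONLN)),
//       "-D__SYMBOLS_SIZE__=16384",
//       "-DPYTHON_STACK_PROG_IDX=0"};
//   bpf.init(ebpf::pyperf::PYPERF_BPF_PROGRAM, cflags);
//   // populate the pid_config table for each profiled Python process,
//   // load read_python_stack into progs[PYTHON_STACK_PROG_IDX], and attach
//   // on_event to a sampling perf event.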