Performance Optimization Techniques

Introduction

Maximizing performance in GDExtension: This guide covers advanced optimization techniques, from minimizing cross-boundary calls to cache-friendly data structures, pooled memory, and spatial algorithms. Understanding these patterns is crucial for building high-performance extensions that can handle demanding real-time applications.

Cross-Boundary Call Optimization

Minimizing the cost of engine communication: Every call across the GDExtension boundary has overhead. Reducing and optimizing these calls is often the biggest performance win.

Batch Operations

class BatchProcessor : public RefCounted {
    GDCLASS(BatchProcessor, RefCounted)

private:
    Vector<Vector3> pending_positions;
    Vector<Node3D*> target_nodes;

protected:
    static void _bind_methods() {
        ClassDB::bind_method(D_METHOD("add_position_update", "node", "position"),
                           &BatchProcessor::add_position_update);
        ClassDB::bind_method(D_METHOD("flush_updates"),
                           &BatchProcessor::flush_updates);
    }

public:
    // Bad: Individual cross-boundary calls
    void update_position_immediate(Node3D *node, const Vector3 &pos) {
        node->set_position(pos);  // Cross-boundary call per update
    }

    // Good: Batch updates
    void add_position_update(Node3D *node, const Vector3 &pos) {
        // Note: raw pointers must stay valid until flush_updates() runs;
        // store ObjectIDs instead if nodes can be freed in between
        target_nodes.append(node);
        pending_positions.append(pos);
    }

    void flush_updates() {
        // Single batch of cross-boundary calls
        for (int i = 0; i < target_nodes.size(); i++) {
            target_nodes[i]->set_position(pending_positions[i]);
        }
        target_nodes.clear();
        pending_positions.clear();
    }
};
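
A minimal sketch of how a game system might drive this batching (the EnemyManager class, enemies list, and compute_target helper below are hypothetical stand-ins for your own game logic): queue positions while the logic runs, then flush so the engine setters execute in one tight burst per frame.

// Hypothetical caller: queue during logic, flush once per frame
void EnemyManager::_physics_process(double delta) {
    for (int i = 0; i < enemies.size(); i++) {
        batch->add_position_update(enemies[i], compute_target(i, delta));
    }
    batch->flush_updates();  // The only cross-boundary burst this frame
}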

Cache Engine Data

class CachedNodeManager : public Node {
    GDCLASS(CachedNodeManager, Node)

private:
    struct NodeCache {
        Node *node;
        Vector3 last_position;
        Vector3 last_scale;
        bool position_dirty = false;
        bool scale_dirty = false;
    };

    Vector<NodeCache> cached_nodes;

public:
    void cache_node(Node *node) {
        NodeCache cache;
        cache.node = node;

        // Cache current values to avoid repeated queries
        if (Node3D *node3d = Object::cast_to<Node3D>(node)) {
            cache.last_position = node3d->get_position();
            cache.last_scale = node3d->get_scale();
        }

        cached_nodes.append(cache);
    }

    void update_cached_position(int index, const Vector3 &new_pos) {
        ERR_FAIL_INDEX(index, cached_nodes.size());

        NodeCache &cache = cached_nodes.write[index];
        // Epsilon check: skip sub-millimeter moves so unchanged nodes
        // never generate an engine call on flush
        if (cache.last_position.distance_squared_to(new_pos) > 0.001f) {
            cache.last_position = new_pos;
            cache.position_dirty = true;
        }
    }

    void flush_dirty_updates() {
        for (NodeCache &cache : cached_nodes) {
            Node3D *node3d = Object::cast_to<Node3D>(cache.node);
            if (!node3d) continue;

            if (cache.position_dirty) {
                node3d->set_position(cache.last_position);
                cache.position_dirty = false;
            }

            if (cache.scale_dirty) {
                node3d->set_scale(cache.last_scale);
                cache.scale_dirty = false;
            }
        }
    }
};
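
One way to drive the flush, assuming _process is declared as an override on CachedNodeManager, is to piggyback on the node's own frame callback; a minimal sketch:

// Sketch: update_cached_position() can be called from anywhere during
// the frame; dirty state reaches the engine in a single pass here.
void CachedNodeManager::_process(double p_delta) {
    flush_dirty_updates();
}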

Reduce Method Call Overhead

class OptimizedCalculator : public RefCounted {
    GDCLASS(OptimizedCalculator, RefCounted)

private:
    PackedFloat32Array results_buffer;

protected:
    static void _bind_methods() {
        // Bad: Multiple calls for multiple results
        ClassDB::bind_method(D_METHOD("calculate_single", "value"),
                           &OptimizedCalculator::calculate_single);

        // Good: Single call for batch processing
        ClassDB::bind_method(D_METHOD("calculate_batch", "values"),
                           &OptimizedCalculator::calculate_batch);
    }

public:
    // Bad: Called many times from GDScript
    float calculate_single(float value) {
        return Math::sqrt(value * value + 1.0f);
    }

    // Good: Process entire array in one call
    PackedFloat32Array calculate_batch(const PackedFloat32Array &values) {
        const int64_t count = values.size();
        results_buffer.resize(count);
        const float *input = values.ptr();
        float *output = results_buffer.ptrw();

        // Tight loop over raw pointers; no per-element Variant conversion
        for (int64_t i = 0; i < count; i++) {
            output[i] = Math::sqrt(input[i] * input[i] + 1.0f);
        }

        return results_buffer;
    }
};

Memory Optimization

Making every cache line count: How data is laid out in memory often matters as much as the work done on it. Cache-friendly layouts and pooled allocations keep hot loops in the cache and out of the allocator.

Cache-Friendly Data Structures

class ParticleSystem : public Node2D {
    GDCLASS(ParticleSystem, Node2D)

protected:
    static void _bind_methods() {}  // GDCLASS requires this even when nothing is exposed

private:
    // Bad: Array of Objects (cache-unfriendly)
    // Vector<Ref<Particle>> particles;

    // Good: Structure of Arrays (cache-friendly)
    struct ParticleData {
        PackedVector2Array positions;
        PackedFloat32Array velocities_x;
        PackedFloat32Array velocities_y;
        PackedFloat32Array lifetimes;
        PackedInt32Array colors;  // Packed as integers

        int count = 0;
        int capacity = 0;

        void resize(int new_capacity) {
            capacity = new_capacity;
            positions.resize(capacity);
            velocities_x.resize(capacity);
            velocities_y.resize(capacity);
            lifetimes.resize(capacity);
            colors.resize(capacity);
        }

        void add_particle(const Vector2 &pos, const Vector2 &vel,
                         float lifetime, uint32_t color) {
            if (count >= capacity) {
                resize(capacity == 0 ? 64 : capacity * 2);
            }

            positions.set(count, pos);
            velocities_x.set(count, vel.x);
            velocities_y.set(count, vel.y);
            lifetimes.set(count, lifetime);
            colors.set(count, color);
            count++;
        }
    };

    ParticleData particles;

public:
    void _process(double delta) override {
        const float dt = static_cast<float>(delta);  // Convert once, outside the loops

        // SIMD-friendly processing over raw pointers
        Vector2 *pos = particles.positions.ptrw();
        float *vel_x = particles.velocities_x.ptrw();
        float *vel_y = particles.velocities_y.ptrw();
        float *life = particles.lifetimes.ptrw();

        // Process 4 particles at once (manual unroll; helps the compiler vectorize)
        int simd_count = particles.count & ~3;  // Round down to multiple of 4

        for (int i = 0; i < simd_count; i += 4) {
            // Update positions
            pos[i].x += vel_x[i] * dt;
            pos[i].y += vel_y[i] * dt;
            pos[i+1].x += vel_x[i+1] * dt;
            pos[i+1].y += vel_y[i+1] * dt;
            pos[i+2].x += vel_x[i+2] * dt;
            pos[i+2].y += vel_y[i+2] * dt;
            pos[i+3].x += vel_x[i+3] * dt;
            pos[i+3].y += vel_y[i+3] * dt;

            // Update lifetimes
            life[i] -= dt;
            life[i+1] -= dt;
            life[i+2] -= dt;
            life[i+3] -= dt;
        }

        // Handle remaining particles
        for (int i = simd_count; i < particles.count; i++) {
            pos[i].x += vel_x[i] * dt;
            pos[i].y += vel_y[i] * dt;
            life[i] -= dt;
        }

        // Remove dead particles
        compact_particles();
    }

private:
    void compact_particles() {
        int write_index = 0;
        const float *life = particles.lifetimes.ptr();

        for (int read_index = 0; read_index < particles.count; read_index++) {
            if (life[read_index] > 0.0f) {
                if (write_index != read_index) {
                    // Move particle data
                    particles.positions.set(write_index, particles.positions[read_index]);
                    particles.velocities_x.set(write_index, particles.velocities_x[read_index]);
                    particles.velocities_y.set(write_index, particles.velocities_y[read_index]);
                    particles.lifetimes.set(write_index, particles.lifetimes[read_index]);
                    particles.colors.set(write_index, particles.colors[read_index]);
                }
                write_index++;
            }
        }

        particles.count = write_index;
    }
};
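
The write path benefits in the same way. A hypothetical emission helper (spawn_burst is not part of the class above) shows that every field lands in a packed array and no per-particle objects are allocated:

// Sketch: feeding the structure-of-arrays layout from emission code
void ParticleSystem::spawn_burst(const Vector2 &origin, int amount) {
    for (int i = 0; i < amount; i++) {
        float angle = (float)i / (float)amount * 6.2831853f;
        particles.add_particle(origin,
                Vector2(Math::cos(angle), Math::sin(angle)) * 50.0f,
                2.0f,          // lifetime in seconds
                0xFFFFFFFF);   // color packed as RGBA8
    }
}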

Memory Pool Allocators

class MemoryPool {
private:
    struct Block {
        bool in_use;
        size_t size;
        uint8_t *data;
        Block *next;
    };

    Block *free_list = nullptr;
    Vector<uint8_t> pool_memory;
    size_t pool_size;
    size_t block_size;
    int num_blocks;

public:
    MemoryPool(size_t p_block_size, int p_num_blocks)
        : block_size(p_block_size), num_blocks(p_num_blocks) {

        // Note: data blocks inherit malloc-level alignment from the Vector's
        // backing allocation; block_size should be a multiple of the object's alignment
        pool_size = block_size * num_blocks + sizeof(Block) * num_blocks;
        pool_memory.resize(pool_size);

        // Initialize free list
        uint8_t *memory = pool_memory.ptrw();
        Block *blocks = reinterpret_cast<Block*>(memory);
        uint8_t *data_start = memory + sizeof(Block) * num_blocks;

        for (int i = 0; i < num_blocks; i++) {
            blocks[i].in_use = false;
            blocks[i].size = block_size;
            blocks[i].data = data_start + i * block_size;
            blocks[i].next = (i == num_blocks - 1) ? nullptr : &blocks[i + 1];
        }

        free_list = blocks;
    }

    void* allocate(size_t size) {
        if (size > block_size || !free_list) {
            return nullptr;  // Fallback to system allocator
        }

        Block *block = free_list;
        free_list = block->next;
        block->in_use = true;
        return block->data;
    }

    // Returns true if the pointer belonged to this pool
    bool deallocate(void *ptr) {
        if (!ptr) return false;

        uint8_t *memory = pool_memory.ptrw();
        Block *blocks = reinterpret_cast<Block*>(memory);
        uint8_t *data_start = memory + sizeof(Block) * num_blocks;

        // O(1) lookup: derive the block index from the pointer offset
        // instead of scanning every block
        ptrdiff_t offset = static_cast<uint8_t*>(ptr) - data_start;
        if (offset < 0 || offset >= static_cast<ptrdiff_t>(block_size * num_blocks)) {
            return false;  // Not from this pool
        }

        int i = static_cast<int>(offset / block_size);
        blocks[i].in_use = false;
        blocks[i].next = free_list;
        free_list = &blocks[i];
        return true;
    }
};

class PoolAllocatedClass : public RefCounted {
    GDCLASS(PoolAllocatedClass, RefCounted)

private:
    static MemoryPool *pool;

public:
    static void initialize_pool() {
        if (!pool) {
            pool = new MemoryPool(sizeof(PoolAllocatedClass), 1000);
        }
    }

    static void cleanup_pool() {
        delete pool;
        pool = nullptr;
    }

    void* operator new(size_t size) {
        if (pool) {
            void *ptr = pool->allocate(size);
            if (ptr) return ptr;
        }
        return ::operator new(size);
    }

    void operator delete(void *ptr) {
        // Fall through to the system allocator when the pointer didn't
        // come from the pool (e.g. it was allocated while the pool was full)
        if (pool && pool->deallocate(ptr)) {
            return;
        }
        ::operator delete(ptr);
    }
};

MemoryPool *PoolAllocatedClass::pool = nullptr;
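
The pool must exist before the first allocation and be torn down at shutdown. A natural place for both, sketched below with placeholder function names, is the init and terminate callbacks in your extension's register_types.cpp:

// Sketch: hooking the pool into the extension lifecycle; the
// initialize/uninitialize names depend on your own register_types.cpp
void initialize_my_module(ModuleInitializationLevel p_level) {
    if (p_level != MODULE_INITIALIZATION_LEVEL_SCENE) {
        return;
    }
    ClassDB::register_class<PoolAllocatedClass>();
    PoolAllocatedClass::initialize_pool();
}

void uninitialize_my_module(ModuleInitializationLevel p_level) {
    if (p_level != MODULE_INITIALIZATION_LEVEL_SCENE) {
        return;
    }
    PoolAllocatedClass::cleanup_pool();
}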

Algorithmic Optimizations

Doing less work instead of faster work: The cheapest operations are the ones that never run. Spatial data structures cut the number of objects a query has to touch from linear to logarithmic.

Spatial Data Structures

#include <memory>  // std::unique_ptr for owned child nodes

class QuadTree : public RefCounted {
    GDCLASS(QuadTree, RefCounted)

private:
    struct QuadNode {
        Rect2 bounds;
        Vector<int> objects;  // Object IDs
        std::unique_ptr<QuadNode> children[4];
        bool is_leaf = true;

        static const int MAX_OBJECTS = 8;
        static const int MAX_DEPTH = 6;
    };

    std::unique_ptr<QuadNode> root;
    HashMap<int, Vector2> object_positions;
    int next_object_id = 0;

protected:
    static void _bind_methods() {
        ClassDB::bind_method(D_METHOD("insert", "position"),
                           &QuadTree::insert);
        ClassDB::bind_method(D_METHOD("query_range", "area"),
                           &QuadTree::query_range);
        ClassDB::bind_method(D_METHOD("find_nearest", "position", "max_distance"),
                           &QuadTree::find_nearest);
    }

public:
    QuadTree() {
        root = std::make_unique<QuadNode>();
        root->bounds = Rect2(-1000, -1000, 2000, 2000);
    }

    int insert(const Vector2 &position) {
        int id = next_object_id++;
        object_positions[id] = position;
        insert_recursive(root.get(), id, position, 0);
        return id;
    }

    PackedInt32Array query_range(const Rect2 &area) {
        Vector<int> results;
        query_recursive(root.get(), area, results);

        PackedInt32Array packed_results;
        packed_results.resize(results.size());
        for (int i = 0; i < results.size(); i++) {
            packed_results.set(i, results[i]);
        }
        return packed_results;
    }

    int find_nearest(const Vector2 &position, float max_distance) {
        int nearest_id = -1;
        float nearest_distance = max_distance * max_distance;

        find_nearest_recursive(root.get(), position, nearest_id, nearest_distance);
        return nearest_id;
    }

private:
    void insert_recursive(QuadNode *node, int object_id,
                         const Vector2 &position, int depth) {
        if (node->is_leaf) {
            node->objects.append(object_id);

            if (node->objects.size() > QuadNode::MAX_OBJECTS &&
                depth < QuadNode::MAX_DEPTH) {
                subdivide(node);
                redistribute_objects(node, depth);
            }
        } else {
            int quadrant = get_quadrant(node->bounds, position);
            insert_recursive(node->children[quadrant].get(),
                           object_id, position, depth + 1);
        }
    }

    void subdivide(QuadNode *node) {
        float half_width = node->bounds.size.width * 0.5f;
        float half_height = node->bounds.size.height * 0.5f;
        float x = node->bounds.position.x;
        float y = node->bounds.position.y;

        node->children[0] = std::make_unique<QuadNode>();
        node->children[0]->bounds = Rect2(x, y, half_width, half_height);

        node->children[1] = std::make_unique<QuadNode>();
        node->children[1]->bounds = Rect2(x + half_width, y, half_width, half_height);

        node->children[2] = std::make_unique<QuadNode>();
        node->children[2]->bounds = Rect2(x, y + half_height, half_width, half_height);

        node->children[3] = std::make_unique<QuadNode>();
        node->children[3]->bounds = Rect2(x + half_width, y + half_height, half_width, half_height);

        node->is_leaf = false;
    }

    void redistribute_objects(QuadNode *node, int depth) {
        Vector<int> objects_to_redistribute = node->objects;
        node->objects.clear();

        for (int obj_id : objects_to_redistribute) {
            Vector2 pos = object_positions[obj_id];
            int quadrant = get_quadrant(node->bounds, pos);
            insert_recursive(node->children[quadrant].get(), obj_id, pos, depth + 1);
        }
    }

    int get_quadrant(const Rect2 &bounds, const Vector2 &position) {
        float mid_x = bounds.position.x + bounds.size.width * 0.5f;
        float mid_y = bounds.position.y + bounds.size.height * 0.5f;

        if (position.x < mid_x) {
            return (position.y < mid_y) ? 0 : 2;
        } else {
            return (position.y < mid_y) ? 1 : 3;
        }
    }

    void query_recursive(QuadNode *node, const Rect2 &area, Vector<int> &results) {
        if (!node->bounds.intersects(area)) {
            return;
        }

        if (node->is_leaf) {
            for (int obj_id : node->objects) {
                Vector2 pos = object_positions[obj_id];
                if (area.has_point(pos)) {
                    results.append(obj_id);
                }
            }
        } else {
            for (int i = 0; i < 4; i++) {
                query_recursive(node->children[i].get(), area, results);
            }
        }
    }

    void find_nearest_recursive(QuadNode *node, const Vector2 &position,
                               int &nearest_id, float &nearest_distance_sq) {
        // Prune in squared space; no sqrt needed per visited node
        if (distance_sq_to_rect(position, node->bounds) > nearest_distance_sq) {
            return;  // This node can't contain anything closer
        }

        if (node->is_leaf) {
            for (int obj_id : node->objects) {
                Vector2 pos = object_positions[obj_id];
                float dist_sq = position.distance_squared_to(pos);
                if (dist_sq < nearest_distance_sq) {
                    nearest_distance_sq = dist_sq;
                    nearest_id = obj_id;
                }
            }
        } else {
            // Check children in order of distance to position
            int quadrant = get_quadrant(node->bounds, position);
            find_nearest_recursive(node->children[quadrant].get(),
                                 position, nearest_id, nearest_distance_sq);

            for (int i = 0; i < 4; i++) {
                if (i != quadrant) {
                    find_nearest_recursive(node->children[i].get(),
                                         position, nearest_id, nearest_distance_sq);
                }
            }
        }
    }

    float distance_sq_to_rect(const Vector2 &point, const Rect2 &rect) {
        float dx = MAX(0.0f, MAX(rect.position.x - point.x,
                                point.x - (rect.position.x + rect.size.width)));
        float dy = MAX(0.0f, MAX(rect.position.y - point.y,
                                point.y - (rect.position.y + rect.size.height)));
        return dx * dx + dy * dy;
    }
};
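
A quick usage sketch (the point counts and bounds are illustrative, using godot-cpp's UtilityFunctions for random values): with 10,000 points inserted, a range query descends only into the nodes overlapping the rectangle instead of testing every point.

// Sketch: populate the tree, then query a 100x100 region around the origin
Ref<QuadTree> tree;
tree.instantiate();
for (int i = 0; i < 10000; i++) {
    tree->insert(Vector2(UtilityFunctions::randf_range(-1000.0f, 1000.0f),
                         UtilityFunctions::randf_range(-1000.0f, 1000.0f)));
}
PackedInt32Array hits = tree->query_range(Rect2(-50, -50, 100, 100));
int nearest = tree->find_nearest(Vector2(0, 0), 100.0f);  // -1 if nothing in range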

Threading and Concurrency

Moving work off the main thread: Background threads can hide expensive computation, but shared state must either be locked or designed so locks are unnecessary.

Thread-Safe Data Structures

#include <array>
#include <atomic>
#include <functional>
#include <memory>
#include <mutex>
#include <thread>
#include <vector>

class ThreadSafeTaskQueue : public RefCounted {
    GDCLASS(ThreadSafeTaskQueue, RefCounted)

private:
    struct Task {
        std::function<void()> function;
        std::atomic<bool> completed{false};
    };

    std::vector<std::unique_ptr<Task>> tasks;
    std::mutex queue_mutex;
    std::atomic<int> next_task_index{0};
    std::atomic<int> completed_tasks{0};

protected:
    static void _bind_methods() {
        ClassDB::bind_method(D_METHOD("add_task", "callable"),
                           &ThreadSafeTaskQueue::add_task);
        ClassDB::bind_method(D_METHOD("process_tasks", "max_tasks"),
                           &ThreadSafeTaskQueue::process_tasks);
        ClassDB::bind_method(D_METHOD("get_completion_ratio"),
                           &ThreadSafeTaskQueue::get_completion_ratio);
    }

public:
    void add_task(const Callable &callable) {
        auto task = std::make_unique<Task>();
        // Mutable lambda so the captured Callable copy can be invoked
        // without a const_cast
        task->function = [callable]() mutable {
            callable.call();
        };

        std::lock_guard<std::mutex> lock(queue_mutex);
        tasks.push_back(std::move(task));
    }

    int process_tasks(int max_tasks) {
        int processed = 0;

        while (processed < max_tasks) {
            int task_index = next_task_index.fetch_add(1);

            Task *task = nullptr;
            {
                // Lock while touching the vector: a concurrent add_task
                // could reallocate it under us otherwise
                std::lock_guard<std::mutex> lock(queue_mutex);
                if (task_index >= static_cast<int>(tasks.size())) {
                    break;
                }
                task = tasks[task_index].get();
            }

            if (task && !task->completed.load()) {
                task->function();
                task->completed.store(true);
                completed_tasks.fetch_add(1);
                processed++;
            }
        }

        return processed;
    }

    float get_completion_ratio() {
        std::lock_guard<std::mutex> lock(queue_mutex);
        int total = static_cast<int>(tasks.size());
        if (total == 0) return 1.0f;
        return static_cast<float>(completed_tasks.load()) / total;
    }
};
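
A sketch of draining the queue from several std::threads; note that the queued Callables must only touch thread-safe state, since calling into the scene tree from a worker thread is not safe:

// Sketch: four workers claim task indices via fetch_add until exhausted
Ref<ThreadSafeTaskQueue> queue;
queue.instantiate();
// ... queue->add_task(...) calls happen here ...

std::vector<std::thread> workers;
for (int i = 0; i < 4; i++) {
    workers.emplace_back([queue]() {
        queue->process_tasks(1000000);  // Effectively "until empty"
    });
}
for (std::thread &worker : workers) {
    worker.join();
}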

Lock-Free Structures

template<typename T, size_t Size>
class LockFreeRingBuffer {
private:
    std::atomic<size_t> write_index{0};
    std::atomic<size_t> read_index{0};
    std::array<T, Size> buffer;

public:
    // Safe for exactly one producer thread and one consumer thread (SPSC)
    bool try_push(const T &item) {
        size_t current_write = write_index.load(std::memory_order_relaxed);
        size_t next_write = (current_write + 1) % Size;

        if (next_write == read_index.load(std::memory_order_acquire)) {
            return false;  // Buffer full
        }

        buffer[current_write] = item;
        write_index.store(next_write, std::memory_order_release);
        return true;
    }

    bool try_pop(T &item) {
        size_t current_read = read_index.load(std::memory_order_relaxed);

        if (current_read == write_index.load(std::memory_order_acquire)) {
            return false;  // Buffer empty
        }

        item = buffer[current_read];
        read_index.store((current_read + 1) % Size, std::memory_order_release);
        return true;
    }

    // Approximate while the other thread is active; exact when quiescent
    size_t size() const {
        size_t write = write_index.load(std::memory_order_acquire);
        size_t read = read_index.load(std::memory_order_acquire);
        return (write >= read) ? (write - read) : (Size - read + write);
    }
};

class HighPerformanceAudioProcessor : public AudioStreamPlayback {
    GDCLASS(HighPerformanceAudioProcessor, AudioStreamPlayback)

protected:
    static void _bind_methods() {}  // GDCLASS requires this even when nothing is exposed

private:
    LockFreeRingBuffer<float, 8192> audio_buffer;
    std::thread processing_thread;
    std::atomic<bool> running{false};

public:
    ~HighPerformanceAudioProcessor() {
        stop_processing();  // Never leave the worker thread running
    }

    void start_processing() {
        if (running.load()) {
            return;  // Re-assigning a joinable std::thread would call std::terminate()
        }
        running.store(true);
        processing_thread = std::thread([this]() {
            while (running.load()) {
                process_audio_chunk();
                std::this_thread::sleep_for(std::chrono::microseconds(100));
            }
        });
    }

    void stop_processing() {
        running.store(false);
        if (processing_thread.joinable()) {
            processing_thread.join();
        }
    }

private:
    void process_audio_chunk() {
        // Generate audio samples
        for (int i = 0; i < 64; i++) {
            float sample = generate_sample();
            if (!audio_buffer.try_push(sample)) {
                break;  // Buffer full, skip this chunk
            }
        }
    }

    float generate_sample() {
        // Simple sine source; static phase is shared across instances,
        // which is acceptable for this single-instance demo
        static float phase = 0.0f;
        phase += 0.01f;
        if (phase > 6.2831853f) {
            phase -= 6.2831853f;  // Wrap to avoid float precision loss over time
        }
        return Math::sin(phase);
    }
};
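
The consumer side runs on the audio thread. A sketch of the matching pop loop, assuming godot-cpp's _mix virtual for AudioStreamPlayback (the override would also need to be declared in the class above):

// Sketch: the mix callback pops whatever the producer managed to queue
int32_t HighPerformanceAudioProcessor::_mix(AudioFrame *p_buffer,
                                            float p_rate_scale, int32_t p_frames) {
    for (int32_t i = 0; i < p_frames; i++) {
        float sample = 0.0f;
        audio_buffer.try_pop(sample);  // Leaves silence if the producer fell behind
        p_buffer[i].left = sample;
        p_buffer[i].right = sample;
    }
    return p_frames;
}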

Profiling and Benchmarking

Measuring instead of guessing: Every optimization should be validated with real timings, because intuition about bottlenecks is frequently wrong.

Built-in Performance Monitoring

class PerformanceProfiler : public Node {
    GDCLASS(PerformanceProfiler, Node)

private:
    struct ProfileData {
        String name;
        uint64_t start_time;
        uint64_t total_time = 0;
        int call_count = 0;
        uint64_t min_time = UINT64_MAX;
        uint64_t max_time = 0;
    };

    HashMap<String, ProfileData> profiles;

protected:
    static void _bind_methods() {
        ClassDB::bind_method(D_METHOD("start_profile", "name"),
                           &PerformanceProfiler::start_profile);
        ClassDB::bind_method(D_METHOD("end_profile", "name"),
                           &PerformanceProfiler::end_profile);
        ClassDB::bind_method(D_METHOD("get_profile_report"),
                           &PerformanceProfiler::get_profile_report);
    }

public:
    void start_profile(const String &name) {
        ProfileData &data = profiles[name];
        data.name = name;
        // Godot 4 exposes tick counters on the Time singleton, not OS
        data.start_time = Time::get_singleton()->get_ticks_usec();
    }

    void end_profile(const String &name) {
        uint64_t end_time = Time::get_singleton()->get_ticks_usec();

        if (profiles.has(name)) {
            ProfileData &data = profiles[name];
            uint64_t duration = end_time - data.start_time;

            data.total_time += duration;
            data.call_count++;
            data.min_time = MIN(data.min_time, duration);
            data.max_time = MAX(data.max_time, duration);
        }
    }

    Dictionary get_profile_report() {
        Dictionary report;

        for (const KeyValue<String, ProfileData> &E : profiles) {
            const ProfileData &data = E.value;

            Dictionary profile_info;
            profile_info["call_count"] = data.call_count;
            profile_info["total_time_us"] = static_cast<int>(data.total_time);
            profile_info["average_time_us"] = data.call_count > 0 ?
                static_cast<int>(data.total_time / data.call_count) : 0;
            profile_info["min_time_us"] = static_cast<int>(data.min_time);
            profile_info["max_time_us"] = static_cast<int>(data.max_time);

            report[data.name] = profile_info;
        }

        return report;
    }
};

// RAII profiler helper
class ScopedProfiler {
private:
    PerformanceProfiler *profiler;
    String name;

public:
    ScopedProfiler(PerformanceProfiler *p_profiler, const String &p_name)
        : profiler(p_profiler), name(p_name) {
        if (profiler) {
            profiler->start_profile(name);
        }
    }

    ~ScopedProfiler() {
        if (profiler) {
            profiler->end_profile(name);
        }
    }
};

#define PROFILE_SCOPE(profiler, name) ScopedProfiler _prof(profiler, name)

// Usage example
void expensive_operation(PerformanceProfiler *profiler) {
    PROFILE_SCOPE(profiler, "expensive_operation");

    // Do expensive work
    for (int i = 0; i < 1000000; i++) {
        // Some computation
    }
}

Best Practices Summary

1. Minimize Cross-Boundary Calls: Batch engine updates, cache engine state locally, and pass packed arrays so one bound call does the work of thousands.

2. Optimize Memory Usage: Prefer structure-of-arrays layouts in hot loops and pool allocations for objects with high churn.

3. Choose Appropriate Algorithms: Spatial structures such as quadtrees replace linear scans with logarithmic queries.

4. Threading Considerations: Protect shared containers, reserve lock-free structures for real-time paths such as audio, and always join worker threads on shutdown.

5. Profile and Measure: Time the real hot paths before and after every change; keep only the optimizations that the numbers justify.

Conclusion

Performance optimization in GDExtension requires understanding both the engine’s architecture and low-level optimization techniques. The key is to minimize cross-boundary calls, use cache-friendly data structures, and choose appropriate algorithms for your specific use case. Always profile your code to ensure optimizations are actually improving performance.