diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ac0974f..44c962a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,9 +2,9 @@ name: 🧪 CI on: push: - branches: [ main, master ] + branches: [ main, master, dev ] pull_request: - branches: [ main, master ] + branches: [ main, master, dev ] jobs: build: @@ -12,7 +12,7 @@ jobs: strategy: matrix: - python-version: [ "3.11", "3.12" ] + python-version: [ "3.11", "3.12", "3.13" ] steps: - name: 🧰 Checkout repository diff --git a/.gitignore b/.gitignore index 17f8d64..85cd986 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,5 @@ __pycache__/ *$py.class /.pytest_cache +.ruff_cache + diff --git a/README.md b/README.md index 99f8218..2ab5c02 100644 --- a/README.md +++ b/README.md @@ -1,17 +1,38 @@ # TinyGPU 🐉⚡ -[![PyPI version](https://img.shields.io/badge/version-1.0.0-blue.svg)](https://pypi.org/project/tinygpu) +[![PyPI version](https://img.shields.io/badge/version-2.0.0-blue.svg)](https://pypi.org/project/tinygpu) [![Python 3.13](https://img.shields.io/badge/Python-3.13-blue.svg)](https://www.python.org/downloads/) [![License: MIT](https://img.shields.io/badge/license-MIT-green.svg)](LICENSE) [![CI](https://github.com/deaneeth/tinygpu/actions/workflows/ci.yml/badge.svg)](https://github.com/deaneeth/tinygpu/actions) +[![Code Style: Black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) +[![Tests](https://img.shields.io/github/actions/workflow/status/deaneeth/tinygpu/ci.yml?label=tests)](https://github.com/deaneeth/tinygpu/actions) TinyGPU is a **tiny educational GPU simulator** - inspired by [Tiny8](https://github.com/sql-hkr/tiny8), designed to demonstrate how GPUs execute code in parallel. It models a small **SIMT (Single Instruction, Multiple Threads)** system with per-thread registers, global memory, synchronization barriers, branching, and a minimal GPU-like instruction set. 
> 🎓 *Built for learning and visualization - see how threads, registers, and memory interact across cycles!* - + | Odd-Even Sort | Reduction | |---------------|------------| -| ![Odd-Even Sort](outputs/run_odd_even_sort/run_odd_even_sort_20251025-205516.gif) | ![Reduction](outputs/run_reduce_sum/run_reduce_sum_20251025-210237.gif) | +| ![Odd-Even Sort](src/outputs/run_odd_even_sort/run_odd_even_sort_20251026-212558.gif) | ![Reduction](src/outputs/run_reduce_sum/run_reduce_sum_20251026-212712.gif) | + +--- + +## 🚀 What's New in v2.0.0 + +- **Enhanced Instruction Set**: + - Added `SHLD` and `SHST` for robust shared memory operations. + - Improved `SYNC` semantics for better thread coordination. +- **Visualizer Improvements**: + - Export execution as GIFs with enhanced clarity. + - Added support for saving visuals directly from the simulator. +- **Refactored Core**: + - Simplified step semantics for better extensibility. + - Optimized performance for larger thread counts. +- **CI/CD Updates**: + - Integrated linting (`ruff`, `black`) and testing workflows. + - Automated builds and tests on GitHub Actions. +- **Documentation**: + - Expanded examples and added detailed usage instructions. --- @@ -51,10 +72,11 @@ TinyGPU was built as a **learning-first GPU simulator** - simple enough for begi > 🧭 TinyGPU aims to make GPU learning *intuitive, visual, and interactive* - from classroom demos to self-guided exploration. --- + ## ✨ Highlights - 🧩 **GPU-like instruction set:** - `SET`, `ADD`, `MUL`, `LD`, `ST`, `JMP`, `BNE`, `BEQ`, `SYNC`, `CSWAP`. + `SET`, `ADD`, `MUL`, `LD`, `ST`, `JMP`, `BNE`, `BEQ`, `SYNC`, `CSWAP`, `SHLD`, `SHST`. - 🧠 **Per-thread registers & PCs** - each thread executes the same kernel independently. - 🧱 **Shared global memory** for inter-thread operations. - 🔄 **Synchronization barriers** (`SYNC`) for parallel coordination. 
@@ -69,23 +91,31 @@ TinyGPU was built as a **learning-first GPU simulator** - simple enough for begi ## 🖼️ Example Visuals -> Located in `examples/` — you can generate these GIFs yourself. +> Located in `src/outputs/` — run the example scripts to generate these GIFs (they're saved under `src/outputs/<script_name>/`). -| Odd-Even Sort | Reduction | -|---------------|------------| -| ![Odd-Even Sort](outputs/run_odd_even_sort/run_odd_even_sort_20251025-205516.gif) | ![Reduction](outputs/run_reduce_sum/run_reduce_sum_20251025-210237.gif) | +| Example | Description | GIF Preview | +|---------|-------------|-------------| +| Vector Add | Parallel vector addition (A+B -> C) | ![Vector Add](src/outputs/run_vector_add/run_vector_add_20251026-212734.gif) | +| Block Shared Sum | Per-block shared memory sum example | ![Block Shared Sum](src/outputs/run_block_shared_sum/run_block_shared_sum_20251026-212542.gif) | +| Odd-Even Sort | GPU-style odd-even transposition sort | ![Odd-Even Sort](src/outputs/run_odd_even_sort/run_odd_even_sort_20251026-212558.gif) | +| Parallel Reduction | Sum reduction across an array | ![Reduction](src/outputs/run_reduce_sum/run_reduce_sum_20251026-212712.gif) | +| Sync Test | Synchronization / barrier demonstration | ![Sync Test](src/outputs/run_sync_test/run_sync_test_20251027-000818.gif) | +| Loop Test | Branching and loop behavior demo | ![Test Loop](src/outputs/run_test_loop/run_test_loop_20251026-212814.gif) | +| Compare Test | Comparison and branching example | ![Test CMP](src/outputs/run_test_cmp/run_test_cmp_20251026-212823.gif) | +| Kernel Args Test | Demonstrates passing kernel arguments | ![Kernel Args](src/outputs/run_test_kernel_args/run_test_kernel_args_20251026-212830.gif) | --- ## 🚀 Quickstart ### Clone and install + ```bash git clone https://github.com/deaneeth/tinygpu.git cd tinygpu pip install -e . 
pip install -r requirements-dev.txt -```` +``` ### Run an example @@ -93,7 +123,7 @@ pip install -r requirements-dev.txt python -m examples.run_odd_even_sort ``` -> Produces: `examples/odd_even_sort.gif` — a visual GPU-style sorting process. +> Produces: `src/outputs/run_odd_even_sort/run_odd_even_sort_*.gif` — a visual GPU-style sorting process. ### Other examples @@ -108,30 +138,50 @@ python -m examples.run_sync_test ## 🧩 Project Layout -``` -tinygpu/ +```text +. +├─ .github/ +│ └─ workflows/ +│ └─ ci.yml +├─ docs/ +│ └─ index.md ├─ examples/ -│ ├─ vector_add.tgpu +│ ├─ odd_even_sort_tmp.tgpu │ ├─ odd_even_sort.tgpu │ ├─ reduce_sum.tgpu -│ ├─ run_vector_add.py │ ├─ run_odd_even_sort.py │ ├─ run_reduce_sum.py +│ ├─ run_sync_test.py │ ├─ run_test_loop.py -│ └─ run_sync_test.py -│ +│ ├─ run_vector_add.py +│ ├─ sync_test.tgpu +│ ├─ test_loop.tgpu +│ └─ vector_add.tgpu +├─ src/outputs/ +│ ├─ run_block_shared_sum/ +│ ├─ run_odd_even_sort/ +│ ├─ run_reduce_sum/ +│ ├─ run_sync_test/ +│ ├─ run_test_cmp/ +│ ├─ run_test_kernel_args/ +│ ├─ run_test_loop/ +│ └─ run_vector_add/ ├─ src/ │ └─ tinygpu/ +│ ├─ __init__.py │ ├─ assembler.py │ ├─ gpu.py │ ├─ instructions.py -│ ├─ visualizer.py -│ └─ __init__.py -│ ├─ tests/ +│ ├─ test_assembler.py +│ ├─ test_gpu_core.py +│ ├─ test_gpu.py +│ └─ test_programs.py +├─ LICENSE ├─ pyproject.toml -├─ requirements-dev.txt -└─ README.md +├─ README.md +└─ requirements-dev.txt ``` --- @@ -156,6 +206,8 @@ TinyGPU uses a **minimal instruction set** designed for clarity and education - | `BNE Ra, Rb, target` | Branch if not equal. | Jump to `target` if `Ra != Rb`. | | `SYNC` | *(no operands)* | Synchronization barrier — all threads must reach this point before continuing. 
| | `CSWAP addrA, addrB` | Compare-and-swap memory values. | If `mem[addrA] > mem[addrB]`, swap them. Used for sorting. | +| `SHLD Rd, addr` | Load shared memory into register. | `Rd = shared_mem[addr]` | +| `SHST addr, Rs` | Store register into shared memory. | `shared_mem[addr] = Rs` | | `CMP Rd, Ra, Rb` *(optional)* | Compare and set flag or register. | Used internally for extended examples (e.g., prefix-scan). | | `NOP` *(optional)* | *(no operands)* | No operation; placeholder instruction. | @@ -267,7 +319,7 @@ MIT - see [LICENSE](LICENSE) ## 🌟 Credits & Inspiration -❀️ Built by [Deaneeth](https://github.com/deaneeth) +❀️ Built by [Deaneeth](https://github.com/deaneeth) > Inspired by the educational design of [Tiny8 CPU Simulator](https://github.com/sql-hkr/tiny8). diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000..7708c40 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,123 @@ +# TinyGPU 🐉⚡ — v2.0.0 + +[![Release v2.0.0](https://img.shields.io/badge/release-v2.0.0-blue.svg)](https://github.com/deaneeth/tinygpu/releases/tag/v2.0.0) +[![Python 3.13](https://img.shields.io/badge/Python-3.13-blue.svg)](https://www.python.org/downloads/) +[![License: MIT](https://img.shields.io/badge/license-MIT-green.svg)](../LICENSE) +[![CI](https://github.com/deaneeth/tinygpu/actions/workflows/ci.yml/badge.svg)](https://github.com/deaneeth/tinygpu/actions) +[![Tests](https://img.shields.io/github/actions/workflow/status/deaneeth/tinygpu/ci.yml?label=tests)](https://github.com/deaneeth/tinygpu/actions) +[![Code Style: Black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) + +TinyGPU is a **tiny educational GPU simulator** — a minimal SIMT-style simulator with: + +- Per-thread registers & program counters +- Shared global memory and per-block shared memory +- A small GPU-style ISA and assembler +- Visualizer and GIF export for educational animations + +> 🎓 *Built for learning and visualization - see how 
threads, registers, and memory interact across cycles!* + +--- + +## 🚀 What's New in v2.0.0 + +- **Enhanced Instruction Set**: + - Added `SHLD` and `SHST` for robust shared memory operations. + - Improved `SYNC` semantics for better thread coordination. +- **Visualizer Improvements**: + - Export execution as GIFs with enhanced clarity. + - Added support for saving visuals directly from the simulator. +- **Refactored Core**: + - Simplified step semantics for better extensibility. + - Optimized performance for larger thread counts. +- **CI/CD Updates**: + - Integrated linting (`ruff`, `black`) and testing workflows. + - Automated builds and tests on GitHub Actions. +- **Documentation**: + - Expanded examples and added detailed usage instructions. + +--- + +## Quick Screenshots / Demos + +### Odd–Even Transposition Sort + +![Odd-Even Sort](../src/outputs/run_odd_even_sort/run_odd_even_sort_20251026-212558.gif) + +### Parallel Reduction (Sum) + +![Reduce Sum](../src/outputs/run_reduce_sum/run_reduce_sum_20251026-212712.gif) + +--- + +## Getting Started + +Clone and install (editable): + +```bash +git clone https://github.com/deaneeth/tinygpu.git +cd tinygpu +pip install -e . +pip install -r requirements-dev.txt +``` + +Run a demo (odd-even sort): + +```bash +python -m examples.run_odd_even_sort +``` + +> Produces: `src/outputs/run_odd_even_sort/run_odd_even_sort_*.gif` — a visual GPU-style sorting process. 
+ +--- + +## Examples & Runners + +- `examples/run_vector_add.py` — simple parallel vector add +- `examples/run_vector_add_kernel.py` — vector add with kernel arguments +- `examples/run_test_loop.py` — branch/loop test (sum 1..4) +- `examples/run_test_cmp.py` — comparison and branching test +- `examples/run_test_kernel_args.py` — kernel arguments test +- `examples/run_odd_even_sort.py` — odd-even transposition sort (GIF) +- `examples/run_reduce_sum.py` — parallel reduction (GIF) +- `examples/run_block_shared_sum.py` — per-block shared memory example +- `examples/run_sync_test.py` — synchronization test +- `examples/debug_repl.py` — interactive REPL debugger + +--- + +## Instruction Set (Quick Reference) + +| **Instruction** | **Operands** | **Description** | +|-----------------------------|------------------------------------------|-----------------| +| `SET Rd, imm` | `Rd` = destination register, `imm` = immediate value | Set register `Rd` to an immediate constant. | +| `ADD Rd, Ra, Rb` | `Rd` = destination, `Ra` + `Rb` | Add two registers and store result in `Rd`. | +| `ADD Rd, Ra, imm` | `Rd` = destination, `Ra` + immediate | Add register and immediate value. | +| `MUL Rd, Ra, Rb` | Multiply two registers. | `Rd = Ra * Rb` | +| `MUL Rd, Ra, imm` | Multiply register by immediate. | `Rd = Ra * imm` | +| `LD Rd, addr` | Load from memory address into register. | `Rd = mem[addr]` | +| `LD Rd, Rk` | Load from address in register `Rk`. | `Rd = mem[Rk]` | +| `ST addr, Rs` | Store register into memory address. | `mem[addr] = Rs` | +| `ST Rk, Rs` | Store value from `Rs` into memory at address in register `Rk`. | `mem[Rk] = Rs` | +| `SHLD Rd, saddr` | Load from shared memory into register. | `Rd = shared_mem[saddr]` | +| `SHST saddr, Rs` | Store register into shared memory. | `shared_mem[saddr] = Rs` | +| `CSWAP addrA, addrB` | Compare-and-swap memory values. | If `mem[addrA] > mem[addrB]`, swap them. Used for sorting. 
| +| `CMP Ra, Rb` | Compare and set flags. | Set Z/N/G flags based on `Ra - Rb`. | +| `BRGT target` | Branch if greater. | Jump to `target` if G flag set. | +| `BRLT target` | Branch if less. | Jump to `target` if N flag set. | +| `BRZ target` | Branch if zero. | Jump to `target` if Z flag set. | +| `JMP target` | Label or immediate. | Unconditional jump — sets PC to `target`. | +| `SYNC` | *(no operands)* | Global synchronization barrier — all threads must reach this point. | +| `SYNCB` | *(no operands)* | Block-level synchronization barrier. | + +--- + +## Publishing & Contributing + +- See `.github/workflows/ci.yml` for CI and packaging +- To propose changes, open a PR. For bug reports, open an issue. + +--- + +## License + +MIT — See [LICENSE](../LICENSE). diff --git a/examples/block_shared_sum.tgpu b/examples/block_shared_sum.tgpu new file mode 100644 index 0000000..0e71640 --- /dev/null +++ b/examples/block_shared_sum.tgpu @@ -0,0 +1,37 @@ +; block_shared_sum.tgpu +; R5 = block_id, R6 = thread_in_block, R7 = tid +; R0 -> temp +; R1 -> base (global base index for each block is block_id * block_stride) +; We'll assume runner sets up base_addr per block in memory (or use a simple scheme) + +; Each thread loads its input and stores it into shared[thread_in_block] +; Then threads synchronize at block barrier and thread 0 sums the shared +; values and writes the block sum to memory at address (100 + block_id). 
+ +; Load own value from memory[tid] (R7 contains tid) +LD R3, R7 ; R3 = memory[tid] +SHST R6, R3 ; shared[thread_in_block] = R3 +SYNCB ; wait for block + +; Only thread with thread_in_block == 0 performs the reduction +CMP R6, 0 +BRGT not_zero ; if R6 > 0 jump to not_zero (i.e., only R6==0 continues) + +SET R4, 0 ; R4 = sum +SET R2, 0 ; R2 = loop index +sum_loop: + SHLD R0, R2 ; R0 = shared[R2] + ADD R4, R4, R0 ; R4 += R0 + ADD R2, R2, 1 + CMP R2, 4 ; compare with TPB (4) + BRLT sum_loop + +; write sum to memory at 100 + block_id (R5 holds block_id) +SET R1, 100 +ADD R1, R1, R5 +ST R1, R4 + +JMP done_block +not_zero: +done_block: +; end diff --git a/examples/debug_repl.py b/examples/debug_repl.py new file mode 100644 index 0000000..c0b5939 --- /dev/null +++ b/examples/debug_repl.py @@ -0,0 +1,57 @@ +import os +import sys + +# make local 'src' package available so imports resolve when running this script +src_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "src")) +sys.path.insert(0, src_path) + +from tinygpu.assembler import assemble_file # noqa: E402 +from tinygpu.gpu import TinyGPU # noqa: E402 +from tinygpu.visualizer import visualize # noqa: E402 + +# config - change program path +prog_path = os.path.join(os.path.dirname(__file__), "test_loop.tgpu") +program, labels = assemble_file(prog_path) + +gpu = TinyGPU(num_threads=4, num_registers=8, mem_size=64) +gpu.load_program(program, labels) + +print("TinyGPU debug REPL") +print("Commands: s(step), n (step k), p(print snapshot), v(visualize),") +print("r (rewind k), q(quit)") + +while True: + cmd = input("dbg> ").strip().split() + if not cmd: + continue + c = cmd[0] + if c in ("q", "quit"): + break + if c in ("s", "step"): + gpu.step_single() + snap = gpu.snapshot(mem_slice=(0, 16), regs_threads=[0, 1]) + print(f"cycle {snap['cycle']} pc: {snap['pc']}") + print("R0..R1 for threads 0,1:") + for tid in [0, 1]: + print(f" T{tid} regs:", snap["registers"][tid][:4]) + elif c in ("n", "stepk"): + k 
= int(cmd[1]) if len(cmd) > 1 else 1 + for _ in range(k): + gpu.step_single() + print("advanced", k, "cycles") + elif c in ("p", "print"): + snap = gpu.snapshot(mem_slice=(0, 32)) + print("PC:", snap["pc"]) + print("Flags:", snap["flags"]) + print("Mem[0..32]:", snap["memory_slice"]) + elif c in ("v", "viz", "visualize"): + visualize(gpu, show_pc=True) + elif c in ("r", "rewind"): + k = int(cmd[1]) if len(cmd) > 1 else 1 + try: + gpu.rewind(k) + print("rewound", k, "cycles") + except Exception as e: + print("rewind error:", e) + else: + print("unknown command") diff --git a/examples/run_block_shared_sum.py b/examples/run_block_shared_sum.py new file mode 100644 index 0000000..59464ea --- /dev/null +++ b/examples/run_block_shared_sum.py @@ -0,0 +1,56 @@ +import os +import sys +import numpy as np +from tinygpu.visualizer import save_animation +import time + +# ensure src/ is on sys.path so examples can import the package +src_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "src")) +sys.path.insert(0, src_path) + +from tinygpu.gpu import TinyGPU # noqa: E402 +from tinygpu.assembler import assemble_file # noqa: E402 + +ARRAY_LEN = 8 # total elements (must equal num_blocks * tpb) +NUM_BLOCKS = 2 +TPB = 4 # threads per block +SHARED_SIZE = TPB +MEM_SIZE = 256 + +prog_path = os.path.join(os.path.dirname(__file__), "block_shared_sum.tgpu") +program, labels = assemble_file(prog_path) + +# create gpu with total threads +gpu = TinyGPU(num_threads=NUM_BLOCKS * TPB, num_registers=12, mem_size=MEM_SIZE) +gpu.set_grid(NUM_BLOCKS, TPB, shared_size=SHARED_SIZE) + +# prepare input values per thread in global memory at index tid +arr = np.arange(1, ARRAY_LEN + 1) # [1,2,3,...] 
+print("Input values per tid:", arr.tolist()) +for tid in range(ARRAY_LEN): + gpu.memory[tid] = int(arr[tid]) + +gpu.load_program(program, labels) +gpu.run(max_cycles=200) + +# Save animation GIF to src/outputs// +try: + script_name = os.path.splitext(os.path.basename(__file__))[0] + output_dir = os.path.join( + os.path.dirname(__file__), "..", "src", "outputs", script_name + ) + os.makedirs(output_dir, exist_ok=True) + timestamp = time.strftime("%Y%m%d-%H%M%S") + out_gif = os.path.join(output_dir, f"{script_name}_{timestamp}.gif") + save_animation(gpu, out_path=out_gif, fps=10, max_frames=200, dpi=100) + print("Saved GIF:", os.path.abspath(out_gif)) +except Exception as e: + print("Could not save GIF:", e) + +# read back block results at mem[100 + block_id] (as used in kernel) +results = [int(gpu.memory[100 + b]) for b in range(NUM_BLOCKS)] +print("Block sums (expected):", results) +print( + "Expected manual sums:", + [int(sum(arr[b * TPB : (b + 1) * TPB])) for b in range(NUM_BLOCKS)], +) diff --git a/examples/run_odd_even_sort.py b/examples/run_odd_even_sort.py index 2b21ca9..bd9b379 100644 --- a/examples/run_odd_even_sort.py +++ b/examples/run_odd_even_sort.py @@ -1,9 +1,15 @@ import os +import sys import time import numpy as np -from tinygpu.gpu import TinyGPU -from tinygpu.assembler import assemble_file -from src.tinygpu.visualizer import visualize, save_animation + +# make local 'src' package available so imports resolve when running this script +src_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "src")) +sys.path.insert(0, src_path) + +from tinygpu.gpu import TinyGPU # noqa: E402 +from tinygpu.assembler import assemble_file # noqa: E402 +from tinygpu.visualizer import visualize, save_animation # noqa: E402 # configuration ARRAY_LEN = 16 # must be even for odd-even transposition; adjust as needed @@ -48,7 +54,9 @@ # produce gif (limit frames to 200 to avoid huge files) script_name = os.path.splitext(os.path.basename(__file__))[0] # e.g., 
run_reduce_sum -output_dir = os.path.join(os.path.dirname(__file__), "..", "outputs", script_name) +output_dir = os.path.join( + os.path.dirname(__file__), "..", "src", "outputs", script_name +) os.makedirs(output_dir, exist_ok=True) # create timestamped output path diff --git a/examples/run_reduce_sum.py b/examples/run_reduce_sum.py index c788c49..3576326 100644 --- a/examples/run_reduce_sum.py +++ b/examples/run_reduce_sum.py @@ -41,7 +41,9 @@ # Save gif script_name = os.path.splitext(os.path.basename(__file__))[0] # e.g., run_reduce_sum -output_dir = os.path.join(os.path.dirname(__file__), "..", "outputs", script_name) +output_dir = os.path.join( + os.path.dirname(__file__), "..", "src", "outputs", script_name +) os.makedirs(output_dir, exist_ok=True) # create timestamped output path diff --git a/examples/run_sync_test.py b/examples/run_sync_test.py index 6df0d40..651404d 100644 --- a/examples/run_sync_test.py +++ b/examples/run_sync_test.py @@ -1,7 +1,9 @@ import os +import time from tinygpu.gpu import TinyGPU from tinygpu.assembler import assemble_file from tinygpu.visualizer import visualize +from tinygpu.visualizer import save_animation # Path to sync test program example_path = os.path.join(os.path.dirname(__file__), "sync_test.tgpu") @@ -24,3 +26,17 @@ # Visualize execution visualize(gpu) + +# Save animation GIF to src/outputs// +try: + script_name = os.path.splitext(os.path.basename(__file__))[0] + output_dir = os.path.join( + os.path.dirname(__file__), "..", "src", "outputs", script_name + ) + os.makedirs(output_dir, exist_ok=True) + timestamp = time.strftime("%Y%m%d-%H%M%S") + out_gif = os.path.join(output_dir, f"{script_name}_{timestamp}.gif") + save_animation(gpu, out_path=out_gif, fps=10, max_frames=200, dpi=100) + print("Saved GIF:", os.path.abspath(out_gif)) +except Exception as e: + print("Could not save GIF:", e) diff --git a/examples/run_test_cmp.py b/examples/run_test_cmp.py new file mode 100644 index 0000000..ef11491 --- /dev/null +++ 
b/examples/run_test_cmp.py @@ -0,0 +1,35 @@ +# examples/run_test_cmp.py +import os +import sys +import time +from tinygpu.visualizer import save_animation + +# make local 'src' package available so imports resolve when running this script +src_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "src")) +sys.path.insert(0, src_path) + +from tinygpu.gpu import TinyGPU # noqa: E402 +from tinygpu.assembler import assemble_file # noqa: E402 + +example_path = os.path.join(os.path.dirname(__file__), "test_cmp.tgpu") +program, labels = assemble_file(example_path) + +gpu = TinyGPU(num_threads=8, num_registers=8, mem_size=64) +gpu.load_program(program, labels) +gpu.run(max_cycles=20) + +print("R0 per thread:", gpu.registers[:, 0]) +print("Flags per thread:", gpu.flags) + +try: + script_name = os.path.splitext(os.path.basename(__file__))[0] + output_dir = os.path.join( + os.path.dirname(__file__), "..", "src", "outputs", script_name + ) + os.makedirs(output_dir, exist_ok=True) + timestamp = time.strftime("%Y%m%d-%H%M%S") + out_gif = os.path.join(output_dir, f"{script_name}_{timestamp}.gif") + save_animation(gpu, out_path=out_gif, fps=10, max_frames=200, dpi=100) + print("Saved GIF:", os.path.abspath(out_gif)) +except Exception as e: + print("Could not save GIF:", e) diff --git a/examples/run_test_kernel_args.py b/examples/run_test_kernel_args.py new file mode 100644 index 0000000..3ebb0ee --- /dev/null +++ b/examples/run_test_kernel_args.py @@ -0,0 +1,25 @@ +import os +import time +from tinygpu.assembler import assemble_file +from tinygpu.gpu import TinyGPU +from tinygpu.visualizer import save_animation + +program, labels = assemble_file("examples/test_kernel_args.tgpu") +gpu = TinyGPU(num_threads=8, num_registers=8, mem_size=64) +gpu.load_kernel(program, labels=labels, grid=(1, 8), args=[10, 5]) +gpu.run_kernel(max_cycles=10) +print("mem[0..8]:", gpu.memory[:8].tolist()) # expect [10+5+0, 10+5+1, ...] 
+print("R0 per thread:", gpu.registers[:, 0].tolist()) # expect [10, 10, ...] +print("R1 per thread:", gpu.registers[:, 1].tolist()) # expect [5, 5, ...] +try: + script_name = os.path.splitext(os.path.basename(__file__))[0] + output_dir = os.path.join( + os.path.dirname(__file__), "..", "src", "outputs", script_name + ) + os.makedirs(output_dir, exist_ok=True) + timestamp = time.strftime("%Y%m%d-%H%M%S") + out_gif = os.path.join(output_dir, f"{script_name}_{timestamp}.gif") + save_animation(gpu, out_path=out_gif, fps=10, max_frames=200, dpi=100) + print("Saved GIF:", os.path.abspath(out_gif)) +except Exception as e: + print("Could not save GIF:", e) diff --git a/examples/run_test_loop.py b/examples/run_test_loop.py index b26ff2f..5f273e0 100644 --- a/examples/run_test_loop.py +++ b/examples/run_test_loop.py @@ -1,7 +1,9 @@ import os +import time from tinygpu.gpu import TinyGPU from tinygpu.assembler import assemble_file from tinygpu.visualizer import visualize +from tinygpu.visualizer import save_animation # Path to loop program example_path = os.path.join(os.path.dirname(__file__), "test_loop.tgpu") @@ -24,3 +26,17 @@ # Visualize execution visualize(gpu) + +# Save animation GIF to src/outputs// +try: + script_name = os.path.splitext(os.path.basename(__file__))[0] + output_dir = os.path.join( + os.path.dirname(__file__), "..", "src", "outputs", script_name + ) + os.makedirs(output_dir, exist_ok=True) + timestamp = time.strftime("%Y%m%d-%H%M%S") + out_gif = os.path.join(output_dir, f"{script_name}_{timestamp}.gif") + save_animation(gpu, out_path=out_gif, fps=10, max_frames=200, dpi=100) + print("Saved GIF:", os.path.abspath(out_gif)) +except Exception as e: + print("Could not save GIF:", e) diff --git a/examples/run_vector_add.py b/examples/run_vector_add.py index 9475fa4..0bf052a 100644 --- a/examples/run_vector_add.py +++ b/examples/run_vector_add.py @@ -1,4 +1,5 @@ import sys +import time import os sys.path.append(os.path.join(os.path.dirname(__file__), "..", 
"src")) @@ -6,6 +7,7 @@ from tinygpu.gpu import TinyGPU from tinygpu.assembler import assemble_file from tinygpu.visualizer import visualize +from tinygpu.visualizer import save_animation # Path to program example_path = os.path.join(os.path.dirname(__file__), "vector_add.tgpu") @@ -48,3 +50,17 @@ # Visualize visualize(gpu) + +# Save animation GIF to src/outputs// +try: + script_name = os.path.splitext(os.path.basename(__file__))[0] + output_dir = os.path.join( + os.path.dirname(__file__), "..", "src", "outputs", script_name + ) + os.makedirs(output_dir, exist_ok=True) + timestamp = time.strftime("%Y%m%d-%H%M%S") + out_gif = os.path.join(output_dir, f"{script_name}_{timestamp}.gif") + save_animation(gpu, out_path=out_gif, fps=10, max_frames=200, dpi=100) + print("Saved GIF:", os.path.abspath(out_gif)) +except Exception as e: + print("Could not save GIF:", e) diff --git a/examples/run_vector_add_kernel.py b/examples/run_vector_add_kernel.py new file mode 100644 index 0000000..8568b79 --- /dev/null +++ b/examples/run_vector_add_kernel.py @@ -0,0 +1,55 @@ +import os +import sys + +# make local 'src' package available so imports resolve when running this script +src_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "src")) +sys.path.insert(0, src_path) + +from tinygpu.assembler import assemble_file # noqa: E402 +from tinygpu.gpu import TinyGPU # noqa: E402 +from tinygpu.visualizer import visualize, save_animation # noqa: E402 + +# config +ARRAY_LEN = 8 +NUM_BLOCKS = 1 +TPB = ARRAY_LEN # one block all threads +MEM_SIZE = 64 +MAX_CYCLES = 50 + +prog_path = os.path.join(os.path.dirname(__file__), "vector_add.tgpu") +program, labels = assemble_file(prog_path) + +# create gpu with total threads = NUM_BLOCKS * TPB +gpu = TinyGPU(num_threads=NUM_BLOCKS * TPB, num_registers=12, mem_size=MEM_SIZE) + +# init memory: A at 0..7, B at 8..15 +for i in range(ARRAY_LEN): + gpu.memory[i] = i + gpu.memory[8 + i] = i * 2 + +# launch kernel: grid = (blocks, 
threads_per_block) +gpu.load_kernel( + program, labels=labels, grid=(NUM_BLOCKS, TPB), args=None, shared_size=0 +) + +# run +gpu.run_kernel(max_cycles=MAX_CYCLES) + +# inspect results (C at 16..) +print("Result C:", gpu.memory[16 : 16 + ARRAY_LEN].tolist()) +visualize(gpu, show_pc=True) +# Save animation GIF to src/outputs// +try: + script_name = os.path.splitext(os.path.basename(__file__))[0] + output_dir = os.path.join( + os.path.dirname(__file__), "..", "src", "outputs", script_name + ) + os.makedirs(output_dir, exist_ok=True) + import time + + timestamp = time.strftime("%Y%m%d-%H%M%S") + out_gif = os.path.join(output_dir, f"{script_name}_{timestamp}.gif") + save_animation(gpu, out_path=out_gif, fps=12, max_frames=120, dpi=100) + print("Saved GIF:", os.path.abspath(out_gif)) +except Exception as e: + print("Could not save GIF:", e) diff --git a/examples/test_cmp.tgpu b/examples/test_cmp.tgpu new file mode 100644 index 0000000..9b53c02 --- /dev/null +++ b/examples/test_cmp.tgpu @@ -0,0 +1,18 @@ +; test_cmp.tgpu +; Thread-local compare test -- each thread compares R7 (tid) with an immediate and sets R0 + +SET R0, 0 ; default result +SET R1, 5 ; threshold 5 +CMP R7, R1 ; compare tid vs 5 +BRGT greater +BRZ equal +; tid < 5 +SET R0, 1 +JMP done +equal: +SET R0, 2 +JMP done +greater: +SET R0, 3 +done: +; end diff --git a/examples/test_kernel_args.tgpu b/examples/test_kernel_args.tgpu new file mode 100644 index 0000000..9164081 --- /dev/null +++ b/examples/test_kernel_args.tgpu @@ -0,0 +1,6 @@ +; test_kernel_args.tgpu +; R0 = arg0, R1 = arg1 +; Each thread writes R0 + R1 + tid into mem[tid] +ADD R2, R0, R1 +ADD R2, R2, R7 +ST R7, R2 diff --git a/outputs/run_reduce_sum/run_reduce_sum_20251025-210237.gif b/outputs/run_reduce_sum/run_reduce_sum_20251025-210237.gif deleted file mode 100644 index c68c627..0000000 Binary files a/outputs/run_reduce_sum/run_reduce_sum_20251025-210237.gif and /dev/null differ diff --git a/pyproject.toml b/pyproject.toml index 68196e4..cafacc7 
100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "tinygpu" -version = "1.0.0" +version = "2.0.0" description = "A tiny educational GPU simulator for learning SIMT architecture, branching, and synchronization." readme = "README.md" requires-python = ">=3.11" diff --git a/src/outputs/run_block_shared_sum/run_block_shared_sum_20251026-212542.gif b/src/outputs/run_block_shared_sum/run_block_shared_sum_20251026-212542.gif new file mode 100644 index 0000000..aeacdc6 Binary files /dev/null and b/src/outputs/run_block_shared_sum/run_block_shared_sum_20251026-212542.gif differ diff --git a/outputs/run_odd_even_sort/run_odd_even_sort_20251025-205516.gif b/src/outputs/run_odd_even_sort/run_odd_even_sort_20251026-212558.gif similarity index 93% rename from outputs/run_odd_even_sort/run_odd_even_sort_20251025-205516.gif rename to src/outputs/run_odd_even_sort/run_odd_even_sort_20251026-212558.gif index 33940a2..2b8509f 100644 Binary files a/outputs/run_odd_even_sort/run_odd_even_sort_20251025-205516.gif and b/src/outputs/run_odd_even_sort/run_odd_even_sort_20251026-212558.gif differ diff --git a/src/outputs/run_reduce_sum/run_reduce_sum_20251026-212712.gif b/src/outputs/run_reduce_sum/run_reduce_sum_20251026-212712.gif new file mode 100644 index 0000000..48a8e27 Binary files /dev/null and b/src/outputs/run_reduce_sum/run_reduce_sum_20251026-212712.gif differ diff --git a/src/outputs/run_sync_test/run_sync_test_20251027-000818.gif b/src/outputs/run_sync_test/run_sync_test_20251027-000818.gif new file mode 100644 index 0000000..6420733 Binary files /dev/null and b/src/outputs/run_sync_test/run_sync_test_20251027-000818.gif differ diff --git a/src/outputs/run_test_cmp/run_test_cmp_20251026-212823.gif b/src/outputs/run_test_cmp/run_test_cmp_20251026-212823.gif new file mode 100644 index 0000000..9dadcfc Binary files /dev/null and b/src/outputs/run_test_cmp/run_test_cmp_20251026-212823.gif differ diff --git 
a/src/outputs/run_test_kernel_args/run_test_kernel_args_20251026-212830.gif b/src/outputs/run_test_kernel_args/run_test_kernel_args_20251026-212830.gif new file mode 100644 index 0000000..7fe333e Binary files /dev/null and b/src/outputs/run_test_kernel_args/run_test_kernel_args_20251026-212830.gif differ diff --git a/src/outputs/run_test_loop/run_test_loop_20251026-212814.gif b/src/outputs/run_test_loop/run_test_loop_20251026-212814.gif new file mode 100644 index 0000000..026b174 Binary files /dev/null and b/src/outputs/run_test_loop/run_test_loop_20251026-212814.gif differ diff --git a/src/outputs/run_vector_add/run_vector_add_20251026-212734.gif b/src/outputs/run_vector_add/run_vector_add_20251026-212734.gif new file mode 100644 index 0000000..75e2905 Binary files /dev/null and b/src/outputs/run_vector_add/run_vector_add_20251026-212734.gif differ diff --git a/src/tinygpu/gpu.py b/src/tinygpu/gpu.py index 7a76a96..af94678 100644 --- a/src/tinygpu/gpu.py +++ b/src/tinygpu/gpu.py @@ -1,139 +1,328 @@ +# src/tinygpu/gpu.py import numpy as np from .instructions import INSTRUCTIONS -# TinyGPU core class definition class TinyGPU: def __init__(self, num_threads=8, num_registers=8, mem_size=256): + # core sizes self.num_threads = num_threads self.num_registers = num_registers self.mem_size = mem_size - # registers[tid, reg] + # registers and memory self.registers = np.zeros((num_threads, num_registers), dtype=np.int32) - - # shared memory self.memory = np.zeros(mem_size, dtype=np.int32) - # per-thread program counter + # per-thread PC, active mask self.pc = np.zeros(num_threads, dtype=np.int32) - - # per-thread "active" flag (unused lanes when needed) self.active = np.ones(num_threads, dtype=bool) - # barrier / sync state: per-thread whether it's waiting at a sync point + # flags from earlier enhancement (int8 bitmask) + self.flags = np.zeros(num_threads, dtype=np.int8) + + # global sync waiting (already used for SYNC) self.sync_waiting = np.zeros(num_threads, dtype=bool) 
- # histories for visualization - self.history_registers = [] # list of arrays shape=(num_threads, num_registers) - self.history_memory = [] # list of arrays shape=(mem_size,) - self.history_pc = [] # list of pc array shape=(num_threads,) + # block-level sync waiting (SYNCB) + self.sync_waiting_block = np.zeros(num_threads, dtype=bool) + + # grid / shared memory defaults (1 block covering all threads) + self.num_blocks = 1 + self.threads_per_block = num_threads + self.shared_size = 0 + self.shared = np.zeros( + (1, 0), dtype=np.int32 + ) # shape (num_blocks, shared_size) + + # history for visualization + self.history_registers = [] + self.history_memory = [] + self.history_pc = [] + self.history_flags = [] + self.history_shared = [] # for debugging/visualization of shared memory self.program = [] self.labels = {} - # initialize thread ids in a fixed register (R7 by convention) + # initialize thread id in R7 and block/thread info in R5/R6 if possible for tid in range(self.num_threads): - # make sure register index exists if self.num_registers > 7: - self.registers[tid, 7] = tid + self.registers[tid, 7] = tid # global thread id else: - # if too few registers, set R0 as thread id (unlikely), but warn self.registers[tid, 0] = tid + def set_grid(self, num_blocks: int, threads_per_block: int, shared_size: int = 0): + """ + Configure grid parameters and allocate shared memory. + Must call before running (or call before load_program/run). 
+ """ + self.num_blocks = int(num_blocks) + self.threads_per_block = int(threads_per_block) + self.shared_size = int(shared_size) + + total_threads = self.num_blocks * self.threads_per_block + if total_threads != self.num_threads: + # resize register and pc arrays to match requested total threads + old_regs = self.registers.copy() + old_num_threads = self.num_threads + self.num_threads = total_threads + self.registers = np.zeros( + (self.num_threads, self.num_registers), dtype=np.int32 + ) + # copy what fits + min_threads = min(old_num_threads, self.num_threads) + self.registers[:min_threads, : old_regs.shape[1]] = old_regs[ + :min_threads, : + ] + + self.pc = np.zeros(self.num_threads, dtype=np.int32) + self.active = np.ones(self.num_threads, dtype=bool) + self.flags = np.zeros(self.num_threads, dtype=np.int8) + self.sync_waiting = np.zeros(self.num_threads, dtype=bool) + self.sync_waiting_block = np.zeros(self.num_threads, dtype=bool) + + # allocate shared memory + self.shared = np.zeros((self.num_blocks, self.shared_size), dtype=np.int32) + + # initialize block_id (R5) and thread_in_block (R6) registers + # for each thread if available + for tid in range(self.num_threads): + block_id = tid // self.threads_per_block + thread_in_block = tid % self.threads_per_block + if self.num_registers > 5: + self.registers[tid, 5] = block_id + if self.num_registers > 6: + self.registers[tid, 6] = thread_in_block + # keep R7 as global tid (already set in __init__) + def load_program(self, program, labels=None): self.program = program self.labels = labels or {} self.pc[:] = 0 self.sync_waiting[:] = False + self.sync_waiting_block[:] = False self.active[:] = True self.history_registers = [] self.history_memory = [] self.history_pc = [] + self.history_flags = [] + self.history_shared = [] def step(self): """ - Execute one cycle: each active thread executes the instruction at its PC. - Instructions that modify PC are expected to set self.pc[tid] inside the - instruction. 
If an instruction doesn't change PC, we increment it by 1 - automatically. SYNC instructions should set sync_waiting[tid] = True and then - the core will release them when all threads have reached the same sync point - (or simple condition). + Execute one cycle: each active thread executes one instruction at its PC. + Interactions: + - SYNC (global) uses sync_waiting + - SYNCB (block) uses sync_waiting_block (released per-block) """ + # execute per-thread instruction for this cycle self._execute_threads() + + # handle synchronization barriers (global and per-block) self._handle_global_barrier() - self._record_state() + self._handle_block_barriers() + + # record history snapshot + self._record_history() def _execute_threads(self): + """Run instructions for each active thread for this cycle. + + This will execute consecutive non-control instructions for a thread + within the same cycle until either the thread sets a waiting flag + (SYNC/SYNCB), an instruction changes the PC explicitly (branch/jump), + or the program runs out of instructions. This preserves the previous + behavior where simple sequences (e.g., LOAD; ADD) execute in one + cycle. 
+ """ for tid in range(self.num_threads): if not self.active[tid]: continue - if self.pc[tid] < 0 or self.pc[tid] >= len(self.program): - self.active[tid] = False - continue - instr, args = self.program[self.pc[tid]] - func = INSTRUCTIONS.get(instr) - before_pc = int(self.pc[tid]) - if func: - func(self, tid, *args) - if int(self.pc[tid]) == before_pc and not self.sync_waiting[tid]: - self.pc[tid] = before_pc + 1 - def _handle_global_barrier(self): - if self.sync_waiting.any(): - active_waiting = self.sync_waiting[self.active] - if active_waiting.size > 0 and active_waiting.all(): - for tid in range(self.num_threads): - if self.active[tid] and self.sync_waiting[tid]: - self.pc[tid] = int(self.pc[tid]) + 1 - self.sync_waiting[tid] = False + # repeatedly execute instructions for this thread until a + # synchronization point or an instruction that changes PC occurs + while True: + if self.pc[tid] < 0 or self.pc[tid] >= len(self.program): + self.active[tid] = False + break - def _record_state(self): - self.history_registers.append(self.registers.copy()) - self.history_memory.append(self.memory.copy()) - self.history_pc.append(self.pc.copy()) - # Loop threads and execute their instruction if active and in-range - for tid in range(self.num_threads): - if not self.active[tid]: - continue - if self.pc[tid] < 0 or self.pc[tid] >= len(self.program): - # thread finished - self.active[tid] = False - continue + instr, args = self.program[self.pc[tid]] + func = INSTRUCTIONS.get(instr) + before_pc = int(self.pc[tid]) - instr, args = self.program[self.pc[tid]] - func = INSTRUCTIONS.get(instr) - before_pc = int(self.pc[tid]) + if func: + func(self, tid, *args) - if func: - # execute instruction - func(self, tid, *args) + # if instruction changed PC or thread is waiting, stop + if ( + int(self.pc[tid]) != before_pc + or self.sync_waiting[tid] + or self.sync_waiting_block[tid] + ): + break - # If instruction didn't change PC (still same before_pc), increment - if int(self.pc[tid]) == 
before_pc and not self.sync_waiting[tid]: - # increment to next instruction - self.pc[tid] = before_pc + 1 + # otherwise advance to next instruction and loop to execute it + self.pc[tid] = before_pc + 1 - # handle global barrier: - # if any thread is waiting at a sync point, check if we can release + def _handle_global_barrier(self): + """Release all threads waiting at the global barrier when appropriate.""" if self.sync_waiting.any(): - # crude policy: release when all active threads have sync_waiting True - # only consider threads that are still active active_waiting = self.sync_waiting[self.active] if active_waiting.size > 0 and active_waiting.all(): - # move all waiting threads forward by 1 and clear waiting flags for tid in range(self.num_threads): if self.active[tid] and self.sync_waiting[tid]: self.pc[tid] = int(self.pc[tid]) + 1 self.sync_waiting[tid] = False - # record state + def _handle_block_barriers(self): + """Check each block and release threads waiting at per-block barriers.""" + if not self.sync_waiting_block.any(): + return + + for b in range(self.num_blocks): + start = b * self.threads_per_block + end = start + self.threads_per_block + block_active_mask = self.active[start:end] + if not block_active_mask.any(): + continue + block_waiting = self.sync_waiting_block[start:end][block_active_mask] + if block_waiting.size > 0 and block_waiting.all(): + for tid in range(start, end): + if self.active[tid] and self.sync_waiting_block[tid]: + self.pc[tid] = int(self.pc[tid]) + 1 + self.sync_waiting_block[tid] = False + + def _record_history(self): self.history_registers.append(self.registers.copy()) self.history_memory.append(self.memory.copy()) self.history_pc.append(self.pc.copy()) + self.history_flags.append(self.flags.copy()) + self.history_shared.append(self.shared.copy()) def run(self, max_cycles=1000): for _cycle in range(max_cycles): if not self.active.any(): break self.step() + + # --- Step debugger helpers --- + + def step_single(self): + """ + 
Execute exactly one cycle and record state (alias to step()). + Useful for interactive stepping. + """ + self.step() + + def snapshot(self, mem_slice=None, regs_threads=None): + """Return a human-friendly snapshot of current state. + + - mem_slice: (start, end) to extract part of global memory (tuple) + or None for full memory. + - regs_threads: list of thread indices to show registers for, or None + for all. + Returns a dict. + """ + if mem_slice: + start, end = mem_slice + mem_view = self.memory[start:end].tolist() + else: + mem_view = self.memory.tolist() + + if regs_threads is None: + regs_view = { + tid: self.registers[tid, :].tolist() for tid in range(self.num_threads) + } + else: + regs_view = {tid: self.registers[tid, :].tolist() for tid in regs_threads} + + return { + "cycle": len(self.history_pc), + "pc": self.pc.tolist(), + "active": self.active.tolist(), + "flags": self.flags.tolist(), + "registers": regs_view, + "memory_slice": mem_view, + "shared": self.shared.copy().tolist() if hasattr(self, "shared") else None, + } + + def rewind(self, cycles=1): + """ + Rewind simulation by 'cycles' steps using stored history. + Note: this only restores state from history arrays, and discards newer history. 
+ """ + if cycles <= 0: + return + + if cycles > len(self.history_registers): + raise ValueError("Not enough history to rewind that many cycles.") + + # target index after rewind + target = len(self.history_registers) - cycles + # restore last snapshot at index target-1 if target>0 else initial + if target == 0: + # reset to initial empty state + self.registers[:] = 0 + self.memory[:] = 0 + self.pc[:] = 0 + self.flags[:] = 0 + if hasattr(self, "shared"): + self.shared[:] = 0 + self.history_registers = [] + self.history_memory = [] + self.history_pc = [] + self.history_flags = [] + self.history_shared = [] + else: + self.registers[:] = self.history_registers[target - 1].copy() + self.memory[:] = self.history_memory[target - 1].copy() + self.pc[:] = self.history_pc[target - 1].copy() + self.flags[:] = self.history_flags[target - 1].copy() + if hasattr(self, "shared") and len(self.history_shared) >= target: + self.shared[:] = self.history_shared[target - 1].copy() + # trim history + self.history_registers = self.history_registers[:target] + self.history_memory = self.history_memory[:target] + self.history_pc = self.history_pc[:target] + self.history_flags = self.history_flags[:target] + self.history_shared = self.history_shared[:target] + + def load_kernel( + self, program, labels=None, grid=(1, None), args=None, shared_size=0 + ): + """ + Load a kernel program and configure grid/thread mapping. + + - program, labels: assembled program (list, dict) (same as load_program) + - grid: (num_blocks, threads_per_block). threads_per_block None -> keep current + - args: list of scalar kernel arguments. These will be written into + registers R0..Rk for ALL threads. 
+ - shared_size: allocate per-block shared memory size (optional) + """ + num_blocks, tpb = grid + if tpb is None: + tpb = ( + self.threads_per_block + if hasattr(self, "threads_per_block") + else (self.num_threads // num_blocks) + ) + # configure grid (this may resize internal thread arrays if total differs) + self.set_grid(int(num_blocks), int(tpb), shared_size=int(shared_size)) + + # set kernel args into registers R0..Rk for every thread (if provided) + if args: + for tid in range(self.num_threads): + for i, val in enumerate(args): + # write into register i (R0, R1, ...) + if i < self.num_registers: + self.registers[tid, i] = int(val) + + # finally load program and reset pcs/history + self.load_program(program, labels) + + def run_kernel(self, max_cycles=1000): + """ + Convenience wrapper: run until completion or max_cycles. + """ + self.run(max_cycles=max_cycles) diff --git a/src/tinygpu/instructions.py b/src/tinygpu/instructions.py index 93fe90a..efa4327 100644 --- a/src/tinygpu/instructions.py +++ b/src/tinygpu/instructions.py @@ -98,6 +98,111 @@ def op_cswap(gpu, tid, addr_a_operand, addr_b_operand): gpu.memory[a], gpu.memory[b] = vb, va +# Flags helper: set bitmask in gpu.flags[tid] +def _set_flags_from_result(gpu, tid, diff): + """ + diff = a - b + sets flags bitmask: + bit0 (1): Z (zero) if diff == 0 + bit1 (2): N (negative) if diff < 0 + bit2 (4): G (greater) if diff > 0 + """ + z = 1 if diff == 0 else 0 + n = 1 if diff < 0 else 0 + g = 1 if diff > 0 else 0 + gpu.flags[tid] = (z << 0) | (n << 1) | (g << 2) + + +# CMP Ra, Rb +def op_cmp(gpu, tid, op1, op2): + v1 = _resolve(gpu, tid, op1) + v2 = _resolve(gpu, tid, op2) + diff = int(v1) - int(v2) + _set_flags_from_result(gpu, tid, diff) + + +# BRGT target -> branch if greater (G bit set) +def op_brgt(gpu, tid, target): + if (gpu.flags[tid] & 0b100) != 0: + gpu.pc[tid] = int(_resolve(gpu, tid, target)) + else: + gpu.pc[tid] = int(gpu.pc[tid]) # leave unchanged (core increments) + + +# BRLT target -> branch 
if less (N bit set) +def op_brlt(gpu, tid, target): + if (gpu.flags[tid] & 0b010) != 0: + gpu.pc[tid] = int(_resolve(gpu, tid, target)) + else: + gpu.pc[tid] = int(gpu.pc[tid]) + + +# BRZ target -> branch if equal (Z bit set) +def op_brz(gpu, tid, target): + if (gpu.flags[tid] & 0b001) != 0: + gpu.pc[tid] = int(_resolve(gpu, tid, target)) + else: + gpu.pc[tid] = int(gpu.pc[tid]) + + +def op_shld(gpu, tid, rd_operand, saddr_operand): + """ + SHLD Rd, saddr -> Rd = shared[block_id][saddr] + saddr can be immediate or register operand. + """ + if not (isinstance(rd_operand, tuple) and rd_operand[0] == "R"): + raise TypeError("SHLD target must be a register") + rd = rd_operand[1] + # resolve shared address index + sidx = int(_resolve(gpu, tid, saddr_operand)) + # bounds check + if sidx < 0 or sidx >= gpu.shared_size: + gpu.registers[tid, rd] = 0 + return + # determine block id from tid and current threads_per_block (more robust + # than relying on register R5 which user code can overwrite) + if getattr(gpu, "threads_per_block", 0) > 0: + block_id = int(tid // gpu.threads_per_block) + else: + block_id = int(gpu.registers[tid, 5]) if gpu.num_registers > 5 else 0 + + # bounds-check block_id against allocated shared memory + if block_id < 0 or block_id >= getattr(gpu, "num_blocks", 1): + gpu.registers[tid, rd] = 0 + return + + gpu.registers[tid, rd] = int(gpu.shared[block_id, sidx]) + + +def op_shst(gpu, tid, saddr_operand, rs_operand): + """ + SHST saddr, Rs -> shared[block_id][saddr] = Rs + """ + sidx = int(_resolve(gpu, tid, saddr_operand)) + if sidx < 0 or sidx >= gpu.shared_size: + return + # determine block id from tid and current threads_per_block to avoid + # relying on register R5 which user code may overwrite + if getattr(gpu, "threads_per_block", 0) > 0: + block_id = int(tid // gpu.threads_per_block) + else: + block_id = int(gpu.registers[tid, 5]) if gpu.num_registers > 5 else 0 + val = int(_resolve(gpu, tid, rs_operand)) + # bounds-check block_id before writing 
+ if block_id < 0 or block_id >= getattr(gpu, "num_blocks", 1): + return + gpu.shared[block_id, sidx] = val + + +def op_syncb(gpu, tid): + """ + Block barrier: mark this thread as waiting at block-level barrier. + The core's step() will release the whole block when every active + thread in that block waits. + """ + gpu.sync_waiting_block[tid] = True + + # Instruction set mapping INSTRUCTIONS = { "SET": op_set, @@ -110,4 +215,11 @@ def op_cswap(gpu, tid, addr_a_operand, addr_b_operand): "BNE": op_bne, "SYNC": op_sync, "CSWAP": op_cswap, + "CMP": op_cmp, + "BRGT": op_brgt, + "BRLT": op_brlt, + "BRZ": op_brz, + "SHLD": op_shld, + "SHST": op_shst, + "SYNCB": op_syncb, }