From 73dd947c6f801e6f540de84501c7cc94a0d26de7 Mon Sep 17 00:00:00 2001 From: lcy-seso Date: Sun, 31 May 2026 21:25:33 +0800 Subject: [PATCH] v0.1.4 release. Co-authored-by: Guojun Chen Co-authored-by: Yuxiao Guo Co-authored-by: Yuqing Xia Co-authored-by: Jilong Xue Co-authored-by: Lingxiao Ma Co-authored-by: Liu Heng <18821707235@163.com> Co-authored-by: Zheng QiHang --- .gitignore | 3 - Dockerfile | 148 ++- README.md | 179 +-- assets/glm5-mtp.png | Bin 240244 -> 0 bytes assets/glm5-without-mtp.png | Bin 249777 -> 0 bytes assets/glm5_tilert_mtp.png | Bin 0 -> 108303 bytes pyproject.toml | 27 +- python/__init__.py | 62 - python/__version__.py | 20 - python/benchmark/long_prompt.py | 46 - python/generate.py | 192 --- python/models/common.py | 361 ------ python/models/deepseek_config.py | 66 - python/models/deepseek_v3_2/modules/mla.py | 107 -- python/models/deepseek_v3_2/ops/__init__.py | 109 -- .../models/deepseek_v3_2/ops/expert_select.py | 49 - python/models/deepseek_v3_2/ops/head_proj.py | 22 - python/models/deepseek_v3_2/ops/projo_wkvb.py | 283 ----- .../deepseek_v3_2/ops/rmsnorm_proj_top1.py | 29 - .../deepseek_v3_2/ops/rmsnorm_projq_wqib.py | 689 ----------- .../deepseek_v3_2/ops/rmsnorm_projx_wqkvia.py | 1095 ----------------- .../deepseek_v3_2/ops/top1_allreduce.py | 25 - python/models/deepseek_v3_2/ops/top_p.py | 68 - .../models/deepseek_v3_2/ops/up_gate_silu.py | 24 - python/models/deepseek_v3_2/refs/kernel.py | 354 ------ python/models/glm_5/params.py | 1 - python/profiler/__init__.py | 1 - python/profiler/utils.py | 477 ------- requirements.txt | 16 +- tilert/__init__.py | 91 ++ {python => tilert}/benchmark/__init__.py | 29 +- {python => tilert}/benchmark/coding_prompt.py | 36 +- tilert/benchmark/config.py | 69 ++ tilert/benchmark/long_prompt.py | 82 ++ {python => tilert}/benchmark/short_prompt.py | 50 +- tilert/generate.py | 299 +++++ {python => tilert}/models/__init__.py | 0 {python => tilert}/models/base.py | 66 +- tilert/models/common.py | 133 ++ 
tilert/models/deepseek_config.py | 24 + .../models/deepseek_v3_2/__init__.py | 0 .../models/deepseek_v3_2/generator.py | 127 +- .../models/deepseek_v3_2/model_args.py | 11 +- .../models/deepseek_v3_2/modules/__init__.py | 0 .../models/deepseek_v3_2/modules/dsa.py | 87 +- .../models/deepseek_v3_2/modules/end2end.py | 257 +++- tilert/models/deepseek_v3_2/modules/mla_v2.py | 248 ++++ .../models/deepseek_v3_2/modules/mlp.py | 43 +- .../models/deepseek_v3_2/modules/moe.py | 53 +- .../models/deepseek_v3_2/modules/mtp.py | 19 +- .../deepseek_v3_2/modules/mtp_preprocess.py | 8 +- tilert/models/deepseek_v3_2/ops/__init__.py | 160 +++ .../ops/broadcast_selected_token_ids.py | 36 + .../deepseek_v3_2/ops/down_allreduce.py | 93 +- .../deepseek_v3_2/ops/eh_proj_allreduce.py | 295 +++++ .../ops/expert_down_allreduce.py | 500 ++++++++ .../ops/expert_sel_up_gate_silu.py | 713 +++++++++++ .../deepseek_v3_2/ops/flash_sparse_mla.py | 74 +- .../ops/layernorm_rope_rotate.py | 32 +- .../deepseek_v3_2/ops/padded_allreduce_add.py | 147 +++ tilert/models/deepseek_v3_2/ops/projo_wkvb.py | 483 ++++++++ .../models/deepseek_v3_2/ops/projq_wqb.py | 233 +++- .../models/deepseek_v3_2/ops/projx_wis.py | 80 +- .../models/deepseek_v3_2/ops/projx_wqaki.py | 247 ++++ .../models/deepseek_v3_2/ops/projx_wqkva.py | 329 +++++ .../models/deepseek_v3_2/ops/qkv_rope.py | 60 +- .../ops/receive_selected_token_ids.py | 35 + .../deepseek_v3_2/ops/rmsnorm_expert_proj.py | 19 +- .../deepseek_v3_2/ops/rmsnorm_head_proj.py | 296 +++++ .../models/deepseek_v3_2/ops/rmsnorm_kv.py | 34 +- .../deepseek_v3_2/ops/rmsnorm_projq_wqb.py | 540 ++++++++ .../deepseek_v3_2/ops/rmsnorm_projq_wqi.py | 340 +++++ .../deepseek_v3_2/ops/rmsnorm_projx_wqakis.py | 341 +++++ .../deepseek_v3_2/ops/rmsnorm_projx_wqkva.py | 516 ++++++++ .../models/deepseek_v3_2/ops/rmsnorm_quant.py | 49 +- .../deepseek_v3_2/ops/rmsnorm_up_gate_silu.py | 41 +- tilert/models/deepseek_v3_2/ops/rotate.py | 226 ++++ .../models/deepseek_v3_2/ops/sparse_index.py | 
22 +- tilert/models/deepseek_v3_2/ops/topk.py | 171 +++ .../deepseek_v3_2/ops/unproj_o_allreduce.py | 526 ++++++++ .../models/deepseek_v3_2/refs/__init__.py | 4 + tilert/models/deepseek_v3_2/refs/kernel.py | 306 +++++ .../models/deepseek_v3_2/temp_var_indices.py | 28 +- {python => tilert}/models/glm_5/__init__.py | 0 tilert/models/glm_5/_dsa_v32/__init__.py | 1 + tilert/models/glm_5/_dsa_v32/generator.py | 531 ++++++++ tilert/models/glm_5/_dsa_v32/model_args.py | 95 ++ .../models/glm_5/_dsa_v32/modules/__init__.py | 11 + tilert/models/glm_5/_dsa_v32/modules/dsa.py | 229 ++++ .../models/glm_5/_dsa_v32/modules/end2end.py | 703 +++++++++++ .../models/glm_5/_dsa_v32/modules/mla_v2.py | 248 ++++ tilert/models/glm_5/_dsa_v32/modules/mlp.py | 74 ++ tilert/models/glm_5/_dsa_v32/modules/moe.py | 80 ++ tilert/models/glm_5/_dsa_v32/modules/mtp.py | 62 + .../glm_5/_dsa_v32/modules/mtp_preprocess.py | 238 ++++ tilert/models/glm_5/_dsa_v32/ops/__init__.py | 160 +++ .../ops/broadcast_selected_token_ids.py | 36 + .../glm_5/_dsa_v32/ops/down_allreduce.py | 343 ++++++ .../glm_5/_dsa_v32}/ops/eh_proj_allreduce.py | 52 +- .../_dsa_v32}/ops/expert_down_allreduce.py | 157 +-- .../_dsa_v32}/ops/expert_sel_up_gate_silu.py | 171 ++- .../glm_5/_dsa_v32/ops/flash_sparse_mla.py | 261 ++++ .../_dsa_v32/ops/layernorm_rope_rotate.py | 243 ++++ .../_dsa_v32/ops/padded_allreduce_add.py | 147 +++ .../models/glm_5/_dsa_v32/ops/projo_wkvb.py | 483 ++++++++ tilert/models/glm_5/_dsa_v32/ops/projq_wqb.py | 466 +++++++ tilert/models/glm_5/_dsa_v32/ops/projx_wis.py | 211 ++++ .../models/glm_5/_dsa_v32/ops/projx_wqaki.py | 247 ++++ .../models/glm_5/_dsa_v32/ops/projx_wqkva.py | 330 +++++ tilert/models/glm_5/_dsa_v32/ops/qkv_rope.py | 192 +++ .../ops/receive_selected_token_ids.py | 35 + .../glm_5/_dsa_v32/ops/rmsnorm_expert_proj.py | 172 +++ .../glm_5/_dsa_v32}/ops/rmsnorm_head_proj.py | 69 +- .../models/glm_5/_dsa_v32/ops/rmsnorm_kv.py | 204 +++ .../glm_5/_dsa_v32/ops/rmsnorm_projq_wqb.py | 530 ++++++++ 
.../glm_5/_dsa_v32/ops/rmsnorm_projq_wqi.py | 330 +++++ .../_dsa_v32/ops/rmsnorm_projx_wqakis.py | 341 +++++ .../glm_5/_dsa_v32/ops/rmsnorm_projx_wqkva.py | 516 ++++++++ .../glm_5/_dsa_v32/ops/rmsnorm_quant.py | 64 + .../_dsa_v32/ops/rmsnorm_up_gate_silu.py | 363 ++++++ .../models/glm_5/_dsa_v32}/ops/rotate.py | 67 +- .../models/glm_5/_dsa_v32/ops/sparse_index.py | 135 ++ .../models/glm_5/_dsa_v32}/ops/topk.py | 43 +- .../glm_5/_dsa_v32}/ops/unproj_o_allreduce.py | 302 +++-- .../models/glm_5/_dsa_v32/temp_var_indices.py | 118 ++ {python => tilert}/models/glm_5/generator.py | 137 +-- {python => tilert}/models/glm_5/model_args.py | 11 +- .../models/preprocess/weight_converter.py | 88 +- {python => tilert}/models/utils.py | 43 +- {python => tilert}/tilert_init.py | 0 {python => tilert}/utils.py | 32 +- 131 files changed, 17308 insertions(+), 5224 deletions(-) delete mode 100644 assets/glm5-mtp.png delete mode 100644 assets/glm5-without-mtp.png create mode 100644 assets/glm5_tilert_mtp.png delete mode 100644 python/__init__.py delete mode 100644 python/__version__.py delete mode 100644 python/benchmark/long_prompt.py delete mode 100644 python/generate.py delete mode 100644 python/models/common.py delete mode 100644 python/models/deepseek_config.py delete mode 100644 python/models/deepseek_v3_2/modules/mla.py delete mode 100644 python/models/deepseek_v3_2/ops/__init__.py delete mode 100644 python/models/deepseek_v3_2/ops/expert_select.py delete mode 100644 python/models/deepseek_v3_2/ops/head_proj.py delete mode 100644 python/models/deepseek_v3_2/ops/projo_wkvb.py delete mode 100644 python/models/deepseek_v3_2/ops/rmsnorm_proj_top1.py delete mode 100644 python/models/deepseek_v3_2/ops/rmsnorm_projq_wqib.py delete mode 100644 python/models/deepseek_v3_2/ops/rmsnorm_projx_wqkvia.py delete mode 100644 python/models/deepseek_v3_2/ops/top1_allreduce.py delete mode 100644 python/models/deepseek_v3_2/ops/top_p.py delete mode 100644 
python/models/deepseek_v3_2/ops/up_gate_silu.py delete mode 100644 python/models/deepseek_v3_2/refs/kernel.py delete mode 100644 python/models/glm_5/params.py delete mode 100644 python/profiler/__init__.py delete mode 100644 python/profiler/utils.py create mode 100644 tilert/__init__.py rename {python => tilert}/benchmark/__init__.py (83%) rename {python => tilert}/benchmark/coding_prompt.py (54%) create mode 100644 tilert/benchmark/config.py create mode 100644 tilert/benchmark/long_prompt.py rename {python => tilert}/benchmark/short_prompt.py (66%) create mode 100644 tilert/generate.py rename {python => tilert}/models/__init__.py (100%) rename {python => tilert}/models/base.py (83%) create mode 100644 tilert/models/common.py create mode 100644 tilert/models/deepseek_config.py rename {python => tilert}/models/deepseek_v3_2/__init__.py (100%) rename {python => tilert}/models/deepseek_v3_2/generator.py (81%) rename {python => tilert}/models/deepseek_v3_2/model_args.py (92%) rename {python => tilert}/models/deepseek_v3_2/modules/__init__.py (100%) rename {python => tilert}/models/deepseek_v3_2/modules/dsa.py (70%) rename {python => tilert}/models/deepseek_v3_2/modules/end2end.py (68%) create mode 100644 tilert/models/deepseek_v3_2/modules/mla_v2.py rename {python => tilert}/models/deepseek_v3_2/modules/mlp.py (51%) rename {python => tilert}/models/deepseek_v3_2/modules/moe.py (53%) rename {python => tilert}/models/deepseek_v3_2/modules/mtp.py (75%) rename {python => tilert}/models/deepseek_v3_2/modules/mtp_preprocess.py (95%) create mode 100644 tilert/models/deepseek_v3_2/ops/__init__.py create mode 100644 tilert/models/deepseek_v3_2/ops/broadcast_selected_token_ids.py rename {python => tilert}/models/deepseek_v3_2/ops/down_allreduce.py (82%) create mode 100644 tilert/models/deepseek_v3_2/ops/eh_proj_allreduce.py create mode 100644 tilert/models/deepseek_v3_2/ops/expert_down_allreduce.py create mode 100644 tilert/models/deepseek_v3_2/ops/expert_sel_up_gate_silu.py 
rename {python => tilert}/models/deepseek_v3_2/ops/flash_sparse_mla.py (88%) rename {python => tilert}/models/deepseek_v3_2/ops/layernorm_rope_rotate.py (92%) create mode 100644 tilert/models/deepseek_v3_2/ops/padded_allreduce_add.py create mode 100644 tilert/models/deepseek_v3_2/ops/projo_wkvb.py rename {python => tilert}/models/deepseek_v3_2/ops/projq_wqb.py (51%) rename {python => tilert}/models/deepseek_v3_2/ops/projx_wis.py (62%) create mode 100644 tilert/models/deepseek_v3_2/ops/projx_wqaki.py create mode 100644 tilert/models/deepseek_v3_2/ops/projx_wqkva.py rename {python => tilert}/models/deepseek_v3_2/ops/qkv_rope.py (77%) create mode 100644 tilert/models/deepseek_v3_2/ops/receive_selected_token_ids.py rename {python => tilert}/models/deepseek_v3_2/ops/rmsnorm_expert_proj.py (94%) create mode 100644 tilert/models/deepseek_v3_2/ops/rmsnorm_head_proj.py rename {python => tilert}/models/deepseek_v3_2/ops/rmsnorm_kv.py (88%) create mode 100644 tilert/models/deepseek_v3_2/ops/rmsnorm_projq_wqb.py create mode 100644 tilert/models/deepseek_v3_2/ops/rmsnorm_projq_wqi.py create mode 100644 tilert/models/deepseek_v3_2/ops/rmsnorm_projx_wqakis.py create mode 100644 tilert/models/deepseek_v3_2/ops/rmsnorm_projx_wqkva.py rename {python => tilert}/models/deepseek_v3_2/ops/rmsnorm_quant.py (56%) rename {python => tilert}/models/deepseek_v3_2/ops/rmsnorm_up_gate_silu.py (93%) create mode 100644 tilert/models/deepseek_v3_2/ops/rotate.py rename {python => tilert}/models/deepseek_v3_2/ops/sparse_index.py (87%) create mode 100644 tilert/models/deepseek_v3_2/ops/topk.py create mode 100644 tilert/models/deepseek_v3_2/ops/unproj_o_allreduce.py rename {python => tilert}/models/deepseek_v3_2/refs/__init__.py (61%) create mode 100644 tilert/models/deepseek_v3_2/refs/kernel.py rename {python => tilert}/models/deepseek_v3_2/temp_var_indices.py (72%) rename {python => tilert}/models/glm_5/__init__.py (100%) create mode 100644 tilert/models/glm_5/_dsa_v32/__init__.py create mode 100644 
tilert/models/glm_5/_dsa_v32/generator.py create mode 100644 tilert/models/glm_5/_dsa_v32/model_args.py create mode 100644 tilert/models/glm_5/_dsa_v32/modules/__init__.py create mode 100644 tilert/models/glm_5/_dsa_v32/modules/dsa.py create mode 100644 tilert/models/glm_5/_dsa_v32/modules/end2end.py create mode 100644 tilert/models/glm_5/_dsa_v32/modules/mla_v2.py create mode 100644 tilert/models/glm_5/_dsa_v32/modules/mlp.py create mode 100644 tilert/models/glm_5/_dsa_v32/modules/moe.py create mode 100644 tilert/models/glm_5/_dsa_v32/modules/mtp.py create mode 100644 tilert/models/glm_5/_dsa_v32/modules/mtp_preprocess.py create mode 100644 tilert/models/glm_5/_dsa_v32/ops/__init__.py create mode 100644 tilert/models/glm_5/_dsa_v32/ops/broadcast_selected_token_ids.py create mode 100644 tilert/models/glm_5/_dsa_v32/ops/down_allreduce.py rename {python/models/deepseek_v3_2 => tilert/models/glm_5/_dsa_v32}/ops/eh_proj_allreduce.py (87%) rename {python/models/deepseek_v3_2 => tilert/models/glm_5/_dsa_v32}/ops/expert_down_allreduce.py (78%) rename {python/models/deepseek_v3_2 => tilert/models/glm_5/_dsa_v32}/ops/expert_sel_up_gate_silu.py (86%) create mode 100644 tilert/models/glm_5/_dsa_v32/ops/flash_sparse_mla.py create mode 100644 tilert/models/glm_5/_dsa_v32/ops/layernorm_rope_rotate.py create mode 100644 tilert/models/glm_5/_dsa_v32/ops/padded_allreduce_add.py create mode 100644 tilert/models/glm_5/_dsa_v32/ops/projo_wkvb.py create mode 100644 tilert/models/glm_5/_dsa_v32/ops/projq_wqb.py create mode 100644 tilert/models/glm_5/_dsa_v32/ops/projx_wis.py create mode 100644 tilert/models/glm_5/_dsa_v32/ops/projx_wqaki.py create mode 100644 tilert/models/glm_5/_dsa_v32/ops/projx_wqkva.py create mode 100644 tilert/models/glm_5/_dsa_v32/ops/qkv_rope.py create mode 100644 tilert/models/glm_5/_dsa_v32/ops/receive_selected_token_ids.py create mode 100644 tilert/models/glm_5/_dsa_v32/ops/rmsnorm_expert_proj.py rename {python/models/deepseek_v3_2 => 
tilert/models/glm_5/_dsa_v32}/ops/rmsnorm_head_proj.py (84%) create mode 100644 tilert/models/glm_5/_dsa_v32/ops/rmsnorm_kv.py create mode 100644 tilert/models/glm_5/_dsa_v32/ops/rmsnorm_projq_wqb.py create mode 100644 tilert/models/glm_5/_dsa_v32/ops/rmsnorm_projq_wqi.py create mode 100644 tilert/models/glm_5/_dsa_v32/ops/rmsnorm_projx_wqakis.py create mode 100644 tilert/models/glm_5/_dsa_v32/ops/rmsnorm_projx_wqkva.py create mode 100644 tilert/models/glm_5/_dsa_v32/ops/rmsnorm_quant.py create mode 100644 tilert/models/glm_5/_dsa_v32/ops/rmsnorm_up_gate_silu.py rename {python/models/deepseek_v3_2 => tilert/models/glm_5/_dsa_v32}/ops/rotate.py (79%) create mode 100644 tilert/models/glm_5/_dsa_v32/ops/sparse_index.py rename {python/models/deepseek_v3_2 => tilert/models/glm_5/_dsa_v32}/ops/topk.py (75%) rename {python/models/deepseek_v3_2 => tilert/models/glm_5/_dsa_v32}/ops/unproj_o_allreduce.py (56%) create mode 100644 tilert/models/glm_5/_dsa_v32/temp_var_indices.py rename {python => tilert}/models/glm_5/generator.py (79%) rename {python => tilert}/models/glm_5/model_args.py (92%) rename {python => tilert}/models/preprocess/weight_converter.py (90%) rename {python => tilert}/models/utils.py (76%) rename {python => tilert}/tilert_init.py (100%) rename {python => tilert}/utils.py (62%) diff --git a/.gitignore b/.gitignore index 364b379..6baf500 100644 --- a/.gitignore +++ b/.gitignore @@ -38,9 +38,6 @@ MANIFEST *.so .cache/ -# Development mode soft link -tilert - !src/lib/ !include/lib/ diff --git a/Dockerfile b/Dockerfile index fd1f7ec..b31f15b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,46 +1,128 @@ -FROM pytorch/manylinux2_28-builder:cuda12.9-main +# TileRT release builder / runtime image +# +# Every dep version is pinned to the validated set. Don't bump anything +# without re-running the full release pipeline (build wheel → fresh container +# → pip install wheel → pytest on B200 GPUs). +# +# Especially: transformers MUST be 4.46.3. 
The 5.x branch is not backward +# compatible with TileRT's tokenizer/model loading paths. +# +# Build: +# docker build -t tileai/tilert:cu132-v0.1.4 . +# Pull pre-built: +# docker pull tileai/tilert:cu132-v0.1.4 +# Use: +# docker run --rm --gpus all -v $PWD:/workspace -w /workspace \ +# tileai/tilert:cu132-v0.1.4 make wheel BUILD_TYPE=Release + +FROM pytorch/manylinux2_28-builder:cuda13.2-main SHELL ["/bin/bash", "-c"] -RUN yum update -y && \ - yum install -y epel-release yum-utils vim && \ - (yum config-manager --set-enabled powertools || \ - yum config-manager --set-enabled crb || true) && \ - yum --enablerepo=epel install -y glog glog-devel && \ - yum clean all && \ - rm -rf /var/cache/yum /var/tmp/* /tmp/* - -RUN conda init bash && \ - . /opt/conda/etc/profile.d/conda.sh && \ - conda create -y -n tilert python=3.12 && \ - conda activate tilert && \ - conda clean -afy && \ - rm -rf /opt/conda/pkgs/* /opt/conda/conda-meta/*.json.bak +# ── System packages (glog: TileRT runtime dep; zstd: image transport) ──────── +RUN yum install -y --setopt=install_weak_deps=False \ + epel-release yum-utils vim && \ + (yum config-manager --set-enabled powertools 2>/dev/null || \ + yum config-manager --set-enabled crb 2>/dev/null || true) && \ + yum --enablerepo=epel install -y --setopt=install_weak_deps=False \ + glog glog-devel zstd && \ + rpm -e --nodeps cmake 2>/dev/null || true && \ + yum clean all && rm -rf /var/cache/yum /var/tmp/* /tmp/* -COPY requirements.txt requirements-dev.txt /tmp/ +# ── Conda env: python 3.12, named "tilert" ─────────────────────────────────── RUN . 
/opt/conda/etc/profile.d/conda.sh && \ - conda activate tilert && \ - pip install --no-cache-dir -r /tmp/requirements-dev.txt && \ - pip cache purge && \ - rm -rf /tmp/requirements*.txt /root/.cache/pip /root/.cache/* && \ + conda create -y -n tilert python=3.12.9 && \ + conda clean -afy && rm -rf /opt/conda/pkgs/* + +# ── Pinned lock set (resolved 2026-05-27 against torch 2.11.0+cu130 + +# transformers 4.46.3 on python 3.12 / manylinux_2_28) ───────────────────── +# +# torch's METADATA transitively pins the nvidia-* cu13 runtime packages +# (cublas==13.1.0.3, cudnn-cu13==9.19.0.56, nccl-cu13==2.28.9, etc.) — those +# are NOT re-pinned here on purpose, so any patch bump in PyTorch's cu130 +# release line flows through. +ARG PIP_INDEX_URL=https://download.pytorch.org/whl/cu130 +ARG PIP_EXTRA_INDEX_URL=https://pypi.org/simple +RUN . /opt/conda/etc/profile.d/conda.sh && conda activate tilert && \ + pip install --no-cache-dir \ + --index-url "$PIP_INDEX_URL" \ + --extra-index-url "$PIP_EXTRA_INDEX_URL" \ + --upgrade pip==25.3 && \ + pip install --no-cache-dir \ + --index-url "$PIP_INDEX_URL" \ + --extra-index-url "$PIP_EXTRA_INDEX_URL" \ + "torch==2.11.0+cu130" \ + "triton==3.6.0" \ + "transformers==4.46.3" \ + "tokenizers==0.20.3" \ + "huggingface_hub==0.35.3" \ + "hf_xet==1.1.10" \ + "safetensors==0.6.2" \ + "regex==2025.9.18" \ + "requests==2.32.3" \ + "charset_normalizer==3.3.2" \ + "idna==3.7" \ + "urllib3==2.3.0" \ + "certifi==2026.2.25" \ + "packaging==24.2" \ + "tqdm==4.67.1" \ + "pyyaml==6.0.2" \ + "numpy==2.3.2" \ + "einops==0.8.1" \ + "filelock==3.29.0" \ + "fsspec==2026.4.0" \ + "jinja2==3.1.6" \ + "MarkupSafe==3.0.3" \ + "networkx==3.6.1" \ + "sympy==1.14.0" \ + "mpmath==1.3.0" \ + "typing_extensions==4.15.0" \ + "setuptools==81.0.0" \ + "importlib_metadata==8.7.1" \ + "zipp==3.23.0" \ + "scikit-build-core==0.12.2" \ + "setuptools-scm==9.2.2" \ + "vcs-versioning==1.1.1" \ + "pathspec==1.1.1" \ + "ninja==1.13.0" \ + "cmake==4.1.2" \ + "pytest==8.4.1" \ + 
"pytest-cov==7.1.0" \ + "pluggy==1.6.0" \ + "iniconfig==2.3.0" \ + "pygments==2.20.0" \ + "tomli==2.4.1" \ + "coverage==7.10.7" \ + "exceptiongroup==1.3.1" && \ + python -c 'import torch, triton, transformers, tokenizers; assert torch.__version__ == "2.11.0+cu130", torch.__version__; assert torch.version.cuda.startswith("13"), torch.version.cuda; assert triton.__version__ == "3.6.0", triton.__version__; assert transformers.__version__ == "4.46.3", transformers.__version__; assert tokenizers.__version__ == "0.20.3", tokenizers.__version__; print("torch", torch.__version__, "cuda", torch.version.cuda, "| triton", triton.__version__, "| transformers", transformers.__version__, "| tokenizers", tokenizers.__version__, "OK")' && \ + pip cache purge && rm -rf /root/.cache/pip /root/.cache/* && \ conda clean -afy && \ find /opt/conda -type f -name "*.pyc" -delete && \ find /opt/conda -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true -RUN echo "alias cls='clear'" >> ~/.bashrc && \ - echo "alias ll='ls -l'" >> ~/.bashrc && \ - echo "alias la='ls -a'" >> ~/.bashrc && \ - echo "alias vi='vim'" >> ~/.bashrc && \ - echo "alias grep='grep --color=auto'" >> ~/.bashrc && \ - echo "export PATH=\"/opt/conda/bin:\$PATH\"" >> ~/.bashrc && \ - echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \ - echo "conda activate tilert" >> ~/.bashrc && \ - echo '#!/bin/bash' > /usr/local/bin/entrypoint.sh && \ - echo 'export PATH="/opt/conda/bin:$PATH"' >> /usr/local/bin/entrypoint.sh && \ - echo '. 
/opt/conda/etc/profile.d/conda.sh' >> /usr/local/bin/entrypoint.sh && \ - echo 'conda activate tilert' >> /usr/local/bin/entrypoint.sh && \ - echo 'exec "$@"' >> /usr/local/bin/entrypoint.sh && \ +# ── CUDA arch (Blackwell sm_100) + scikit-build pass-through ───────────────── +ENV TORCH_CUDA_ARCH_LIST="10.0" \ + CUDAARCHS="100" \ + CMAKE_ARGS="-DUSER_CUDA_ARCH_LIST=10.0" \ + SKBUILD_CMAKE_DEFINE="USER_CUDA_ARCH_LIST=10.0" \ + CMAKE_BUILD_PARALLEL_LEVEL=16 \ + PATH="/opt/conda/envs/tilert/bin:/opt/conda/bin:${PATH}" + +# ── Shell activation + entrypoint ───────────────────────────────────────────── +RUN { echo 'export PATH=/opt/conda/envs/tilert/bin:/opt/conda/bin:$PATH'; \ + echo '. /opt/conda/etc/profile.d/conda.sh'; \ + echo 'conda activate tilert 2>/dev/null || true'; \ + } >> /etc/bashrc && \ + printf '%s\n' \ + '#!/bin/bash' \ + 'set -e' \ + '. /opt/conda/etc/profile.d/conda.sh' \ + 'conda activate tilert' \ + 'exec "$@"' \ + > /usr/local/bin/entrypoint.sh && \ chmod +x /usr/local/bin/entrypoint.sh +WORKDIR /workspace + ENTRYPOINT ["/usr/local/bin/entrypoint.sh"] CMD ["/bin/bash"] diff --git a/README.md b/README.md index fe5d3df..5cb3d7c 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,9 @@ ______________________________________________________________________ ## 📰 News -- 🏭 **2026-05-22 · [TileRT in Production](https://www.tilert.ai/blog/speed-as-the-next-scaling-law-zh.html)**. [GLM-5.1-highspeed](https://docs.bigmodel.cn/cn/guide/models/text/glm-5.1-highspeed) is now live on Z.ai, powered by TileRT — from experimental prototype to real production. TileRT-v0.1.4 is coming soon. +- 🚀 **2026-06-01 · [v0.1.4](https://github.com/tile-ai/TileRT/releases/tag/v0.1.4) Released**. A major performance upgrade for both DeepSeek-V3.2 and GLM-5, with model quality unchanged. See the benchmark charts for details. + +- 🏭 **2026-05-22 · [TileRT in Production](https://www.tilert.ai/blog/speed-as-the-next-scaling-law-zh.html)**. 
[GLM-5.1-highspeed](https://docs.bigmodel.cn/cn/guide/models/text/glm-5.1-highspeed) is now live on Z.ai, powered by TileRT — from experimental prototype to real production. - :fire: **2026-02-14 · [Try the Online Demo](https://www.tilert.ai/)**. Our online demo is now live! Experience ultra-low-latency inference with **GLM-5** and **DeepSeek-V3.2**. [Try it now !](https://www.tilert.ai) @@ -49,68 +51,72 @@ To achieve this, TileRT introduces a **tile-level runtime engine**. Leveraging a The project is actively evolving, and the underlying compiler techniques will be gradually shared with the community as they are integrated into **TileLang** and **TileScale**. +

+ GLM-5.1-FP8 token generation speed on 8× B200 with TileRT v0.1.4 +
+ GLM-5.1-FP8 token generation speed on 8× NVIDIA B200 with TileRT v0.1.4. Output length 1K, input length 1K–192K. Bars compare TileRT without MTP, with MTP at average acceptance length 3.2, and the peak under best-case MTP acceptance. +

+ ______________________________________________________________________ ## Installation -- [Prerequisites](#prerequisites) -- [Python Package Installation](#python-package-installation) - -### Prerequisites - -Before installing TileRT, ensure your environment meets the following requirements: - -**Hardware Requirements** - -- 8× NVIDIA B200 GPUs - -**Operating System** - -- Linux x86_64 (Ubuntu 20.04 or later recommended) - -**Python Version** - -- Python 3.11 – 3.12 - *(The wheel package is built and tested against these versions.)* - -**PyTorch Build** +> \[!IMPORTANT\] +> TileRT v0.1.4 is distributed as a **pre-built binary wheel**. The wheel is linked against the exact ABI of the versions listed below. Other combinations of Python, CUDA, or PyTorch versions are **untested and not guaranteed to work** — please reproduce this environment for a supported setup. -- PyTorch wheels compiled for CUDA 12.8 or 12.9 - *(Must match the CUDA driver/runtime version required for B200 GPUs.)* +### Build environment of the v0.1.4 wheel -### Python Package Installation +The official `tilert==0.1.4` wheel on PyPI was compiled against the following stack. Treat these as **hard requirements**, not lower bounds. -> \[!IMPORTANT\] -> **Disclaimer**: TileRT is an experimental project. The current pre-built package supports the 8-GPU B200 setup. For the most reliable experience, we strongly recommend installing the package within the provided Docker image. +| Component | Pinned version | +| ---------------- | --------------------------------------------------- | +| GPU | 8× NVIDIA **B200** | +| NVIDIA driver | Supports **CUDA 13.2** runtime | +| Operating System | Linux **x86_64**, glibc **≥ 2.28** (manylinux_2_28) | +| Python | **3.12** | +| PyTorch | **`torch==2.11.0+cu130`** | +| `transformers` | **`4.46.3`** | +| `tokenizers` | **`0.20.3`** | -The recommended installation method is using the pre-configured Docker image, which includes all necessary dependencies. 
+### Recommended: pre-built Docker image -**Step 1: Pull the Docker image** +The pinned build environment above is preinstalled in our official image +— this is the **recommended way to run v0.1.4** and avoids any version +drift on the host. The image is mirrored to two registries; pull from +whichever is reachable: ```bash -docker pull tileai/tilert:v0.1.0 +# GitHub Container Registry +docker pull ghcr.io/tile-ai/tilert:cu132-latest + +# Docker Hub +docker pull tileai/tilert:cu132-latest ``` -**Step 2: Launch a Docker container** +Launch a container with all 8 B200 GPUs attached, then install the +wheel inside: ```bash -IMAGE_NAME="tileai/tilert:v0.1.0" -WORKSPACE_PATH="/path/to/your/workspace" # Replace with your actual workspace path +docker run --rm -it --gpus all --ipc=host \ + -v "$PWD":/workspace -w /workspace \ + ghcr.io/tile-ai/tilert:cu132-latest -docker run --gpus all -it \ - -v $WORKSPACE_PATH:/workspace/ \ - $IMAGE_NAME -``` +# Inside the container — install from PyPI: +pip install tilert==0.1.4 -**Step 3: Install the TileRT package** +# Or pin the exact wheel from the GitHub Release page directly +# (same artifact, useful when PyPI is unreachable): +pip install https://github.com/tile-ai/TileRT/releases/download/v0.1.4/tilert-0.1.4-cp312-cp312-manylinux_2_28_x86_64.whl +``` -Once inside the container, install TileRT using pip: +Verify the install: ```bash -pip install tilert +python -c "import tilert, torch; print('tilert', tilert.__version__, '/ torch', torch.__version__, '/ cuda', torch.version.cuda)" +# Expected: tilert 0.1.4 / torch 2.11.0+cu130 / cuda 13.0 ``` -You're now ready to use TileRT! Proceed to the [Getting Started](#getting-started) section to download model weights and run your first inference. +Proceed to [Getting Started](#getting-started) to download and convert model weights. ## Getting Started @@ -118,11 +124,15 @@ You're now ready to use TileRT! 
Proceed to the [Getting Started](#getting-starte Starting from release v0.1.3, TileRT no longer requires downloading pre-converted weights from Hugging Face. Instead, you can download the official model weights directly from the model's source (e.g., Hugging Face), and then convert them using the weight converter script included with the latest TileRT release. -### Step 2: Convert Weights Using `weight_converter.py` +### Step 2: Shard Weights with `weight_converter` -After downloading the official model weights, you can use the following command to convert them into a format compatible with TileRT: +The converter ships inside the `tilert` wheel. It rewrites the official HF +checkpoint into TileRT's per-device layout — 8 shards, one per B200, with +keys suffixed `*_dev_{0..7}` and a fresh `model.safetensors.index.json`. +The runtime loads these shards directly; the original checkpoint is no +longer needed after conversion. -For **DeepSeek-V3.2**, run: +For **DeepSeek-V3.2**: ```bash python -m tilert.models.preprocess.weight_converter \ @@ -131,9 +141,7 @@ python -m tilert.models.preprocess.weight_converter \ --save_dir "/path/to/DeepSeek-V3.2-TileRT" ``` -Replace `/path/to/DeepSeek-V3.2` with the directory where you've downloaded the model weights, and `/path/to/DeepSeek-V3.2-TileRT` with the directory where you'd like the converted weights to be saved. - -Similarly, for **GLM-5**, run: +For **GLM-5**: ```bash python -m tilert.models.preprocess.weight_converter \ @@ -142,40 +150,52 @@ python -m tilert.models.preprocess.weight_converter \ --save_dir "/path/to/GLM-5-FP8-TileRT" ``` -Replace `/path/to/GLM-5-FP8` with the directory containing the downloaded GLM-5 model weights, and `/path/to/GLM-5-FP8-TileRT` with the desired location for saving the converted weights. +`--model_dir` is the directory of the downloaded HF checkpoint; +`--save_dir` is where the sharded TileRT-format weights will land. 
-### Step 3: Set the Converted Weights Directory +### Step 3: Register the Sharded Weights Path -Once the weights are converted, set the environment variable to point TileRT to the directory containing the converted weights: +Either pass `--model-weights-dir <path>` on every `tilert.generate` +invocation, or register the path once in `~/.tilert/config.toml` so the +CLI picks it up automatically: -```bash -export MODEL_WEIGHTS_DIR= ... # converted weights +```toml +[weights] +deepseek_v3_2 = "/path/to/DeepSeek-V3.2-TileRT" +glm5 = "/path/to/GLM-5-FP8-TileRT" ``` -Now you're ready to use TileRT with the converted weights! - ### Running the Generation Example -After downloading the model weights, you can run the generation example within the Docker environment as follows: +The simplest entry point is the bundled CLI. Pick `--model deepseek_v3_2` +or `--model glm5`; weights resolve from `~/.tilert/config.toml` or from +an explicit `--model-weights-dir`: ```bash -MODEL_WEIGHTS_DIR="/path/to/tilert_weights" - -docker run --gpus all -it \ - -v $WORKSPACE_PATH:/workspace/ \ - -v $MODEL_WEIGHTS_DIR:$MODEL_WEIGHTS_MOUNT \ - tilert:v0.1.0 +python -m tilert.generate --model deepseek_v3_2 --max-new-tokens 1000 ``` -Once inside the container, run the following Python script to perform text generation: +> \[!NOTE\] +> v0.1.4 ships **two independent backend libraries** (`libtilert_dsv32.so` +> and `libtilert_glm5.so`) and loads exactly one per Python process via +> `tilert.load_backend(model_type)`. Run DeepSeek-V3.2 and GLM-5 in +> separate processes — they cannot coexist in a single interpreter. 
+ +To drive generation programmatically, load the backend first, then build +the matching generator: ```python -from tilert.models.deepseek_v3_2.dsa_show_hands import ShowHandsGenerator +import tilert +from tilert.models.deepseek_v3_2.generator import DSAv32Generator +from tilert.models.deepseek_v3_2.model_args import ModelArgs -generator: ShowHandsGenerator = ShowHandsGenerator( +tilert.load_backend("deepseek_v3_2") + +generator = DSAv32Generator( + model_args=ModelArgs(), max_new_tokens=1000, - model_weights_dir=MODEL_WEIGHTS_DIR, - with_mtp=False, # Disable MTP + model_weights_dir="/path/to/DeepSeek-V3.2-TileRT", + with_mtp=False, ) generator.from_pretrained() @@ -193,6 +213,10 @@ print("Completion:") completion = generator.generate(prompt) ``` +(For **GLM-5**, swap in `tilert.load_backend("glm5")` and +`from tilert.models.glm_5.generator import GLM5Generator` with +`ModelArgsGLM5`.) + For example, TileRT may generate:
@@ -210,17 +234,26 @@ This example demonstrates basic single-step autoregressive generation using the ### Running the Generation Example with Multi-Token Prediction (MTP) -TileRT also supports Multi-Token Prediction (MTP), which allows the model to generate multiple tokens per forward pass and reduces sequential decoding depth. +TileRT also supports Multi-Token Prediction (MTP), which allows the model to generate multiple tokens per forward pass and reduces sequential decoding depth. Enable it from the CLI with `--with-mtp`: + +```bash +python -m tilert.generate --model deepseek_v3_2 --with-mtp --max-new-tokens 1000 +``` -To better illustrate MTP behavior, we use a longer prompt that encourages extended generation: +Or programmatically, pass `with_mtp=True` to the generator: ```python -from tilert.models.deepseek_v3_2.dsa_show_hands import ShowHandsGenerator +import tilert +from tilert.models.deepseek_v3_2.generator import DSAv32Generator +from tilert.models.deepseek_v3_2.model_args import ModelArgs + +tilert.load_backend("deepseek_v3_2") -generator: ShowHandsGenerator = ShowHandsGenerator( +generator = DSAv32Generator( + model_args=ModelArgs(), max_new_tokens=1000, - model_weights_dir=MODEL_WEIGHTS_DIR, - with_mtp=True, # Enable MTP + model_weights_dir="/path/to/DeepSeek-V3.2-TileRT", + with_mtp=True, ) generator.from_pretrained() prompt = "Tell me 10 jokes, keep them all under 100 words." @@ -269,7 +302,7 @@ Of course! Here are 10 short jokes for you. This example highlights how MTP enables TileRT to efficiently generate longer outputs by accepting multiple tokens per decoding step, while preserving the same Python API interface. -For more details, please refer to the [generation script](https://github.com/tile-ai/TileRT/blob/main/python/generate.py). +For the full list of CLI flags (sampling, batching, benchmark modes, …), run `python -m tilert.generate --help`. 
## Status & Future Work diff --git a/assets/glm5-mtp.png b/assets/glm5-mtp.png deleted file mode 100644 index d9ebb32534cebc783348a3db2cd204f037154838..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 240244 zcmeEubyQVr*Y5^JbfbX476mC$DG8MjkXAuaT0&YtkVd*gw}DcMh#;NP-J32&QaU!J zbV~P~>%4kC|M>3x|Bmr8#yO%0uJx?vne$h3y;6{qK7pn|qfn?5w`Fc9p->c+C=~k5 z(IfCDPHE3n;1?mgo9cGTR*&o)^`97_?&;fEn_JnLKYqyQVD!ZHv6bajuFE`J*Df)d z*x6az3UPB={QEb!tezNiS83X}z)v}5Eu&$JLUA!8{}DJPh&iALP^jBCBvhO}%?%Ot zs&wHG4l=#$jvi$lVv0_V_hS6a7~6{->TRerFfbr~=!Nx*Wo3Fx=<|~B9NUR=es(gu znXNASpBP>o%I*0klFPENF|Lx=1d1-YH!vEV> z;m@C;F8#m0>igd>|1Zn)?7z$MzpM)CzsvH!t;&Cw<$qn3|1Qgam*wvj_;2y}do}+3 z{J+cc?^68z$$yvS@74JC^Z$>RgU{DV&IXm<3gTxse?x3b>~Y~__7c&Pmc|@`zgWiIW@c8*;SJD8t7fq=iH ztL8iF^ER60zPIGBjUF}|EXvM*{Jq}y6emrg5!|G06)D<9!Q`i|=bn*SPlmlgF(Eyv z*seA8l6CD#=^}-%Pkw&YnD5U|H|#cU!sV zx==hByRTDdIW#vBZY9^q&$Lsr+97Mv5q~>`t;Av8^k9FZB5u1S$EYdIezyA*|C%n^ zxqsu!JC=Z-tx?0jel{6n(U_dEy4R)hAFmjtNn+8eakoySs(gN%Py$c1*u%N@TP>H7I zn)Bhk6U;hV-vj^pgP&jDr@O4p+>T%%kXdkx)~|WKUPa7Mo98y1&95A36^LW|u`-)k zQQ&=uxc|qOcbx4NA5v^~f4q|~kc$w~7$~q9Dm{VvIEr^&%qKW>lQ;+~HpgCTqYf*5@O|}yl+N^nu19ggYpaSq{=V?Q z4!qO&3D^CtNluLW_H>eJrVh@~>d2MkgO1*~0P~@euI@}-hGk*tvo_D@T<5a8bBtuo zz9;&tlk^v^uM}Y!qznK@7c)>wG?2e#J+<)<}_sl~%EX zA_MvxH=nW}RcU-KT5{_C-J>YfH{^Uqx&3;zHvPQ;{$aX)ruJ*)R(O>REy2?L?b-Qn zS6j~?B|!x$xI9-0Jf8D`N$Pq0eblMN(cH|TQpeJOiLW_u^k2@ed8EPt;M&xHWs=7c z|E>;Rck4a3i2v;Gtf&ZK$NtjAaJ!UAb}=ksi%YQzr;EhBKSE4lRI)iQM~X@mN) z0@DxiWzPJ@EuRdnZruqNw0)E#Kp>;xd9YWww^T#6J{1@6;PolaG&qm^rk$+RiSO_3 z?Q9e}%=h<^aF+%CTuOaPWVSZjvnbYNd{>ShLn;>~It0tXCw#EGqJ6ZxiMzs_Lp4oi zOqBo?=vMQBtPqx=Us+xRW4hVp9m1vk&%sD{-&wb8If@c5gA@FP=)Tx;E#=uk^C`Fi z*==D~U0MOlyljJpi>_A#FH(_bdJ24Y4BXCspLj34WFs(DHT}+d;_rPTUQO;r^k)^8 z4rm3R#M{sHwnw`C`VxQY)vfIdu0;9~HuY!tZN_R1tw>4r`|}p`Iy=@%Kxw`Vwn*E*Hw_cn$L;hftjShj8g@!y3xx2poEBy0^WyAQF#v zX8Yl=GSU16fByF^z7!_R`g6D91%>)GM@rYaHRqqxT4upPir-yUFS3r==`zOZCp2+? 
zGG-BCf$ElT$K$j#VrV6KN15ri(CRVql#jxW8TRx2T~Ng6;e>0l&VCZ}ERS%+J0mrY z#8sEOY)O)q&r~IOwk~EKj~gJm9_i5kczrInl_HM-)p$xZLu-2CJQVKygcNzPy-p~X zZ~493QC^+#Fpmr)I%qV`R3TO%t?g;%OO(%yi=U}O0&Ri4KY&7$(D%q zZcVG==o4#`D3l?)NDtetDIACpQg1w(QzJ`HB1eD-75IqXa?nJ?W!1XXsbYWIEJuKt zZ491ku88ZpI=iyMK>Y1@Z*gZD8B^6V3(PWc-MEe4waV+R#9#MPaOC)Dy${+BAJesp zJ36|SMr-D0QZozY2MSqSm952#I-x!*ZHF$Uu2eyxKUw_|b!Fg_pzU0TEO)-x!R}S2 zrk*F6MPq2q3J=_ccfOp9xD6fJ%MWiT6&@lc>%X>GwlW9BXL1RJy3zvee11amRhZd@ zYNVfj(0cNUQzPemUpOI3<`6n*ak!!)-)b1Yy*6tMph9l_B2%4@mk0DaStF_E$45(U z4u0l#TkkWTPxzoSHud{<8jlvg#r;oD{j#(^V5JE)Z#*ZBIpBJSzu3N0>iyJ^_iW1b zUqd*}67IaufwQFn3pylrurghY&88-Qii(Z@0&T4sV{al|;ih7X! zux`vj2D9Irt|_24ciNAT2(6y`s5$NN_e9k~OKlFFvLX!UgHELWBCvws`FKM>;bnvR zqNVe?G))I$9X~g6V`Edf_Zu0WPqvih(_Ghp|!HaR_6(`F`TY+Dw=2 zO3a=2?8S~}b|!ARdVA{v`Mv_xdH@rbbiA)7QtFLY z6soz8?wR%F{Pf)4hgRBKIP5V?D2_43`1R!)57dx}nahi;si+JeLK$+CoVp~levz@x z{a|lnz48!QktgvxQz{i9qp!ix!*i;3GRFOdTlN&lelK4<$KSQ_R~@x(?O6|dxF*dc zxrn)x)QlYDnOlTYHMYk{5pQVVT=H>HyVt+4MD;?ekA2XR6y-64R@BTl#S~gd&B2k> zhr6)%HvarvH-fCdc2X`J@byvSL)lZ^>9hp=kN6+f9_7$1FdyWKg%$0O6md20$v~S0_Rx#dJBgTpK@Ly0<=% z4za z4aNs9A6zFR97`Db6s8BaR z6Ev<9AC2b#Uc8yv*%o4ADp4UCYHNu+P^C157)jL}!!HMqlf*Ci$)lZ>WE6RI+?H=rZ>r~+ zv|qAF4T+1r%)t2|FYAMj%NGFa)Za_&t}j6ORU331cHb!W$!B!mUd;wb)aEz4RJsmK z<)>)L`VR%$45Xj{BAoM;S!+LFkUgr}9LA?zc?N}=ma(NBZZmK8R;ly^SbWjk6{wbl znvQdVW&F>b1E#X4 zRti*N0C4+4byjQ)c{&`V0ZZIkcaf#Q@AZ9lLE6OZUkEMpE7Oak@=is@SwAReeJ>b1 zOrV+Fb=vIOr#k)%&^`hlY43-20hiKdJxU>e6D^7r`21*hbD68`NDvgct4rhzf#?A)e~ZKzNN2K8^2O_39K39CE5n3mh}#G6#Kz-o9w zGK<_3-DujK_A#6VHv3oT(GRIVVf^6alvK8FtxVebdYLDi_vKh#I<^hl^Fm)Z`s}ab zVS!7Utq5McX9Cv>MAQP>o_`nY*OaAElbQ+w)R!ddy6COW#vtZ*xQ~q&7e`;5pq-!# zray(V4Ha>9K=9P96%(AY+t$wy>3UT@itcJFOYb?$>R!O1?AYP?dH+Rn_~2KHx+)N3J6$ zXsDJK0kxm7(CiRF%>t*%t|1k^6g*F$6hug)>--Z_I+xQFyB%4dUvaQ6Jh4W4nx`^V z+nbOm4LXC(v5NjlK#Vpkhlc0h@Hjk%PSM>a;gfQ~QdN^Sv7! 
zos$PGu1+@)M!5r!Kh5yVTRgETK*9C>okd%{-z`7>!e2+UUYW5Ix=!qN%L#!&J#Kh< zofTQ1jv6(E*jEf{0jbGchq|8w2Zn8Q;QbJvF;q*}c$x z5%NDpiuSJ+xTOhvN@V5RRYmtnApHdcmZi-~w_j0Qn42#gUgx5x6S7N&GJG9wKjJ4e z>piri5C;E-UZkJj3h&lfK@FfJjL+;*-w@I%T0RNtcyplkTPfwQ>jUBRO*zeR3i{0& z%`i}SbAXK+l5&VPFjuzHvSPY39w2tnkk)!B+7A?oE-2)0l1T$5)Lv6_PXrR8I;x?$ znM2w3o>w{V*~D3>e&s!vc0Ul2i;L?32<;^cfh@*nlF|tsF4%&HJQVOW-UrcuPH{f4 zYpj_3`o2j{cs@^|@YwY2Q7f|ROJ0tyM8Amljv|&Z@)3XT@1>aSO8u}yR_-1FLP$5@ zFxf6$p^C{!*Rn%yqC0cB&#aUkCY7Dy7!%-NYqFo_HBqRo;AoSr(&-z52W9~~&3hgW z-F{S=^U8ybg&`c-7at-Ltf*aNYUW`8les!;D`uEB3>(EGiE=POqnpyXb5O74-`)`aOmTyLmA?G;yNhg>)uddeIuc%YrK)kKFi8I1_g;2Vf1!AF;1)^l zzrL{G=jE?2|I?)|z1Oy`B66rc#^RJeKV0GYb$84;F-5nc9G)Y4)(}E^`rw(LRo`rb zo!VBw^1eCrGIN$6Iv8Lw?bmhGbnMO7r|<^ByTG{>GYyu<>U8GA1gtDy#qV9xD#}ue zrmubvMc)+wdHSQ~F#PZtVTZJPk=F%d28dAN*LUFo!Z+|+gi#P*2W-_{Y-^ke0Hq}# zPTG05a1=^>n@u_KjT%aUz3jzssbeOz$ZHS7zsrk>Kc zX+_0-7$JS1e5gi0x-Vfxob(g0Na^F*o10 zxlcb{H9k_ngd(|C1Y6K~;{rhHfNdTqa9ycGLDKY(Urm&fV)dBZFdV9RfK!@EaT9lC zLoVH(;s=%FK2VP((k(d*)c4#H`&pNK?Q%Eg>VM8_5-2BKP#{lUQ`-jdi}U?(O`&<; zm4pdX88VmQl#Pt?ht7+CPUx3lj?ZTQNm7$xpU&4bAgE-*^gZZ#x;*HGDCGBQhKAUK)2}o*W*i|d=mfX20^IQ zfSPO$o5K4=W754ew@H+SVzjt-aNnu3gIH=ElS9O$d9e8o)5ikB=-pYYumZd$_JR@> zm;;pTR8MqT)>+$cmw`%F5ETp**~BHFvFfd%muvQvtF+L3T`ch2{^3Z7knNfjPbL8} z?s-+vsi9`w4dZ275^;m(`}T24casc7z7;}p5cj`$I4q*VLG*1pBFI~_e zPzS&l!BH+4{PjIYsgGf3X%b-W#Y(9dLSq!M8t|D+>r_zP`hl?<0IqUUA3DqXZS|~NNa7^Qsj}Ub)0Yuq^l{fViu;@1o7qD_*m=BP=1+uy+ zf|t=4_M*M}o5}V1bO9=PE^DSxEyD&1u_V+b#dUCe_HF6ywP&u+#1UE&eKs9dX>{3L zZqNiyeMy^@>efz*0*HbWMS{VSj&}6+9(L)bp72eN_)I&q-pPyhX{Wx#cO=LT;bP59 zp=vGAE0PCDp0ejej;g=twh4!Bd6{r`iFE84-FG)zSxfxwDohrlXQc^bN}hKTvzjZp zdqt|*(M4ESAG7q#y*iANTg(1jyMdQ?K!t42dF*ZX!Dd-p5Z0X;;JbwgcSvayuuKqI zW16Y)+*?ya*c9yxwKVn7dJcE0#PS2Dg#84hrDy7hIC3WroK@Rv{`T?e(02mAB*$ISp6UmwDTOwQ&ta-hW+P-jmO$D?M-X6s-+N({YIMNft|bDRlmy+3YDV= zazRGp;0xbgDw;q`mEgpO0Z*}9pR=TP>^p9=Xnaq+cZ?q`0fxn)Ls_7tETSV~*_2*> z0KNU@u5g9W!9g-3|4ZMGvhUe3FRmUkOl0=IR$ZEoZn^J9w`i0aD)J_fmH|wX+nWU= 
zZB%V-@wX|%dGsYm^p%2UfUcN>Bu6?QXlfa4;fanPE{_Q3iH{4Suc%Cy`i!H9 zj@%md*qINpqzW4`KpNs;5*qu2qfN?I(4F@{xK|8wjf%tXHAp0Wy%VD^7(`8Y5(#}= z*5@8WrF(Ol-ko)3qpRof8CbFa40sN9 z7o~2<(WFuxrEhi@*7u7nSo(;hs;tGtFe=X>QU1B7w+U5w4g?@#mSiPsvLEF=j&A>T8pjET&4kHmRy zjuK~%jKfL_1_w$G8Kd<(+yy~7uiT&1Ye)yEpE0cH`T8IU=-7I%LBP$knS2h9xjLah z1iO-DZfRWcDGodn`_Xnfg5&qbj;H*qzt$^ef5_y>D5~yw?|U3m5+BQoWuGT0Q;Jpt z;Q9{d+;=DaL4KN2!kq%^(dw7;DkjuW6?`zD`+az6akSEx3ng1OfXbo&mX*oMOH&rHx7s8y&O$V1jsfe|dYE%fEGa5bq+ z1&M+x?!)0<4$OBB>6wPgfo;%Gr+hb`5|;af#4)Dz=z0!W7L6X+xQ>9O27Ar#LmT1yO9Bqc)%F7<|3au+8ab?bGCK(#?5vy2N}+xW0`8GcW5-I@`^~fi zd#kpE(kJ1EHW#5T4STWr8KW;Mo_@fdtXY^bE2b3W-}RJCbSk6WNg+8{5^u-NF_#Gx z}hwz<5!nMobczi|_Fg(GQ1n-8L zLT74b`3&Lxf0UHdQT9XYGhI!hTR$kNML)iuq;0+jX721ni@=cGJCd`_=g~>4$0|Ec zze`4?0mgOl^A0nc7L;JJeHATU#E9nnc866zk1B_nO6!c>iCM};NAh;1k-KovwGQG& z#?+_P-C%_l;x4PSI-(Dr6<@yJI)^Uuy|LliOQ2p-&P)9A*~U^M8XL9ad(i|y-aj8o z5~llu%)ftw5`UvYKC7v&3v#Mid+g1A`dL7CrMl^A`XQ1Ph_hkVopxqa?e>^*@Px7n zwG~44PB%uMOMwpgtmNM3f8eQw-8a+?uI&ws#kBydpyK*^lLF=kXsYF7rMI!MPNCt8 zTSw$s4>NO&ib^0wSCWW|;$-Gh>LJjE8W$=KJZ3vTzuVg!4+xpl`+}%J*u`3o-5I@| zokh=s0V2J7;Eq*CuO#cS&-e6Z8;qaw$QR>WL97H=_f`;WRH=2|9%mneJ(k2##Bj}8 zZ*U<1z(fgS8a%D<6D<+6AB+-9LANcMy%WQ(ajZf}o&2ffJ#ewcC8;R-8t7(dI<_%W ziQ+HSYcfgmJoa2%!RH*Z4V84tAsIG4iS=T2A`fD_HU)&$UlSA*KqWsHim$d@W9E6o zsYvA(y(Bs0$>6pmp;etp8gFFUN5euzny~TYOZ7S@#}x-9+qU(Tmd`)od4~*c4-*0oGvcw2*xC zYGOzV)H;fiKawH2)!{$V1j_48>^aY2p9h%9o3v#?ru~IhR$vRsuRp5%Cgd=8+UcxH z{13`3DtS0}7PS>CsVfJxohRcFOZz&9_@mb|T{oOme*EM5DWR{+RG{lHHto(!>~5`kwOBPkc^Et9(0fe05Ql}F#9{qAqN z!=`(FYCA?Sq`H9WNDwQ!8*JhpEfZy4f+HGyc2f$#ufNX$2g~Q-)G6z25j${m1u2G7 z^}47P{<>qt;bx7iLs|MYS}!%vWpaqHh=Gpp0BmFcNX(_ zI5lXIK@LEuq6A#A`wB79*RBnb>1zp~Wjgp$c+z+f)mdZ_9IROz5@|HL)b;0kHLZ~z zkU=k*3$;L%FCGFnfD6TSoTijl6!>vDG$4TngmqGO7iOquS96ulU6~yy%rx=^)~*SH z_Mrh_p?%t|ugLF3UDvOE_RRdLqzuq(s$PH8CC?HvD-aw?L6@iKKbRVJof0&J&Koi` zy|oWSm+U$>nww0y?W*6tdoW%emF@dzu=N8s12N5ij2eDY_D@};)5S{IfhB^yUnguXSw+KLvHPUDI{I8+fkikX z(>Wj__lVC8#D<==syS+PUwoiqe~o(wnS1sQ`-ZxtbG^k9cQ&2xm? 
zAueZI{r(=OR#6+A%crk#HV6sK^Kowh2O2N8^eYX*x6_#b*TT1Uw?AhkB64GMf;daRjjrQG zrKzlHtr~Pf$5rnMKL*d;Y$M|^GCtV43HTb0P??osnavHw#uVT|-%Bk(7P~g``(bvV z|MIxvc0t0?nfIk)dk7cy5O{8%EaWx_2aeMTbyi<7X^Vd8Y1J0(jYuhZ^SA9qz4_S* zFR7;WGQF0 zI0c~#t%`r1Qmds4K`bFKdd|*h31V2&6i1!(0o!Ge+`6M812^D4>2p9qmt&tb85M3> z!p5fmRDWi$IM-2=Xkxl=97F>f?+PAT#Ep4;_c`K1V3jW@f%Q7HB{$(7jbaj^ob9{T zP9+rQ_4S{M4435U0M&$LwTR>TG-&f^cU}96Rj}o*NI+s@Z2u5R`(gQ;!UFehwDkcf z8Ke@jS^&w2|Yjw|(A6c9W-(^0QXe>&lQXpH39&5(7{ zZc~Ch*E@Qzy3Ce?Lhyl!=)Z&Fu`jP)^XwSqT)0)a4OTA5clW+^LclgV(X%&n!rkCl zPwpexD961|0NbCvWlHA+Ga0;}(XImE$CuN`ljHnz5LgMptmevsDEIY`ngRXBU7&7S z^k_Rq>=*}v8rTPlSJ+Sca~@Q;RP#;mQ*Tq;(0{(1_2lOrM3UEXQcCl`{!3=#OSnMh z{9y5Suw4xizIiu>4=q~+9Agf!R@O{cs%BYpl-Mwwi5ba?hws4<6oxXLF78dpB`#?& z0X@zD{Z64V-bWNMnl)5O26AcAV)m4V6ZSd=^_xOC7p~u85A-+u8qAi_yARH@m?4j` zDj^m$`-#v)*U_ggO%Y1vxAuz>Eu=lj|2kVL#E<3@9(&1rNq7o8d>m+-w>2vWhql+} z&A>Oe(E5-nxB$tG>1zFVKu$ceJP&q;7hJxoz(VIj!m7j=Jsf=;%f+^NTOt-c)3qY) z5bt;keS0+qWk6>}L}MnZk*vVfBaosCef&{emVnI1fg zU54X5gmvu7-6h`v(4j1}f0P3Bw8WS}B=!8$`gPcgHEU1%)*4Rm`lncdlsv#RkgVIo zm$`hsZ*g=DPu-XTI!|Az1?W`&dkr#RRcS1=P6h6KY?$6qjQxm;cn2I6<^tL)hBDQM zvL3ZY&J)s--mHtzr#gs#_m8}XSPnZ$zD^J~BXNbck0WViR$NXH%`E%niwG4CKw&kZ z{p6NuF`9!awPvXaX|vEC+nN&sMPt=SRZLydiP3_iUkL8N)xp&?aeWqgl*3Lp^dv_e%=sKKk-}to=oPWE!fW!F9BImD zG3B>xPZGt~0o|WeY`9J%RF7T;4E>QLS@&jq+iOkB;tMH@(@seI zU?Fa@9V`xYXarZ!b=H6Isr13qejZkJ8nXe9wQXOUNpWVD*I@{>g#{f*zf9sX?kVa* z`fXzX98}sK@)|tORhrHtVOeS?#zx8AuRisuV&Ev`BDG(>-2P>T@xUl<>;)C1Js3OoW8zwMDQ!Z{@SG(?4z7oa#wf9DQjIv z1~(u@<*;%GTSRw_y4`sfOfK!vLLVt+tl4zQ{b<*VZEN&YUJ1@2^}S91$t)($4;lqi)?{>fNPuf1amQCX-3n@=R42CMz^+C@^LJK?hsi^kDPBW4BuW3W8 z;1up~D*~UvdogB}UbKC&5;J(#E}+5n?2Yv}Pln5YvKPrW)jgg5K(4pgzvR;NYP0p2 zK;l=i=N?9(E@3(^uBc7kM}3w&x+{hzTRK*$1Ofv$?gyrfk`qhBsy+*~J3BR_+xO!Nu&mQdQ+>3{f4N@(HrC#&S8k-DK_83R9X*M6ly?2Bb3sla*K6u zPlAXKSEuawTOWNllU=3ga6*L~ieLE^NazmxWUHe$QSUR-8!%X6A{1MiB!flag7MHn zIe{>l$Z8Y!D8TLrBI&U3EHAaX3kwSa-(QK5F_5 zJD8paD~?#yjB@tFn~r-(`haUnwx_K6_PH;sv$9q9(Kd?K_tKbHw^@0m3<3$yQ$0-b 
zB3&!}$gXsdvIiODgZXVkT^ebOUfOIfxMk}AUtHX+pgF7q5*k3_n#<7&DJJ!V%?VJ+ z?jUMfnn`=C_dH+Xv}l*~9(Fsd+k1aMN$I3>6jAf&dBxz&d{371?E#Q$_&e|IU77|px)MnOpswQp$b}c^s!y3jB zKdUp9vD+wYmuXt})oSn6ipTzG>_iy8mvCC+o_0#X)puEgkl2Hx} zdoPCj@>vg$QsEC{s4r<2d;_ttF#d?%GH}X!*iv-l(b$J2KkT&6z<@xtuu(qUSPcoYH76A+bth@+>DSs0_TB?m#d>viuPw z?c#1c)A^Ze^`ef1sA$#0oz7IX)+3~6O`xo~>TPya`Ov7y?1S3rOdQNsc?bKWy#tpy z{zLX=GsJrxs~9C9E*!^rX-yctsl+fXTa|)t{l-odi%1*zdCa+3uK<&p#$J3+7bQWB!@S}1wm)>6|&;TEEytLPsq3CoZ>t>P-1)kA)| zsFHia_T4omVv^uSIV7MPT0^suGxlO5*yRZWouz># zf{aLHtK|?T1SwceAT_G?0cu*Li%ia9#$s8;zp}tej4GhxJB5ikn_f)xpf%n|XT7bS zU%k5zUV>{tW>`34+F!;VWjmS-+}$!G&b9VKcT!tiS_fb37M2?))i!tuS!}SJ3>cN^_)PFIF$&%y$@o7v%YcO zDv4tYkVk1R#i1p}W8e#1c{klNGZ6&jvO1-Ym|3%#T?{D-TG;sq_+u=eMtlc`q5W%? zI;3wr5q6lHBGg6_PUCpS_Sj>`Uhz#S3@eX#2Y@!&hnRN&GSf-a8rij*(9eIJC|6uI zc~^F zez<_L{+V*cUXAO7NBIuA-88%(mrIx}_v)_kp$pbxigA+rROKJ<&ty_rCuWI7wUvWe zSGuloV#fuTG;V<7n&O73YNRyP)aS}9$aJ({c`rHRhJVH8bZ3&0K8GT44V0fcBU)bS zNDv}M?D0T0i_*A-J8VOJJoqdhU7*^5+)5sj<0;((S(>uYjBf~lN5ia#+z;1HM1g|e zk!?9@lep*}HpE6oxkYWQm30AzBfa*Y7VeezcW&V{5+*TKD|g#^W<`=C5EUD8;|L8B zn>%@sL3ot#!TtcCP|o`Rw3;nT=lPKw&|4f^r3Bm2QOJ{96R?+$vawIS`5crgZl}gi>L|1g!|D^Ot$T~rk zQ78OEwu&5LQzYi8X4*X#s;q{SpiciXb?w$LE+!|AYCFn+WQdsPMK>Y{wW)l5&I0+( zh6xDQ{QJXkZLv-*=?ESK%fY_ts%U|EAI1HkU2i*1ggfb2Ft|)H%|d{qt_EP11yuM9 z;SYTggqe0Nn3mI-;tWKmw~jnUCL1&!4F7{yE~bvKV;hpiUcNwJn<%&N3q7{9TMgpl zch1fA=QknglWv>Y^uiLo;Pv@|QdDl$Q(|e%sW4yOi#WjOKpne^I9DK{Z7-oN%1cal z_0ipLvmK7oel$P3)**W6FpAYUW+d2KwqL%pkZYDvT%U^Db|b>p-swTQ?Oay@sB+z< zj#dI{0W5wyGb>bFCvQk-YdCLQHECm{=AD!BPC+F0I;qiWtagzGq~AG}w+>I3SjM~Z zrt(fk@;+7yAF`!QlxC6 zM6`7}TcJob9(9F?wqdqJd_CT|s5&mLm}Trt;2*tB|8ME-FvciDv#=%A0&>R`L~Dv#rX>ua z7>2Z_R$q%{bNFHiIkgmyb@}%JR^qoWhQY9b_LDCc@Yud5z)@;N-YDr`Mt3T_Yyy;S ztl#*a`;pgeSgW`*X8(yX6m$t+uI;Q9g;TJWO;&1q@mW2_{w~=!RieVa-aH zyh_zVZYCaAebS9}Y8IjZug#{OPn~A}R?`5OZUJKod9-UgfVFWUeL!)jXE3~ruW#<} zZfUd1*7;&^{hpA#zT9j~6}bk$)G1#C3D^6OaAi5MYvjZaw9e7D^h4IVC_tCS42k3~ zTKlO(j@F9*$q$*#f{j7j01|oMBuHjz0O>2MHo5YCA`THEz~~eXUPIi1bosqZ!faj! 
zCDae7PeY|2?PAT8TFsZ&Gd=g3cHB(9`D1;|scL^nF|n&@+rPiyyRPP~Eb1MciCcg0 zZMNNMlu|Et4R{ zDWHkDVsC&1!{0R9Q7YD4_%MMn!>KIy*RiIR;sBeaN$%#GRN^#?+UpX~=;_4(m zf{mH!>k{C(f)~3{$5sS)$aCx!OIbynBNsI3GsJZNsW##d&iy6VJ_Fr{c;9zV zba(qQ#XTY)aA1#`C$$ATDKBq>Y65i&BllV6%z?$kw~v<@C@=#s8uf*%*8l+sGKR>k zn_eY~T0KGBcX_;_^dT>HW$;N8C&a~@L%Y`ys0z~)g6=XUFy~bAa1Q_@OUe=lwzjtP znycrGE2c3zIvjS(1w%T-mnR@Bp~kF7FP(}fCYqbt-Rgy$V`r0Edv2@SDvW>Jb5hWR zY0CMS7xceO2MVo*L%U5pRo_V4z3+y;+P zyDGKSKGm#MAkf@1taS5J1Cn%tLoX`(41kFX@hihvh|{TqNl%$$j#b&#iUSYo72Bq~ zTc$A9^nA)`P5pS7fJrhkMzTKgqFc3GB~!6QFHRg;6UjAhMVeXB3QU(!Qui)Hc`MC| zQz5Ht7kPWs|$w*Ss_Ktc?(~J)=YL7`=UmSA^^XtKzw=h{$*Dhvqht zMDO$zwYtf#kPZv#rcrsNwqc?wN8010rEzXVU#Ck&SJFWIzQVMD!&}^>ED4gcbLX|f z4|1I5OZ5pE&7~`$&)MD;1#Q35*dBYIxq$II$nSM~Z9kjCdtlCItjl40bvi`*@wc^O zRpnel!gq56DZAB-yN*oJ^7_+N=ysJK=~*0@{U}Hpssc7s<8?}w z)Pn5+vGsL0(4}MAMD#ZdfbTkhDl-5oqaaoWO>{Onx(xctI>ZS2AfOnx4P-xR(MOKL zS&Soe1_W3?+3a62-7!Y9_~9ZO1Icdh+T{r1TeMzK;MqaHH`FUdg3=f@Eq(ndF!_-2 zKpg@abj7P~KwG0-3njA|xpDqNUnwp~`=7-(@J^k5EIKma$U2hXv^*AK0v#_G84XIE z2S+DX^aErCR!tp&ycQSVjn#4*#d5`85Jm<@C=1Mq@{v4@j>QK~%~Q(c`P+evB@T$W zD0{xS3Fxgw-$!VS9?6q$4CzRk{islb5V)i!jh_cy%0Gd|j}Drp{FD&yk)0zs2scsW z9%eayhu&?eN&!!hTu}_|ylkQa;i7N`tV86J!|Lx6RbL1rOhOXvnImjOtG8q~g2;$g zE?x;#TfDS7_u&;(GEDCTkcV7oS4NLT&^H2ZRAD8=y2+_@eo{tbR_8K!Zeu5yg0@kx3bO8qITjw-AmwlDMMm z(ifbLf2S4n3z=&H!Oyo?W&vE5Jop}w6x0cf8QZgm;N4tKr@lr-^y@Q&V%2+JmL&No zOC+!evtc!sAlD0T{dkuS)Bx$rfQv0UiIF)bE}n}g^M(+B8v$*K%TVs-%l7BCH=)nL zXjm3#aRUJJB*;~1zCgoLD+M5RlkV%tcW#d0Do5I}DwflC4D|9OsAibmd@qw=*bPC& z|9qhzNURYs+~ei~k2~UDda7lACL_{`bYJf~6@s7zvy5n4$UEq|a63Yc;CKG@qtXG* zBfEN5NuQU-EGHakF@OmuOGujq-P1G+xn_*a%GN$gbBxRZfGfD6D+Tg}nq*H97gSwi zWOfs1%2#&Pw9veHh0`IDVaNyw^-ILIkxdv02aD;Vn+RBe&RRA8>l!f8q^4&8=BIY@ zJ#}9G?ZZ=>&rKHOd+f-YiK;^#=NObePZC76y@w|6%B?1$3lcl7XQ!EGU2T!!N{|oE zXX0nru<-@AJ7T>&P@tstwP$9=C%cb$~@#B9mP&Z&mu9p0Kqn0 zu`tB~a}83VKiC(8_f-5h{TL{YE?)w5eqW)r4myYSY3VDUdsmTB?HPVU0rqWk7--?J zdt=xb$b`grYmFk<{d)lz)Gqn(yw?3*m!^?hHh>I$bA1|x1j!q-<$ND$7!Tcki4EF; 
zgj^!DyVlZZnvb7hx;ummndE@9V;+p6)w=f~fk6@{^>Vl33(Q_C;LPO0aCJ%fDBuZe zg0nK(5~IP{&r-LKuuN|+w#7Wfeg<%h%(7JK(PHZ0oOa=x=q^8eM$)`;tXihl-7^|Y zOJu@L%Vao)@ElBFjQHr~4bilcU1XK9ivrbcCa_878Iq)iojwm4%%kS1v6e*XTkBD- z>vPA<9l@A>0KAd2o$8@_;@vYl;39LD9*vVv*1?rH%1GxSa%v@ii9>Z*?#xyy^ijO+ z%-zJTm(>KwdEL0X4JWp=T(o~=*hw)opq><~&qkgP(_A5Uk(fH`YUMckIhU)m`J3T|fi)EDA5t0nZ7%=?(6^}={Mb`RyrAWvXYxVp# zfv3cxyIn?Sy})(lMF((TsCpOjzRTsFyXmWbTHg;DF#l&#_O{)eUBD5d_!&3WAEKvV z;M1VCI7E^MGnhDk8o~oi18jp^P`X}k?J*Ikzlxdsgf!*YuFuL1+OiU2Ie`(6%Rld| z0*NPeBV))^U?++@fX#I>O7U2TtS(xxhb01kQrE8i<`?|&->s{WVn0>(_vfgNL?Lj%ZA2XsXv{pAy(U${Ca**;0NlA>+e2XBuDSuq-3e{6n# zDbt6A6r+Cn2$7Tk5SmV&uTK8yR~+WlGe!B%co&zHyB)^fs_WqI4vbAxV3@QOyO~?@ znO{^ECUZF{w+5MKgbgrM5Av+&M^?B@s-GRZZ1HrWP+~L|9k3d~J#z54GdYMv@J=4U zts@1He19sEs8WF>w6Pepn;Iy|C)bNgydYM1e%-7lt){KR59b=D+bOOi?-) zLM1hmEF|icuJ{!R!Mn<~H}GnO3r)oLh-Sf`c+-2alnA{EH8Mm^7(0-xF080RwFWLb z;`8A8Ig)>XCn(dqj#@@*PvLQ;NUYFCB${1arSe|hhF7%#4Zk@ez?C8?D`Z#@At2i1 zcz8>Ow*EHq&IAysDo8jQqr^N_n$#};1WCsX$&Nn>AP)hc>yrOEF|d#2nWP*fR%MS6 zNfIe$uYw4G3rQN0iwg9oxNxTzsE(Pw;P{)XgIdY*QKAd<<%og{dTd7MFbhh>-J1+^ zMS4J>mgAE0&2>?Ns}SXU*K@^vA{se>>?Hw{yMm<(q5YQ|&c0h(JL~)-NM|0VwdlK^ zbdhZy-r9bCQ2Uy2#;tlRCk_deHY+2NbvW2U|GZa#=h8Q?9QV=3g?N`##8bhKKe?W7 zF~HxcXA)(j?$)3`lqx!I4YM_GDasTpK+36jn2}&hb1&Mi%tUS^7jLrOGx?zH zGAZ*7n}g`&yK|cEgjrPd`4EzOc35woBn2!XOTECb?Rj;Wq!G%xccuE*74H_!)5v=+ zx-0=|heS*}Yqs7EGQnSV+gvJ2vjR%-&v?@_iPo8a7$U2J*CEpX>;lGlBYcJ5zjk$> zl0swK{?STFbro|L81W$f#{mrKfhv;xvd00|C>^SyhK~G2^-$1!DyqMfj(sg*}(8C zMt4-0m?0ON&#`~sq#L%xqOTtq!?V7%cCDZN2pLs)KU=3mOYXy5;@FiV;32xiC{0v%9KpZFLRyGmouQ#jJjty?*hh4AVZhvedoct-`hSsaD%co zdAZ+Ml=6|&s5eur=tlHnnE}lB=osWpESED~UPiLonInbJO*yh6@U!3GjS5-Xki?D+ z%dBUFnIr&W_Ut7kVy*`MCIG=thzA>}Fmu`ugnDgK6ec zV!f^M1J&q=*3EbCVc7UR$<=uBzSsDBNU#GLQATnQZ*hXZ3p%+#@mL}2pRVlngY7|H zGBB)xf(v${MC@~BrkiYq| zy|K7>e@Yladb5DYj98#C*kVOxM4(9LY1=mQIpCcqonPF%DHZboePY!fcw8THa0&0{ zNMc_}a3_d7j6qg%0K2&tCR`#C?3q1yPwgBjL*C7k8WFMqgsIm4<8q>_ zVqG7)`)J7{_e&54OpTE5m1fs~;RdxSOXx0~8dP$D(1I$xf&n2P;o_Kb=}m&P%PIb5 
zoJwfqG`!y@s2hTRJsLUN-ZXrs=Z$b5PrXcNE!WeDup$a%&zgZYo&(sl&O#ggLs{W5 z$Wl4ijY(p8I!*8lbyw{KS&L@CBglpqJZVB79jEYmpXU6+sxr#Q0%M7xatUSIDYeP~ zb$K?AKI&>v%WVk3F&8KvB%xAF2rYaFv*d)P;}5nEFkWl$YM`6DEg)(gOu7caTc0jk z<4@b&8*7U_(d++y;^0+ilhHXpHkb81fSt_#g<#6rlAKX`FK^u^3PIBi(h)p;%H+Y}IbIjVT)=wZSsiRN%iq!bkd1RgmhI%T=uV7PWie*}*HADjb zIfoYvBEF0av>Lu&JgcbB+rF9vt?pi*M>KG}+Zt9EzZZ8(+j3}gfHLGDVUK>e4?7Nr zue)&tG{=9Er4pX!E->@TxYbbdRUAiG{yHJk@yQ;%^N^GkQlN4-^-v*;`|yIEJuzjt<6kc}6Y^BPy|)%0KRi*{flHrEmlFKd$ea#N&t=a4 z!`^%U<=p;%z-JMz6BWvc%5F;(GFoUT8rqu%P14?pC|Xu%FEq8Zr%)MbXitPBEora& zdAP1zKA+G12i%Xx{q@6DS8{gtejo4Sc)gzMwSHfn(1Tvh;q#s;|2&?CyZ-P2NGfc~ zbF)2Eo8mmMN_i*!b1j_WaJ3ZSgmQj=aq(%Y09DhDd-~fn6t1SXwjZb3hH-~O9~Hu< zLvUdpimcZ?Zm|@shvy%YkR+nn1W<9+czi z^*0w7Ulip!=8>jOEk9D>=FT_0H3u?*fvCmi>n_l%<2gDc2`YTUaZ^y+Uokp2NH0;JwxHQ*0uj#ZO%znJ1zN zD+k5~!4k5FDtYiK24;2)_lEV#aD*A6Z2K7ekIFBQ?~jajBv zNbci($B&v`O^1GX4dZ7mx$L>I?I%JV;44Nx*^I=#T6v-!P6=_viLqLMvUD(Nb~B_6 z{npUzXuG0esOZRxE_`}H>A zo9Xl?7LIT0Jvh1OMeG5;?E7q-U)OJ6doH;6tnMm7$t@y+XOtaMM5yN{aZ2=oksqet z(Jq>F^l22*#O%sO_=|c~)*2n`-W~t;4DYMh+@Px2$tTT?RbG6|U&MWgm@0*mM1XlS zM{?+mkD+*B|5^CN^8|h}Nl_|1puEI35Wx-k-Cw1drXLKgta*KlgW)RQQLf00e{3&@ zv#@Q;mve3ec)^9wXk-H4)(S|%fsXALy)K^5j2B~*#zH5aG=o^ zH|4s>-uL*QYfYBGMHCn7FNY+Tx#TES5tV|n^6Mj}3h$5LWVAqEru)>b!cEe5?I{$c zncu}esG(0>YW|2e9rsS~cA)G?Bi!JY`cY3f7kW~XK56CI#M^6irtNMyAE2BKyrX1~ z$k=H@+=h*RzZEAY)a;VI>8tj`AC7X8aZmLwnt-i-DNSWs{lN^rxlx=fTG-QfCAbeQ7g+Kw1Z&F#n`x$8)R=X%$>Yx% z#UfqSKtL8OkXpz7T*iW#wU@ta5+C>wL$*Cu6Ybgqd8ky6X3E7r%y_&1HC@KM zFt+qrRo3s~#COv6oF1aQWWQ!n7V{MR6|EN-z+`N@^KW~C{kJ-wTQ` zI5|Cj$(Kcl`(r1w@oI}DqJcYpmCur6wGi!gO|s1bwhJ(IoWBU~=l&!aeW8hQ&Ypq8 z7zU4$izp|ylJagd+gj%K@zv=hb;SbP@YU%l$XYFNzIGYL3UrHHe-4FH*VtA%1Tm8z5l%}cXteZzC5F{!ak|}i>k)2 zn8lX5O;|nyf_&?<$MVW;w>BXXFKEl+ISnoD;as<^_qzp{`_=9d)PEHGk2*}ag$!3c zqku7lDmHpe3bBh{Y%03w*He1NeV0i@7qeh+(vWa-?};a>$(nRo>kinN>xV!iUjt45 zsv!-M*|c~@3jkR%T;VJ3&XMpVT!1ZEq~x1A5?#VycFZsVkJ-YvhtP2v;m~qbuJT0% z_~ydk?-WJyj$fKlc5OYf2MQS4ki2A$g6-9-*WgalvcyJaNf&DoTPURROdMC2Y67>a+Jz@ 
zbu)ACK$o?sWN<|6=FYz3UNcmxl+PJ4FHgE#USe0?Ffu(4k`{4gE>*zF<%)aks5(_{DjG3CwUzKQprR1F17?G)jVM5nvp**7LjW*fFk9l|^tMS0vC zCofG_5XpTYw7iS5Xln1vuT>Eo_`vMO+%uC%n;rev^zV*+&hL-XP(XRBE2v4Q%rM;T zWS#syA5a;lp#8hJ_(<`=0j;EDN|H463(xSfHCyvVUv#G*n|DZWYL zh9J_EURPzp9Q8&C=0+v>nOEenZG6}Hj7RRoSSCZqv)`16`i(hRj`_RoAsDkc8r10G ztX9N1_(2%|_Tcavg{-B45U#X9GGPL$QTP+h1T;Hje{KLA?7i3mt;inyg|}A*N|28w zkA#`GO;+GBF3qNs4niEk^FvQK!|4xv-?_$92!U7LeRkiQP9d5g4NLKImc_U`&aiIR zeJpz|zqju}c9L!%;SYLI_dp@OkdFVMgo3i5OI1vNmm$L5%(Nwxe~v+`{m`)FVR-!# z{#U+~B$PZF>B#kYvCFG}3dBKvx3rfa=Nsp_Ip@x95OJ9}`7{y`K9fwy@|_1Ip}`0b zw2*EXf8sc7oMJ~uW%7RQkjd{Nv?_d+>zGEPyF#kM{v|BMw+^wt{5hmCP{S0u-lyY0 zWBkePz=3Z;{CaFd%>rLF4p#-`(QmGt_}Q_uwrL7cNs`)9F_L~H{AooirROmio_^gK z(5zdo^b_qeoW`EsuWMaLKwupy$aba8p1<}bQJ=jB=W~@P9#7y|FdHB~0}Wt@`vR}Y zLlj?EvS2sVZoJuwHM)@8?EA~)?Oq2@Yd^WwOx5Y~D42LP2ykC-33BDH&eeu|@Ung( z>AYj7e)hSa+_aDD&HE6W{07(3?S`n8g3PJ8LyB;Y^XlbOjgMa;T`e|r_}&Fl?5Jmt zHMDKIC7~JaHfaKaFiZzvHA!J130mJtXe~PV>MICC#r~GTr=D5@vg%BHKfsDH(=jk( z+YD!3ert3o4~Q&k-RrXB))%&-sZg$|-rXVtFuR&q^L1`~E9aK7p(l6qSE+T_X4dVy z!o5U?X7UE7`hu`A~24X>dMD*JqB8o(2~z>y5yLMK(IbmFq-bJ-}< zqJMIGN@(B`!8@q60&0B$U^{Hkf_Bs%VM}yufv_1aT%8E*0O!X}xXx|pT^0@U?Z9uK zPZsRz&X4^FPSH%i&8xBN@hM&J6s+VteMWYWG4@Bhhv+X7MA+yWIACX;UZ~H~Z9c%4 zjI5H{ZIoW`E&dfh9NP*Ut(9nxx{R%OX#SobGIc9DZ@#cVh8*+Te%S_ zZLx3f-(-H3g_1VK4kcE=xP7Ifr$qWLi~bM0b&bkrMhYK(Gy$w-vgQH{U;8DX9oL(T zVe7E;x7qoNlNi`zQv2NOfJ@8q(0xOZb5r*O3^uzBq&}m+{%!GA4;y!e6f=>Vs0J60 z#ur?jXRbC5UGPf%2dtz}JQTRbyHOC1nYIG{z+qIrvgmk;2w_54agvYXUb6m(jEht_ zY7LFaKEuFr`1H0cViN>N^niX6BIv;H$E9VFf#xl{Px5M{JopZ3JcuPU-`F=1TO(vmM-^ky+f=*5ILZ^6V=nALo(>- zL^!w&iTWXE8FrZ=(?>`(KRDE!h5BKLOuJ~YUGAPfYJiE9x#Qg&oRnU8w1?^4>1xBm zrI6aN3Ky+!q6|)9FPMMSDK|bZ89y-pe_}0)L-LQc-HI3d z0H=Ip*=E|0$BXw5oMhT<^R@b(yDb{YQJLkQGH0mY6%@PSC+$P+Zbzqpj{QW-==dUw z@hgm!9!>xNK-G~&`f?@H$5aMa>v6KnqV!81oAB8~dA;|&%qF_~rF(g~!)VWx)mpUGJW!{G)==4XJ8qJa{XRL?vv!!nlo71Af?zBD74@X+jd7czGA-9>2Bw@CW#GmLzNx@5B(aZYr5IFBPqqtE|9 ze=5M4HXP3c^vu9Vb-Moh|oQzEOZC;P!H3g|$!DpDXxK 
zD)9)YFt*H*SxaC91)iff*S!@)YGRJp83Z#on`w|}^i(5F2gQe|f>iZGh~5}5IIlSg zo|TK<$kdq|I9a5e4YfeY9w7e5EArOhd0^FM;YX7&Of5+OdZS(b!SH9NPhk6K<6g@3 zAz>>bIxQKDzeJ}$gG;cY_#`%^xbVfy39AIlxC>m}aV}A{ElI9bHciFC$tdp)!kh2D zf!B-ws*ByHLkH&<MFs1d++h-6+K{A!O{q$kt=muGVPaVy7>7xw*;rw3EFne}^p{n~M? z-i&iyuiVxVdB{6PO^HxME8(ghaxMi@!{l=elX}ONuku}U=o$NW;bF1m>-YTB)U^w- zANN;tvV#XB+eqR)iIP~*Dj>q~7N#}hN1+#vJ>P?!#wy}0ZK@}2GJUM0dEkQ1_5Nt< z1}A?k7pL3hMce0DrtZg%gZaSO>b74Z+d{&)<9>Ffhs|GlL;F8ZvE^%ja51%XLv)&n zV{n^Ih{ufW*Xm7!9&oY#bf3*8s?q5a1lMw}XPbM3eoXi3X>7G5&A*zVl_-=QuFG#y z1JrxCdHcq7p0o>2Mpd|@Td?Z0XJjX8*AG`jaGDsPZo6i3s7Ra%4@ZV!6Dj_Xo~ZLk zBx;K|SgH@6G@jF-Bv)r$f;l_O!kWgBy6|gbN`xx}jf-b2L8}xr2)Io<$gb|R?bYC{ zAUatTe$by7y<%GPYTatD=W=2Dg`SLTvcqpOV)|&7<9lip(MxYv9KSQ5j=Xn=pCdlY za^B@I1#B!%Mx*vUrcw(S`R;`W33*x9p?6~>FKi%yYE=}wOx zO^L398tBX04B8v%PH$|BBOE^ZT1}8GL%vdy zaE>gxB9U$KZhL|SC4CAT(779BX?P_~G*KA^uJgCWynJ1&(p-R#}wO;USpjtl3HsbL{bA;yt*zWld zLxqaxa1PV9-r?1#MPdZgt36M(hwSm(7BIX3(X-atBles?zJxwvrC29>6s-t4dk&;+ z7D6C%6ugi$kUK~6=^7r%v%^;Cy>#19RBsc9_fc3TZY+C^s@)s`N&<_oj)#}y2_}*$ zUURCDLZo*9@qZ+SRf(Ky&l9S145SsyJP}(&Iu*is3W;1@7yWRSch$s|Ej_b7#?wn% zXD+&Uth#5?{wAb-SNG-xp%@< zm|w+JhmoThB}^Zw84>ANY7ugdv2b8*vOVX^L86?45Q#YPY^PdLHs!zfT>71C+l1s4 znf$tCuRCD7quOJQ-7VMd5`045F3LJ`s&;Ris`V4;;k!`7IMCf@TRZ!3>JVr3IlX-W zRl_@J)GHnnI}E(44)154pdF6;a_5iw$8Egl%>2y_tf9-r`Sa5q#VvlQI4#GDZON$;2ri?6eUsqKR*-TI=UQ}>FhnR*9iyVSxj_Q99F$K+Pv zxHBi!YtbMSvdxJ1K24k-e(a(hX7|y@-i4*q3~t=#d-5KR9Xj_bhs|+Qu(L}95u@y+ z*g=lkKiAMEva-_F_0p(^T+Er-$n)Zbjm@H4Pa52x{Bf8IqbtV7e>d`;EB+1c-_-)s z-REm7TnObd3LAt2a}`I(HO)?-eI)BS5=DU`ofLx7ht?=hKMA0!s61y`&HS>zDQ!!t z4x$D)vm~-OMMO|`6t;jym!1d< zM5YR;nzM6Co+Vb%C*z{3vE^wrdt~>G@AD70JQrlYR+C$~PCVU|(m|;z092H5wSaajIp%Z#hJwDMh z@1|WAu9Unm7*e?x$nzCwV)O{N>p@a7iNPi6}|Mlxw9iEFT-3Jmo1p=P&J= z*P-^iz#IRw;CS*&?hKJiT+uryC4WUhmY%tJ3cELn9?-%uUvTYvV29UUZgrIsPpai8 zf9Xrp^K`TFIG76Qy-E)dB3FSvgnHX`fvqxU7-GgRpIa43Kl8bW5(?~7LtQd}t&jOh ztfy<|>>>@gwyJ(bgtr#~Kd`Ggy2fZCH3GOeyi)gOQBbI>lMXl~a=YI)m~qB*`-!Xp 
zy#s@JHoE(`w^B(qor{@gWYJJ3L)@^UTqQ!^14SE&_2uI|S1@$NK29E4#RYd0clsl6 zXLoAY<0ulgP`czAGCj0P8!wI59QwSfHeLTI?ZAXZ5|ipwi&=`s_5)D-7q-3)*;|EN z1pnKPDFW`ZE+i9Gtrfh%fe)=QJJ}1hWMYAnv?5b+K`WY>R|ll6=fpU@HZ4{hY`e~1 zboSIb%%UNJHeHJtNB5{y$JckT3{Jp4;7^Ro?<}l;ZmX5k9Lp@~TH8?VlrSGtK4!W9!CK3oSMm z_JsL{yKh#!*A7T$Zj}e~UjrZJZ?iZ5AM9Kik+Xciee#6Qw*$b`(_xztd_XNwc9;ba z+GO%5?2F0~pE@q zU+rvk79>(WQGYKI4hgrq4TY<6R2n~^rHg$Ct27~<8fWVh+g{ZHTZmNKPAO~0LBwC!A_7t$WtFk4AaiK|N+aikzF$~Wq;Ho-yIfkg zlGXGl4l+kNS1^adA3I;);W=|WSa3+U8}eAb8o@E_00C-5$Xf$b zvB|imqgZ-*qqeo5y~+t3-gvrv=;5i#3>zkHx>9$B7I zGcCe`U8e{YH^x3ZsZK#auv3*)^zvghp)a~8q@KzGD|xW(hwfin40)Kza07t1ifB)c`O$(@qhmr{}ZHN%3X>?M;GNz#rJ{KA9#9?t5w{U<5pE3 zY9)GW80YPkvgwBEqYhs!*#lyqf!Fu|I-IT?mFBAw@-9nF?15zm9Q@%ubtd{CMAnZT zYRZX9hP8t(HlGN}4wy?Bd_g^ddrUA?bD}Y^U>3Rf{64G>zXrFQe_s+;WRDo0ocE6$ z6at??k^h19&3Od0I(eNo+-81cSW*v)ETg*K>hb&Fb4u(;UAdG-hRH8P5S39_#D3pK54-H|7vbjA}`=;nI$=F%bR(kw8Udtx3zQHW*p z${WADi)+It61GiKJVcCyO-L!>eaE%o*r|Y;aWp6G8;;G%T_KfEcMu9zr%W}*+|tvf zgD6S1^R9&pN>{hM0MQ@(s}h8E(cC9Ctz+yUXrGtB<=55!U>1yie~z8l z*U;3wSt;{0aB{qQsseDhWac#FuT8}4$@+Cab{W~rKzKzr&DiCg#lHKzxC-+p0QIq? z9VZRKljY)7Ji(&N*X`PRIs`>_j=)x(iOWwdno>92Mv#&N!yJymR`f~?&5~7jYObiw zvsE4yf5@SlJ*gdf80s@tbv90V3_siYg(J6%w4uV&r_`s+pCV7!ueZ#3O47+%@@@os z?Y$wfSJ{F7dhYoqj*`R5&y#4Yk2rMs?In>ZgQkhmgJ``4Hnq~ka(mT|ukJ(X7_)~k z78b9O4D+5fs}92JM7?6S(yVOg|5gRBNpjEOLd{6Te(yDhyy-hrKHb!D)m+b(nd8k_CJ%-9|LDy4s9r@qX7C6PFXpJ3G*iL_6_E&iHL}k&e0uRl-jvdq?!wxmNqbf3gX2wsE~9 z?v+7Jk~g(E%f~3ZP&A^XBXX?1W>M+`mWb41oUvZ%eA4Hfbg6maL|eD^dz9)nVX7$Q=l?=Ohg&j6kJSf5Bt|C&*mfOI71^h-d_`JRRAb{iX-I1(uPm;SX? 
zEq0JkO5B+wXO7tu9~BQE zMePQL*pu!(&MDkSl#ad^KYBf_EhTSGC}Qvi%7vV(yhip&mbg07*V^cHi(AZQPvKe28n86bvS_fA~gCc@_YU0@0{ zKc6mS=ATLr64sz83pIir%ooo5THP-=7>_MtU^6Gu)&rwM=|~=J?BsHIcU3}&4Jz4t zjcN!HI)GT^f|gkr68Z~xl>}|_G~mn53mV)@%;#iO25nSGhwCRX>NZxx}@ znn$w5FdOhUb6KHEG|bM($T8Tlp*H2EC~7-9NNQWjFacteMEvqD+1A+73SsLnP0(NV zQfZjeN1#qHbj+m+Dtwqm|7S>Aq}udAbtLUq)Q(NmPb9r7j4qYbL&4&|)LlXBsOnb{ zHF1(f(lZlAG7euiE(?5YSui9y;H&~A;}$2#i?nmZReRi2#60`m^-NQu?H8W>FplP1 zJI?>wU1AJED_|8x#fo{dbm!TYC-Litf6noUZEFbR_R>50u3PHxj5Vl*f>wqN`>HGF z1lJ%Zfn+_`qjY7hzo&$9@?K;7AWvuD-bzPy*Pn`nlu6&P`hf=td1xikLiJ?okq)f!s7i~}b zCE9EuV;4Z*m`antd+r^g(;{kEpcokSGXN1W(}>PQG6Hr*Z7JITO= zbYe}1{)TPu;*qFoPOl5bUbN2yvnFdd$kIeR-9?qmY{So8p)odu6AIuF9$K zZtpV9O)=DX@<{4L%BT`tYKP&gu|pmmuX}>4MGYPlDT&`9==S@vZbV@`$M0Cfl*4*l z%>Lm?_jNpntmCTQQt86>K(zX0fX9QU<7x4!lWRj6MxJ z`uY8g!k9(ob|CJ%oBDz|opC9%ABGXty2b(kB8g&S@~%H+--BN~|H5}NE>(eUAJtAF zstpv1j`T&h`))rzy(A4?PCvnYuYZq4{q4@4AGvB5;^5$^PjOV)^cWy78_C65G& z95@#_4D-`(q4&r=e}GwMYW09W`ez{9pJs%qT%1(K#=RT+# zMlBf(gMm0U5r=xeu-^;ux|DjfioJ^LHs4Lzc=&mCr^oa!*SM;qxX zFV>2aDVF=ZzIgiltk@FJghYXYXL4u_NIbHYxH?tmT9&jMaLrM`wD~)adMO^+{gBg_ zJCP|GqT!uuXg_xW_p&mGqrI!_vf;SL^)EHS5Q%?n#dMGw5_vh3iJDA?O~Hdk6jIrA296&m}J&qmGj- zLmhoZyGLTTK-Ra@F}{!}pM#|JLWJukK0^1Mav~B+z#sYNw|PDMsv*;7%f{p%Q<8-C zkC;Gp^gjdG{xUnKZqiQWJhuH%M2~~%LDI6uP*+4ihE4We%Y$PNhBoU)mF2mQdTCBn zyQtEnkxIKAZR!qvL6&djw*241Zk5sMu?*N2g7<1m|47m~hS$)wPu3nZLAa%50mLO4 zOON{*w(!r=Pu2olPnvsH(ZP!V# zYHb;^NvBLOOl*FY`}{d_E(lD6dM|?Fct%5n%v6K!61{Pbd^#(vY>zqnAv#Y-If+?% zj7K-!rKU1nS1WRd1kt9& zLgl$5M}}r=a3`zu2hBW5BrTAMbyiVZYZM}1VrbdQY=NWV;Z)gsNLRFPxzp>H-!?qZBHnjF2W{0ogoF#* zMol0U?FEsTKQ#AFb%qR3HUIeBXB0Ap+?nCO!uRidF4rxf>+0u8%hd4z%g|kDF{;@B z?MUy@F=XY;MJOnEUFY5?*Vs#3!k)vP{;qEzF%10<5q8_m}8*IRlym&GsZ!2F78a|*#*Z|9W_XxWkvnfN1MR3 z%OSPf!q0#zvw>to7W!V1D1%jacAsTtGpg;cZzIZN^@621Ib{rd!i|I$LC4x1X^>W? 
z>lJ1)L48a8R?hGR4b3$z>3q*PX3l{Fx;~!gY}$mzs%IhhYC}_OUu}P8s*I0yOXVS^ zk~OmWHPHE*4n;<~TI3Jd#<+|^5<^2nuNz$=|&~g zBtQ@_HNT)e#{V>GX>o@uR(O<}DX$IcPf$%ZCEX-JTbn~X*T!)Q1W|b1Y(#|))WtR`FHIw%c z?wZudA-XDv4k}z@Ar5(bPh(DbQ+DY!rt}Z%HEnikhEFnj*>4oufRSW7wo22)og&=V zcYq-_;yY_Dr*bOmzqu#Szl9p5u+Q5wL9oaiW7%}3CZ8Yk8VsH@?O)5!@`!1*EwpY)zxD^Yi4`2xx6K&3NgTQjoSnFvXxIF8MpJPU9{2oi^~m6emeuz}uKoI$O38 z_j4Gm8rFTbe}UJ8WNYyjTTVmGZtbUHX%(jIkHpT7tfvZG+Zv=_plpMZEMG)&#wpLM zmh+RM^~1w-H@9Ce`#5vBndwMU0PXJ4O6L0P7n5uWDE${Nm%~xKip$|dS3PuyF_Vu= zPYcS|N=1dRTpu}@n5I6JIHg#XPW8HW$5WEYgIPYi-7X}=`bcXdTt{z3#@{a(YhDu+ z@0kDAhxVvYW9_VEXVD?q#!*VPJ8hx${f`hSxB1He)H3DF^#le)E{Z8i^p{HGcdl(v30+_1DY|i0T*(}^ z)7MI;a!Z%4-l2V#KsCgY#r({8+w?K(zvw;yMiTg9w6RF=gsDH1G34p!dC$`vWl1r3 z@SavDh9vFepTOLx+Yj{DdTA7Z5Jt$tBaH&CSVbD?v_<#NPHRKdi;DiWUqs8fKC`3` zYAy22GiOx55q<|+dm(B?no@D}tdX8xOzMrK%X(b;$7zbFN;t@Y#_~u3X{s8K# zcKgb22PKi%v$GaE?~5mvvGiU7Iw%#RQpPaCEu9<)nSP_qJoVj2sG>@N@_G9{Twaxg zmR;tr;=CWPcyuU%{B&>M0E+NC4tCj3c>37XB#MqRT=xW4(lLC?72q+5DAT5M>05t2 zvly50mC}1fSU%hDB~cza)cz3EHEBcIyT!NrI#Y==s+`KIr741vBth`6zgY8NjUutW zZ)!2a~oF3**A1`f)no(9)24~BwpJbJ3TqQ+S~Hjug3xp7{vy7)oFQMvQ}fCb@Zi1 zp@w1ykw8ed~nUSMwbeQfMf%vJJ7Hsj@H{umP zs68(e6MOhaoXmSr=G&Q16ImXeX{;$Dnf{}raMv@13^XHljr6^LS*`3#sj&avbi*J* zj2d)OfiT-)x4T@P(yeKNqs0Wj+woVE7j396n04WS35!-T8pYSDRN7Kf_xsgI?_iD< z`B{Bop; zvp%JTk8?g=i(M$j*(AGZGc?llMzblA7eQ(^A;d3NL1GcJ+X%`aV{J|pu~XY>J_NEj zli0N}--0TzuC&E>(G9#7+`pPiQP#V0jb=I3zj{2<`p$Rf8L%Etz`gN(m+D^-h_y_#qs(1%`hOVKjK@WY-{~>(uk$F+Vra$~D^K4AG1b zan%lViZ(ehg4~0L-w)KLo_*uJ$K#lI&JdGDq$;)ur3<{?cP48czE`0a>`Ff*?Os!) 
z*8Q*C#OII6%q8O2BnJS|kZkjIr{3f!X0Anw`cHx2=|jD8L0pI1Q_5IT;i}ukj;n9o zvCLK#9dFh_@6d+=-u6WjB#>02Fd$_1UcETOrL*{V(YX*9o$lS)X?kmyB(;=qt(0q1 z*gjZWoQ68ld>N4NI-a+4IMuZWZ_Z{bDX#;|KETnet zJp^l3SDu?+RLw!DCl;*@yL?JNGil$e@1B9%ubBw%vvj2O03$ykBbuk~A05~*zwwEE z{KLg_ehDCh3?Q>L#Km#0a3m)tr??(`>CDGXYX@z z8@rL=15N(d`c~}lbEhUM?2D2}fsBvC8GuAG%JRoMh3;?_{2=(~nQ+H4uhOm8JgJDb z;GQCCF|>p7Eum4kUOPfR#6Jo@CXM+lvNY?dBKe57k!Y3o(&Jhvdvu&5^t^;bL)3XT zoh~PO2LMd%88Ye03Cb_Ma=zY|zPMxPpg*ep+@)jn84SDbPJY2wS9M4WfXnM0soV`y zDqZ=LLhb7T4%=5LEN)3NkjkPjY1zxKG#6(jHkQ(H=4{(g~-%~u*poEv(>&^xtPd-1$|ABjVaYy%^oxh+Gc^r z34N0oCWmv01Rq25^+|iW7S3HNlw5501)XjJt-20A zU7UseE$(g_cyPkhNMtee9G%H6U4LotU+nGq0pdAvKCC`}q*KHsCxL96_qXNtXPm9s zOiW-rEm$?lJbTFeYO@UIG0WEo++Vs`Vc$inBi#CZw6;uNA_J(7DspriT`|zKnYoW= znfy!%Dl;ASRI4t@R!k-1;cO^OaV|neM1Ob+;A);GGfa$_;z$+MIRIiXyir`yK{{{Y z(ybx=F94(DAY{6J8VGB@TfKU6;Bvo4?jH>^309g%hglw}RHO`P9Q}M*X>$!ScuwIi z+FsmBRjjp5@U-+2`ntc~MQIajp5z$aB@=oLRC4xV;11GERxN|hjg5?kXj{AAdDI*| z<^3tnxD#VBW!0`z2@`4kQeF`zqRfvkpoY*$Q0j~xj+>RNTGWyPXFxs0ME&v zij;sav?wXhN9pdr8oyHe%i+ccqblN-Cw!ZUP|BAEs^HPYheBs$DU9n zrcD1)$o&2FXeFp0QuGrbuH%@lAkqGNK;)Fzp(4-8#K_bS|8Y|&^Olssxio9`_bNf% zb0q_p88UX#{A0CyK_)+|JLT7gN#f1~n7k+Fn>9*Dx*cLh^TxA{6efidy>#E|v-=VS$^ik1Z zaqjr9Kj?5CsVDgjc4P2P1Yo+grC%A_`4SpXcEg&z^U$tG!XFwrin0t?>eDD=<89io zYCfjF-f{ir&6}YEgOH(oJR>@#0T*?2ZQ8`dgcVSdF9q{N93?^ij}@z61Rrf3i=Q)Q z6@A2+?GKGoxB1;UWW|?5K9yQ#&4%-=1~SReLvIJg2#uO4v8lhPMO_0uQ;pab>Sq0nA38pn`%0URQ zU{J6U)QIuNW|*0nVjE?>evb%KDE@OhNHBsp_&QN%p)Ly-a(jz(t1f&d%g%N5=+PdJ z1k_*;<_aJ*Q%^UP*}u~aI%6THGj21zdgX|O)sC5u9ihar{;QUvc$g9`SQl|kqv_1m ztdgfY{lH)Q)$7;SWq0ImgerqTMELrRqZv=(a|>J4d{JX(c(zkSj^vxgh!FVRsU{YXSbbAh{ z0=j^NBxUN5N%5iZ+*rYoRJLfn`TO?yS5UwHNkv&%xz2Xg{8uO?c_x6+Nj8%M=*H92 z<3qxK6}A`z96F6?+hgorUS7gI^WW`Z_~$}fDFElpIJ6AWIXU}KlrbT|!x!iIb1eVU zP|hoAYO;_EhF=FntqCzbVGbNS6vJ}t(8gRxpDn4Y8*V@EhS%rg_}4!1`~7(w`OK03 zyGQ!{WlHaXKQ7_#FK<2e$FubNt3E6L6|?^SqFD5QoHYM@J&^PNUY7EI7Xho`|1QG+ zU4*|kh5uU#|9@Nw)SEi=9v~dbF&S1Ze+5dOQ%p?bXXmHa|GhNO3dG*NbLRy{WT|N( 
zz{6P(8Jr=COJkELTX~t?Y*Fd8*w26K{KvR%Hi!2YiMc^1JMF})l-~b-O?;#Je^f{F z|8k=u-OC+|la`a5e{_^pmYb8C zJE1qr00_wt#NLmlNc@KZ9Bn;C5FxLnqob>Vk+bP+j9)Y`-Wu3wn!x)RKn|4BoAqyT z^UphJIN->PKep`)llm-?{p5fiXO-R2i>bzuP(#P z$1V=h2^kyA4Y&#Sb*uvzBm;~pNCTnIeJC6AoUMQzmC-ETzzDwAdn-vVAvp?18vbm) ze!Yc+0o1kLP~9|@YAPtKzg0SE3?)R`p7j_9qm5s82z9P%@5Huk+lIhfzCaz%j`0U_ z+OXooFLXW50+`r!=zN4`r@KX^Y%`{bJ`7oZWPs=)x2e|7hYQl?ulnzgaXPfA8y8=@ ze!a29)dj`XM+|Usy&V+P;P6;^Nj;HF0;pBR&pXtk5Z|)PvMpyH0GiXl%7vE-$a>im z&dv%rtds>2rZzk;_uJc%quThp6PW)&v&BSBhKs53If>B}I=#ATGI9jL#qFB80ia5x z+VH7x#2A`=e>l%}uzF?!(Ea2xOD4!60>bwmCLf;Y=HurN=$z@d7chEtd%W+Q`})iW zfN0m5#SbP{?rBYGX!xInci|SZMCGT(!HLukw1?j{a~EfVDJcY5^Fp{gbQPT4CV1H$ zJN)s>3Uz-_o9n@R1hY2OpP73^=imPP$CAm%k&hAC-rlm-LkPhyIq;@(vu3wi=D!eM z%*uq6J{|MCo`*FkmqCk&lX*TJ5GJq1ls{gGi@S#5v|%BBR$%#pz;boEilEL84d-tS z=7Oc>I^q4u%Ek&hjEwwWDp4W`2CBIeUe}4AW));2yg%gnWbK)?{SJwke}|MrRE9#z z+S&!GeIGDyr3>M_e%G#D^EChmu@WGv>(Bo7sbW)DHDuV49c31Zm$ibKrW}@p8&*XS zO6(y_Qx4bxv$iR$IW{0vmFnjpdnHTF3h^hB=nGZj0W0ycp1rf5SjZeeV|;`g;W}JS z#-RfsiLPNG$1ceH{kG;msbHm1S2zrzY^8cWF2rf9fMhjiTgHw zVLHn+xqp7cSY6qxGAiZhOxG8qz1`JN42!||ROg&-a;_fI{nuibD4QgFCt#t+4M^NNWOe%fsHVFnem4n#q0Hn1SuFcyq}X*8U2R`6dNB^8e}XlX3S`F&02KZ#?(4_WZ* zj=x^LP-PAJ#(3XMsOKB@9pT=FjuazrOMMY`Hon>Htop-4;QnnIs*^oEJ>%l*daG7b zMgbZ$I4qxkz26^a@8X(0pd{fR)P)rC@$vb=6=Iw;H(oC4kKX#TVKT^Kp{y3uMBfnv z;3=CFf#=z7kc>y)3azEoj#!B%1NLRjQiuU=h9{|^|8{GgJ_6O`4;C%KoAmU3M$ zknyCw&LkkJ3@zcVhQ-U)24-ab`0?W-;%VQgy8~NK5bQ*69-Nj{B{Fs7)TvXSUSVgr zod(xt+geO{c>BK_zk0-9C7z~}*m+q2>#181aw-ksgN-4DWO~x>2E=HugF(Bp$0=Sv zK@O=oD^_inQ;%^ZdV0hyy;ASQl)f!@vzks{$K;5UWs}}3wp_@Ym#F@}UL})gC6!NF zSy`144{mRLksA~!>&#pyfBL{f{K=*e+lnNfys}9+vN9xxiGo(6HMl`Aq) zRGdjdMu`A&*@K6F!!*C&Xv1FG)X3m;uVY3_-H)~8San4wCGi?yUH50ETV^>T=Jqq`#+CVlKevbr~BDFgseV3PZ~jHOv9@@EsIQ*GIZ|mPNQhh zh=`!}K?Zx_qwBe}7pv>(KmX5b=vOYmt+@wb;~QvyGc(cnldZY1xzT)+5nr-7=PpkBDT^bz`v*+TDmKHqB}y zoH+1(f6S*Z1DM4|QawbopzOCowfac!3F98(s+NI#SO zXf!50Frnq9g&ueBzryH$?ybiwEcFeG^+&$eZ0UfRD+e#0;s5#}?k^_9CjTTLU(UCL 
z#AsZT`d~BEa<(fyAgfBW2~_)*ne-M{C_}^@xq2GCnaxv}8g?&DRpWoRbLAcG%gZRD z3fkJCR`SF>G5%8CO{s4%ksU5L!wpcMT7meM0t|O2B7!e99uJ(N7M@}Pmagl?VBsXY zPQXMpSbA}M&7(#@<}$N({9d2`d{Qzgf#<2_<~G^W$cMPz;63|Q8l$*QgCYoGR!h12yS8=UsBpmUej3a9&f786LOj1l2B9X~%+Jr? z+l1ZID0wf2TbZ83?pCI$p`r2MgN|*zJ^0u32A*El|9%G^-^uTU05LniGU67(R>MRP zVbcY7PyvOB?6>a^rLRbci}MD<^5{~sQj~f#4t<^>Y^l#NyyT9pwv3ESgAkSyuYRoiF!^%iRxbznUhb~3 zv1#9yL#rF*MYgmVSOvT|6f`i=zM)&vZEty1M4Oh~%qM&6S<&L`@7?(J(DlN5FIawR ztH`}!y`@{ytaRW@lC!9LLgm2~yHzG&LGlxWD>;hhpi#?`JuGD8|T%z0?os~nw9AiN`q7&_* zb5#R;QxkaUZplBdRZZ|D*h;Ia}YNhlbX8o%+#IWVd%U4 zkxgQ}!McQI@pA~t(pp;3|DFbc@^-up8AbXGG0;784Qo(+htJtZyKbfSORmW{8L~ zvyn{eIj4drPXuR1av!ol;2`3*_F52M4)00zW`@I0l{TGL`HOq_|LYG?u&}i;+2s?% z&S1t)TC;J8iit&oFUrMk1R#%$H&?j@rFB5W>wse*1}RQ+Du_i?{Mp;c-(}4YciKJ( z8Dp|GKxR(5?#EeKSX5ot zAd9*}&{Q_dH9qOwf4>Zc``62$kvD%Ti%saYdh7XT%Xc>r({H8`l8pX7{NP(Lbag|# zWky2=rgy)};zt3J43_l2U$0>eoozcM)fm45GYTnP3xsPMt&39b?C( zw{5346Q)s54I-3KnI`U3SqmP)@U&4xxYgyPe0$ptUMOkA+|^xlZcnTEDw#3e|E^h* zLP(%EdT%>lc2N;h;R8wwAnCUzCMG)BiztCP$`n;@_D@qW5X00IyU%fQGMYRO9rax}%2$W=$;(@|WYEsf9s-4uFh1!3`<$!WhmEu2_XD{*rZe@qLtcav5< z$sXmbYQsG3_xGJ1|NAX zYXD`$_h!YF|LD!5QE%nfh8K(#GSKU&RO(yyL*n#?#Um4YfKd`j%pwhOnh5gn;+8{+ zVDzgMYCP+2k=49?sHog`-L14B++47EiFf{MQN^(%D3lHT@98#RI;ANTN&;XH-D`*y z*sKfly0f$--F?{-_hYX29x4SxY`z}LSXM9Tq0(X=ZtmI<(vAa=DArtR#LdT7sSb5+ za>kWCq!uO)_4)>6F$~A~>8G00r~py2@LP}yU@j&$b}M!1(k*uZ=Y`ouT)%ehe_Fed za@-Pw>PT?`Sn=u`PR^zD;3Q-sTq^pLqf?`OF%a62YW6I?>EjG7Wvd08fN8J|lGa8p zWDg&KUs1qmT3mtZP4*)+=m9wH#%luHK2?4MaO){_n_A?A_BMSHN4+XV_2FQ`03wQc z%v>Sftk@@IWUVG;%CFQJmp+oQ9p_O*S`c;N6!6##|9F4%sT;ljg#^Wk;#cc}NX7>s z@3fi)Zf^kOqq}D2UIJHbka4F_Mr62`U{FKR+}Id~>TK7AY{UWm>bs|Eow8nfGn0+Q z{?pUbny-yd_1)2HTwc-=4WJ5r!HTUTg_wbKz|0l1!d_~2g81FlK<*rxW^Zj4fzwCI zTF^PH1rv=PdJjT(noJXucQth4rdB+_+*t1I^5LOr;dui8xu=xmJ;JSEeKYuMFcD)W zpF=;p9(EHgQcaV^u&c@N*71+_Sv)s0&@*zu2#&yYW@K{?v+oc-WA>kKS;74v7M1{J`l+z>LSuSxdhl7=Gx`H{ z8hR9p83dgz&tZpVgwy#acL2Xe`c^1#-|p|dJ}E<4ofHLlK??CFyBn~2KSH?hdcuuV 
zJsQ1I8?25zu#TmDL_`C=BLr0mI~KN-AjJ(EQ*j&rANJlnuI9XdA3v5D8H@;_GNUXZ zBc!rqrzmODrjWIT387Fi4ALeN3Zb-}(?%g>8xvLe7YFmEe zCw#r}UiJ29pa_!*KUiR`dkOg0esEjNmwwFHw6wI#*fF&Nx!SJ$J(>h=*tK1=b8!pK zrJB~CUY(M}%Am*{o4S2V&gV~vx<1gtjk8ZjTjv5Tble{{=r>F=i(qHaEIFH|1468V zXWj408e*u72NJXms#vIMzZLo19%bbQ|D>d(sAwYnI7cCg9{0y(<+jG3s!s!KLUgWb zX-WqW=z^;;7n>uhFOC&>B~R@vIeSpu8;x4rhdLNhRttw$D+4N#**AFO9nYOn&5=MO zZv!enc6jPZ+&~+=U=)UjT<(vI43&er#`@eBVs1STXP1HS;tLz*c?QqWppQe8_o)F! zaLwwY5>RsmM;z+;k!$jAhwB|q$~h$aTK{#c1T8triwvOfTDCn`Bx6*Ooa5e6V6g-+ z$S{u&hffPX6aX*+#v;sdKkESokXe9ihz|5NNq!2`c5L(QbetG~7Hw7#!tQCE2jQw} zoS6$~Evtn%siWD^L5p;tdK<*KrI36Ak;0XNO_g7i_*ryF=Md)!5^H+pJi>tU72py|$cy)+v+=<| z;Yi{s*<$?~`0u^bue5^ekqy>xxmWTg-6}cObwQbAaqB8Wy9sK$|F5-g3f2ald zSZp~qcPwixp)a$V*6WK;N&t-J=&UchbI1jeI&$phT6`Cx_bUz?XddXHSujNL@h2FJ zobMEAi>;H}&;i*4zzB*<2aB4I-{N)gWY_8=J-zk!Ia~FiWG04Y)zK?V3LXqfGDr zcqig)F^Oh?)aHP@_7g7XQNzDHaFAD^)eDL{E?cXgzW(1&L2Q5ljAi*-LWd`7Itmiv z>X&zF);>9Zq?(#Hp4(KSGFJj*pQJ(>P>O%X=kepmKBdMTETIuIRL%>M)tZduXR%E? z{{6YCFJY>43{rnJe`mP2e7z?zfCzC

5D-N|e5{1NCQCkJL#}TCmp$W3%fAX_;}T z#(4R{UQ^ypp8#pu@Q+KeZfYD|kH#SK9NeemJ|gA$mZzaP)UIbKH^19AV)SUAqCnZ} zsh=J%aCG))^*M^+m_>3_?39$d!$NW`bp--RjPfb_h@6|ol`Eipa~QDO6@fwOgvUN6 zDLDE@EcJ|Z&kX%-8fbL4VZJM;+f=paN&L`Ym^JqIpH{!t33R;0gFHT+Q_et6kwcyT zpeikT>J>Qa?6Ldzg%fr)@1Obz1&&UY3@+Qz>_uO&i8CDeVXziRqV~v=Qpn%d);8z! zW+W1S34?~&JPzxM8tnClIEf36D_$e(8fx;!&b`8|r+JzzN zOH*&ZX#RXj3+7rIAGeCMHP&u!Up%Lp8yOi5PZtP;_s~Bl%&lA@^bFlO%g-sLaP|9r zx{W`2pjphqkx;HG-AsJ_XXU_s}ewYd=p!2Ug{ksND2&-un6fepv#+ zbU*Ymg(l_;1pUeiV7#bED=Qn+R^#C9NGT#h`dqtUw+}=55uc|)b+SF*t9^fj3K%80~2`7 z{wGE7+y6Xd@jp13-~LO%{3HK$hyBlgDUehB|J9E*3wX8hzaReV-x5Ao<5~(uxqE_po0k)I%A)>Ghu#tQs#AOHUt6H!ZP>K$q*wAYFeg?Z zI#iv*xpBdrKmTijpt@5V#-x#5_^tTtqxr7=1;Ss+3?&RC9VC1!{8CkyVXbw3+c9N; z5BmX@Di%101&q~sx|O%E&SzkUE3U7fg}~x`)KC3(EFH?_lD6l z+kcwy+mrJOE7*}FIQ8@S4%MCexOqU%+FA$?De*ti*K>-IL{|sY+{bex}JdVXIjIXtM7qN&xOek z8tc#d`xt8dxCNp@i_;*Dtr7(vIie!&-rge#D{eq`DMjbKA0ER=MVrEBCN_V0=y~2B z9*LTyd4RXEMA!?*EhxDd?aXcW4)gC`YIC2a?@V5i;msjo38K6Ze2iW=-fLKJX0eK( zq|J3_FF3J&}-(q(mzuj2Sz!J6K@`{KIw|0eCE{skv6jTf&~j za95KXM7hMFp7Gh|A+vCSkcB76-1-F?t0x2iu&ya*KazD84+j@SX;m^=qp#m-gVHfk zohbT~_$wN0FMnzhG!9SP{_RVB`*V6aeNVEW<_u0O#X(N|FDliRB{0D8O@98Z);bI1MRFOGqT3+w9J8>7R=RpucrbPrgfbANmqMI$h57 z8*Eusmn&KDy3_QfZDc?ACEeMnL3=X6z6n7rW(ycWIsjIPVYr~fh)ZeCTVj`iwx12~ z{`&L4yir6wZkPacPKvi3`}s$37W&|ywMG5s>04Qkd!PgWJuc6qgT_B4IDbFB+$#t* z8Jzl|>$|1{QU0bBKE)C;60rbNlrjygfI_=0n+gQ6>fAMoSh&v5^?8>BrF-qMF@el#pRHw=>Lvbt3aJgF- zC~FTnyV-K-rJF@D%MXvxsZn-){x2B$h81?a=;0)th3ZS&ZWAgo7sQDSyp~Hv^}JW< z?ls=7a}Y?L%veF;{ku_NInsz5wNR}w#_7t=f1Oq@D-BX;#;;zOSr2^3c9f(a`UY4D z%E1QK&bbLQcm1w&Mcpx)iNE|R>7KM)d@eivMB7*&W(bW<3l6@wPYvW-`9(!Y5}Psw zLh!GUqb<2IOFS2?3a2WWl#&4}jFMo8@eixoZzns-1U?OO#O}yQj?S+^Gg&PR3J%Uf zFS3IApX3?1e4AkfT>ts?AiyWv@zK-?#ip+BzAU+&XyU3SuMuQ`3Lig>#kS*4u(E2V z31Yc$8U6ijq}t){WGx)qyUn>$4vIv>s~REv?%gRg+a^OT$j=(~$)xp?tpHlZIH;_* zAu*N03Xn)|-n2DP=T}`JAyaxq0A6TUDTHa3=U2b__ErD$XAqBZ`tx(9-t&+U+oqxECm~T>hw?71_*y3t zS?j|v)h0_L%Qb&#f11>4t;Z*H!H^~sp!-0C`GptfdBwDML66rD_Apa!n2dF(gc5TG 
z?hevec_*L_L2+c~?opT8d+O)I!^3UgdVtN4JBc6r8OX+<)C9ZIIeP#MR9#o>>|hEA z_dgHe@4tlQnD#3%%~w-XJ3EcUK=LVr{OCSlt8VG z8dM;NGJ)kxwgpZ1NZ3B``GO|Skr5kP0Aa=3H$ZGR{BfV@n(h70JVZ)i4X#68Q4zjl zqWW44c}du_S-$^KZ}cKYz*v$c8y! z*2vJFT!V@;1I}H8X_@q;2nKhv*~44|dGpizS-qxyGI|BnGBHI9gZxsY4PS&HVJR9c zzXJ5q=K$v%sOj?V^oJ#1+Vrc_Z}IBkA#cd6=gekC4-CW?EzG4N8 zb=ZVhW_^#r13MMRShQ)kC+6FjR9$ANthieyU_v|JI6Z+d7u*=WqTQkJyQ=#0d^Pc( z6_}QR+k47oA(B)nuH(Vv=O`}v0U}!)QOXp`iS18QJM8$U-u3TCX|%xx5a%Co)E3%v zyvI3332Z${VxD-S7V-H2tZaT(zgAZ=0@Fa9&%xIlS2+P`PgIwtFPN)4UVMunXn+UN zPT@X1{SSC$l7M~|6#%ZQMZyp^hSWHeF1cQB#^P`sXfem&`@_hO#M_@^(d$jGH@Lvw z{PwhlSACE><2N^T>NVwO!)ag2EP_9ePu2}Oj&`gOl9P0UnQO|5kyLQk2WR#v4aAc>xV6}Axa2Zrojox?`qbPhH>J_46@rJS7Uqq@ z#(x_oHfJ%eJn-r{Zm1#=1|kOHIfU^0%FC}(yg;Tkz8vSniPyYpKm%@lj>BxL!C3ts zrsP|2Dy*!};TpQ2OQ3y}JWab?%`g?^D~;W31469Mw(rAIJVXYHtPYkgJ$bl9M$X^IrR%zd*f`#e^ZP30zo$em~&nAt}fv zx7|C5b=%yLuv1@4r;gLpUGMz3?>5-igGpd6K_2k+M47VK+}ym}4W-KCzYI}zZ$^S2 zE*?%T2`DM*ep5c&5pKW4h6oSj$xVAf@;@AuzI~b8UBKS5P%mnT8!)YimB7otRQq@( zTzeNO>&a*-ZhkbQ${rL84p%aOR`lbj6gJzp;kjpzP*(LzB%t!SxRvMr<|fZ$dy5XG zYN-6~AV-Xp?$vvoFMeA;Brof&NA1-*keQcO6@b8y3G>HP!~YAY6V)ktNLam`_eD(x z%vW%qt2#GrEe=p9QgEWC7N(67Cuh6b8?_%=-Y)v~b^iNjfS~P#76|^Qn8g$N-$VG{ zL-=}8{&y1oA2|t>Uw(Zh|NXxf%FNpoa&VBZOj8U7;>2z5OXvzwPXLx~1Ly(a(WFMU z$|q8{Ut+h{MKsoNc3lM8?I4SCbP&z}^;3?OH~^(giXK3m=h*!dxduIVIbdCtQ^L(I ze}EGA=?@T23p^TLKiOpjKaZMb5UQ7^gOvInC3+T90Ie@X*+%e-RAGLNykXB${ru2Dj?PgfaQ^C z@T*tEvTB|83?_>Z-vf4}_PKA<{&0IyDS$jl$C4Z~-Ku`aKx)NQehwWHiazpGQk?nsZ% z7fxqDKcj@Vawwb?0jVbJ>4vVu)Wr8iW5uL#SPpv`u5@S7v>&JyOwUtoIN7?A5r{iQ zKZ=DRr`R2M6}h48a9se%Q}8Bd{W1S~vn#85S2=K}!621qwLt*bwWzDW@gJKNz8&7O zM`-o>;LEP(5Hn|Y7_CPJV45I`8xN#L3$Stt?L6yOVA?J%zEJNP<%*4%_X)Wtv&8qo0+o|?Jaf@re>v>jhSc9ahcCi0E8 zL5pP))mXebWDHUzdhF-rGa(A>`X0lI+n6@xuk{7Jr!FtvL!KQH3c}BP2+nT5r%p1aAu7;SU;XT{DFarTMJ)?4)H z-f#4NPEkm&UW=n#mW_6*6n))-eNLiisLqCB33@^SZr0cG{k-jVxYY6uCZhk%qVA#R zbrSBRLl!QFcwD2uV1cv?K`}cA%q8*VMS{R~n(K{!@?GjgMQgNt)NScQ!lDx(2FomG 
zTy_*}f99fmCqLq`Kb9CyRCJgK@K8TL1WG8jEH0oh&sB$XQ$_ASO%B)pWR%-e`5~o5jkv^CA$BqIC`x z(2`@O0eO%Q_P+FN7ag=Gg=!dHjfu( z24$sk5xVvgSgiR&pm^?aA~*L^R2wPj6fmewnJozWjml7|3@nqgcOVt_3T-1hCQH4{ zUnTsmrr1D^RHh_iW+V+&^|M|BsH<|}bTRiRs6eEkm2i3iT7}Ra-#-U!n<&{<5T(>6 zwg`+mWn^TKI=))iUtT_g$W8&C)+YFJe)&@MLljUqUCCwYyfEm9NzY-J!HNzTErH<^ z;vwS6RFRGsAqHNhI36;A&D;75Y7Vw-b)t5WeQqh?K$4F!IVB6MLxu%hyMw|~ei?^& z7XW)1;J+pLQOfrWfqpTNAH90$qU6XG5|87&XGIeT({0hQ={zYqAVRMc1_cGFrf#Kv68UM2J^5*7C&tQqB(J<1 z=u-oLAd}Ef%}tD<24nQGH8Woq9*X?B94Eamu8sv50C0ITG%Eec?c*SnvbrYMB#?7I zDhC+V#r~Fb(ke|$pIa*YTL>f`M6?nWigA^26=|xwJ z<3~NITmaK3@y}XOW7|6*Y|$$op1xjGbY!H7-1Da}wiJD+HNCNjJtLruKj7oAd6}6R zcd}VZ2%zZv5PfZa%}1z@v4Tl$;aWtfdjp1DoCa~1FW}aRuyR%6goKEdqz*y-(0PkIIpU?IdSS z_vev5NtlE{zV`786i?z2d`iqSx6U9o4lHqz%5q5Ls{~Q_*Oo}WL=TCfl#mqL+t7#~ z>G@t?CbQhw!P9uV$HS?$^tXzsIYsxo_as8XmhA_?oG#>tMNXZ$w4LH@VgmdYW>ZUC zk1VW2BJ@w&pCOuI6||jUg*&Ky7sMvJ=5feVuToRb&8iegEWKnL58{7)OX61LWyV33 zEti##M%4B*>WqXrV{u~d(H2JDGmPvHsAdEz&FOPBF>_|_?52$MXaBhEH$PEzi_?hT zM*P&FyFtKD>0?H>NES9rf;Gn9dy{it{g)T{u5r(`A0Ds8ZyN7fZciZoJrYEvXr9LB zMIgFr(a0Ef2%^pJtlYqzk_U<_1*wOZvItj)DKC@vGt4~~V{R4d6SBugN;Ajov$9N6 z7c9K@c>5WA{G^O;)R-5weoo%hHmH2TG3=b2>bMhc;632j%*+ZPyAQ11;>G9Az7;Qq z1$Y3H54dts|seD&G>G~Ig9Q-5PAsNvz78yTwiWo}(OTcEmt5=to zT6zlfgZD35nkPk;&U9^OR%rGMP6nySNlF;b=1U3QpotWY5W9n<_Z~9|^O1E2tAqE% zF9pVlm}>m)k%*%H%2HdG40)nkcY?8!bfgm4w`#r&+_!syEzOZn*({Cq8qGM0A2p`+ zHFlYCwe?rOG3-2?UR@AGGr0Ni{V@cdEE45Wz-%TDh&)3^vRqR_Ow^JJab9!*5ntM! 
zbHMvcpIw;q#R zo4m0zZ80gdMPct^Nvpt|+8*?^gOpUDJTCz`*HBc0(5uK$3c})u*RC{WAlj@pMK@|) zJ8@FJ>D@kEA2JQ)Z4W;Hx>oA%*=Vn}at&gQFP>b)t$l%l&=V`s?paEfrfxm}h_|29 zG}C9Q!8U=i3E3*$e05L;mw+bYqV`VhBpS!a6w+deoYtXvR3^PzitvH~LMpI$(@v5f zfr6Hoy;;!T1M{N3tO!HhDvqyHo^{vx8Lo?owcM0i%uOT2tMC*Q8dNTg?H0>jb81tv z9OJGGqmZcPn94HFE*DIFlOMFFMAiC3o&K8;mG30fiook&v~!e_(<|?o?X6R=AR%!< z$329mqn2L%+(q#^e9C#x)Sgv@{o2zT`>ryu6l@UX0mdDsDW06@-Hv7U|Jc#+E!}Tih781>WtNYO?({3p&PApw1&cYR7XjIvGw`{cO9BRBG&hw?{G@R;5CD zB`iUHTL&h?5n1Kf{Wuz#7;p4JsG`vRiF;l9%`l4yQxE=$()SLEG0ahqD>J2EyWO+%1 zQJThG`)jz-W=dThWZRbV0rtFy9OJ>gNCAV12Vd$9A(Ur$tizCGFgPQgbjIOGJ=FI) zkQ*UOzY1ahD+pjwKFKOaM6j;VetikEx+NnQiq2j`$gf-(N*gQ3W4VAiG}nfLI-~&` zm6JX-Z+E`kPTjzry*v%z<%Jwh3>((YoZBs6&umnxBS2VD0MfIO*?O!b4YS~FAmjn;qxA`KzbS)?`1^8Bv zfkWL!uSxnwp1N{m&qKN@zelF1+-wuo^UC%KZK#!&zdBMz4H3nXG^wb$yb74h3&-qFfRj z*nmV>p48si2Y;1{RW>6ozIxckoMn0r6r5VjTe(NsAgJ|~kOD>86%I(uGp4)uAuk$- zQK_ffQ20=7^K+my5pkyv2u73iw(#VILLrtBtvQ&C%B*bjUl(L{2OT|nbbAxjLs>|e zF-X{%MX>J3PH)D8!<&j12zjP3 zc(X?#=%%17+up3|XG$L(CSyWn60D2fgeSNJO&`xp20B``3B^#v_a?B2AhhOJzYR#1 zVt2ESSE9a9L)Kbg}?3e&w&33`_Q~xH$kJos-dZ^6iAf z&&+pv7WhLl9(v?%5k{;RO#d`XQaz)ONv`jSDCGFXhgWvr?Fh)zcKalhAn~QMer!wS z@$#Jyh6E*Vi=62sLggO)r!s&PT(BJ*{Q@KT_YD9no+&X^ic_J&d4lI5y`RJt2>rr` zCUIOe)LOYJg-V;g^t|DKM`fF}Z)Q8eWKc9Fe3^)zil|PUY<{^Z^Bx7ks;M3r?^EeD z7iz&IrAwe6uN1Xly63y&*-@1v(J|}OYs8mdr##)L5owT7_r8=9QYmJP&2F*Vbwa*} zLT2I|v$QtGBt{R=TPkuwIvCBEr9TNuQy{x9_q#;Bk_ZdhwegY6|8g_7pcYDMZj5fv zRLd&v_jpeDNhffM3(PVZ_NC{*QEGCEw=*AFezV^F7JvjHUf{-N0K-*4&3Dh;MsAPk zDOOA=&>Rc1fJVjK+7$i^Uldt$Q&32e@p^b1yy6B>`NT1+EtV$a7LXp|6o>B>oM;Xx zBEJ|3{Eno~=(`M=sy0PWNEq3loS&j2cXZ~!9R2EZYM~8TUh`dU49h$PmCt>3hl6(!BWn)$poQX&=k&NS* zo;FhQrA$0jOg-yNe80|#j3)Cy{vdCfQ%j8fMd z@C1yVA41MSMZ8j!FS4FgIdvcCmgt5Hg`E|@DFd$Lf+FmeZ9FR zo@$j)ADM}}+PMbdvwCW)P*y6;!XzU6)^S35gdHq^KHSlZFY}m+CU*tKGv!KG zwL34X0l9|SN1qyE>QbvEoUu{Q?SbTwd1_u?Z`7xqx=HOa%=FJ87+d=5?cN|ADS+`s z)gg1qg^8yzTA?Gp3y#{V9tG=AcMw^KF^(tcoe;-Hf9uP?EN7=ia}U#B9)giRG#Gui 
z)Us2}>lNBYgPNyC7%L=37hw{WBu4xBS*><{M-lZ_IgEqe+!I5J<7==zo=09}|ZaNX}!)7cY92<%o z>Vge38OTspZ(O;`ou(3|bHD*InPE-B{-l?6T&qoYRrn?PAWV&c(Aci3*v_-bl?j%H zv01(64el)^(j&J28h`rPXZ%1NlD7*?Rd+2boPvb>o{p35jb+V~iQGXRT=1uaHmaIk zDMR`MEs$g!r>|;#-=JwO{I-X zu*WGoxw90|%u!d_v>m~SZXEA{PHg%$?r`UwTwfRMO!E^Ceq66^#XpI%&+qNL_7H4ESDLoS?j3TWGikt3rY?Kr$?>n2Vyvyje>E7uRhblrz$ zw*=jCo01`$%eZu{M#QN$$Cx#joAJR9%Z^&GJ4r3I_&}LU=XSA$PCQd4%O?Uqe%8e< z5-NvjDB5$6j-}ZGF$HbK@!cx*cOMOW+K8iQ`ZAR{No{`g4~*MuPP&@&N-Rc;S&*x` zYXuuD`gcwl)-9|4o?WvXfq2D`&Of_?JP)1UsZ=|f=Mvw7Wt~(mik^9++Osj%QRW+; zm+4e3JUwlyW`BBNpg9<72#noyzqKZ1Q*Z#~s8~Kc2+fjazqaGSAzcH#B0$O1RtF)0 z-Lu2(6FouP<47K;)s~zrb(C!wPHLsXF#^LLXHIDx0?DhxlurpjiK~+#eGoX}LDQ94=afwgm56j;!1VH<2g@u<4{mkJ&BMzXi6&ljE*b zoX|oH8LwnwGunK=@2thHZTWAS@_LU<<+`}SuyG!kow3eJ$VUDwF1h5RQnk^+fORL^ z0f-om$!5f?CzgTJH8$CK{a;HCkTgM@R(9pRNFcIA9y+0Z+euSI`Y~F6>=++rUW<3@#z_!z46=$W(gSdT#%vprOaE&Q7zqK(ND46~ zr~}H{pO=?gj@^1a*T*FM^MRJymS#JK2m)pWp?GN?jF~}$j+k08@tlU6OfkYd>I)$%dudQp3Yx|)uz@`MHS^iP~7*;TTc5;>SD+er!xjk#N1xz zqdz6EZ|vAe+@DZmzQ>@$DxqrKW-fSh*rpJa5GhZ)v1*Yg6%+Ah3vLx_=Q zQ4v+Q%|9u4l25ixu7uL5pgG|qQ_q2X_*(AjWU4o_3}} z>bU}i-W>~1AbaP~1uQmk`u^?JvR$VBB@WeX^z0!b_QQz%!9|+GL7Z9S%y;IxXw`*_ zJQgWvzT;9OXUc=Ff=V3q)I_yL*`$%#R{wljU(cF@4^I6mp=yUs35g@(-u!D}m-L^w zk+~Mzcceoz8lS-e)Q6HLRBj>prx=(si-;}e7Y8h88Tm$_^)5F0pA7o#$$|k zSs=5lX?(Rt_g+f-e>r5X1Zc zr{zOWJm@n)-(ulbiFgcuKwB6{FkRK@!UIhp#&vlpe~(;nJqcu)V-p1lkuJb=Q`a;N zdC~5_i6ZnTM2sVe&1h7$fg;7ErRW_KkM($e&;q9+)1CwD5o&THZ)g5q*oLzu2-*xm z=NjQoj5u7t1j#@ZqLzM*yAh<74tA%lnQ|U4ffdz~)UKvo+x8g(BRy`qVpnY6M8P)k zWjZdHt$n64!4X{t-Ee5dazUvf)~-nn&&uq~`!lKXOKRIFum#KP^MLkip=T%oQ{~>a z`z(|h@>ZZ<@&US)L4gFID{r*VQpO&Xo0!`VrYQu1|8N%lQf&obk@C;Fpmmto|DM?n zy-Z+w6Qg^H)Qc8inWaBAht71k7sNNsIX&cg+JpMYX0-B>;>$NPLVI@^X^x_P8S5+< zgfnCPK;odqqC2YXJ)}A0lz=>_0)QeYKUs)>lZWgeRSuBIhvwS`ENbVB4(Uo`4A1%} zr%UiM{MqPNIu?O*N$<@@FcWMH%ix(bGh#R@RcnC9M<@LisKkN> z#*#LH?u}lQ_ISGYzu25fZIi=5aTy$o3PtU*S|Py(8ZL0{0BbR6)B;!%%_XAIrF0D{ zLZP=|4u(O-PpPh8xqc4a(i>$>?e@N%Kfe0l7hiI=8V`D>JEx_kZ4DwA@m)KQQ_!~^ 
zlLLW3w%uJ;GVR)9>~4p2AecU*ZSMl%NLV{}Phe!!&NmW_Xm=ad|2{77&HtV*pThsw zXV<&?(>Ap55l6WDhKQF>4GeoJPje_!{J>c@pi~Xn6%d!E!*@Nh8mfG%40+=@BaEdRx~N$OFIF@$QLQO z1js_v(PCnZcv>k|?wvv=wG-3wocP-x*Ak1vc-DNuhGieyp?rm%NQXEqPi{7F_=MfH zo#(d`vPT4*J^SgZSZpdHnmSH~)p39n#5vCa`> zWN&~#aYH^@fCrNX*MxPBXwgUA!M_^P7O?Z`SPjS8%%VGohIZH3M=H+Iv3A#rn*p-W z(yF^g*Y+Ql7xxJ#QbItv>BnjN(BVZq_au^n-}bRsxy|ldgsBenaRo*jm|~8FjKCjk zY#Sb)oIHt3BQGYSKB-8aG+m*cS12Qf9@Qrq{Z(0VEk3F}Tu>#iHz3_|M^b!xhwQ%A zZHllFFKEih%Tr{9F54ch*J9KcX`{=bx;4Hz`|iWd+~_m_`ej@9!^%WIY*3Q|`AfBZ z4Fq66m=v>7zJ7ST9f07vT4m5Z7Tb&t>@vdVY{#0`y|b5Z5f=N57c-ZGXQum6+<*q_ z;Q)^yj;D(83+TcH^5H0FXEZ)>*}8WOZ-*AHIh~+4C>aV8 zv>Oke>JZyjaAY~{gfWSm#GqMvV#e5Yuz)Odk&Kfl7VUfVrfy+IZS(7mE`$*$?Av~zGr=%{XW#PT@###&V_-G#weR3t2N+RHphVBB> zJ#8;_R2cLktA#tk0pwaBQ#S*2<~6igEwX}wNFX$rkGoB`fOU=p-$4`->X+h_>DULD z<7>xG(44-iy|1kU1`aKWrC(e0;l$UI*mdwP)2V67o*JdVo?0hT<8EKu(NkD@yoq1Ja^zoiL zFT>WNO(`lu$G-v;wxEGq5nQ;bEkXn&Dc;z=W7+O@9`PQIaB(~jB~C;2kIyW1c@e|& zj+4fStn;h&-#l@;xms_8)v7e;v7)D^xaP&%*#!(V61DRPRKNP}61^d%Y1@5Dzd-UD z&s9Z6e~l~D;vKKV4gamjf~ze+sr)dpS>mwJ%cdPtfZ2DW9dOHwiZCIjB3bnA7>S&u znK+Y)PMq9)+d|Eru=exEfJV{DYU)@#H}C!UnuI)HYu)Zo3<^zi2yIUxdh`)o$tuE^TuT0 zS##fFJ7x=1Xb+k~6S^P1-Wl|zB_O)R5%?)kf$F z<{H8`A*~x}BWH{qPQS#k7-7UM-3nt%Bz!`C=HvH#6lw zxl~W9hyz@>T94cs$|2;a__V+jF}nBA`|@FGotHV^3Y;{ZyjoAdzvG}qA~)7#!;!$S zSQ`gai;%hWj0WDjg^P+a2?0H6!$X>AhR{s5@c|OX*D+N*4-*JUWJ1EkG3q(kBEl`n z1-lo$AqJ5yT)S9*AJ=!Z z8UM-wsmrZO8!qwNFjO*rdg6=)@hoOxu;_Zz5pBVS|JIVyzj>Wt@Rh2QV7u&|%o!61 zC+BbuEwi?P8Mp#ez;%u<>cy8zCcpS%m{YOffi&NDus~pP;^3;>k8Hw=4WQ$llr8KS zwZ3D5=0?5j62SNJKjh$%Kxs(Vb0E;heV9_H#YK@^8I4zJ9j^&_+JMPh zoIos4Ev>2K27qf{Ru@?@W9TgL%9hKxbf_M)m_8hG8s81&%!Dc{5RH6He=sXNx@W2A zs;YR(u>RIREsdY(FJ(_-P^uphH0(!W0`w4s~O5Q9^Aye3s|CnBTd^^4UZ&PiR?bO2#N=zE8yDm;`AQs+G-q7nQIm_!8Rca|mQ#<2)XehK&cX3w6xA zMH_11Mb6Hs>3ncdHP?JxS^7v?v!NDMm$>fhG06O{aUEn01md~NfFN-=AljcZPKE9bTRxVG+0{rM{LV0hjL6;b=&0lw0Zca71ojV_ z*eNo*dwO<1)8q=**wj^$V7=5K!lWji4SME84-s-eOKDAMUTcW=&grFF3RSm&xp^^J 
zW!_)79kv@<_zYdrR6Ngukf(ie=pcX8b1Q`y0&ZG&VIE73&vh)pH?J3%nc0;0zH_|H zAe)RmPplxl)-7FMs4y&O%sYprRemDuvOv=N5%iith&tFaW9@MPPXo4F;W8nOvTB$4Z!3Rr#iP443n`5eUtH34#l?9-;TraGJiZv4$ zYFcq3Ku62pFA8FQ6$l5GVCSqq=S8BQqn&OpDc`#w4;X7rNive+`M??6X;FZSa~9y7 z_UY|lyH?A%KnKJ^bwdh*Ox&c(g_qVncxtm{VbWW4)PAIXkULO^vDGPelcQ;2tyyKc zrks*uZ|{#luM=%w(=Z4rX*QC%NiY@;G{Fzrp~wY$LGIVSDTSl3zcVqvmVp$7AllpjnT5@AjG z-DcnEoyOvW@D%zMg*1`;LDd7kr-*!hZZ2_tly~Bis>u)La_3!%SA)nsnA%W8xQHQb zyd1;M(!Z4)0rcHl(u+V+Ek)&`zjvT2&yhx7Fs2scCWF0GoaetYD{Kb(N>?(I4O>_bQ{aB?yxzI(u+X!Fh1bTDGgP2n#1&lu(&p{mJ(8Ne{w^T z5PBkdq*+b+Zq${~G`H<#hvt=#e@}Q{ROWuBbZL^PQ`l`AlMAERPa+SjbMv*vKrIo|Xy;sSx&X#7l; z8#;EGEOOIcoU#G41pkBTsUxTaxC}M(InS2nK~!AuP}+b?89iwgvhfF4bTfu#UK{Xj1me`2EJ>hdRaRV}3qiLsyCuch|tATT(}c#FO?U__-X`L5N& z$o-Cd0+c-(j!4%q{faDr9ORr4H5jH3XAZ*nhKgCUfu$fJ`Och-gHp;J5Y>-R|5N%y z@68usrPv$R8{N;thBSd%Y}DiYaO3QmC*VV1ye@>m6e(4=;x(rO|0(ev6*pg02;pvj zRD-soj9OEb!_KcmvxUB7`>+0sK8zT0*gJH=-w2BJ!mvy^FSF@-7A8sD7Zfzpjh<78 zbkZA4_Hy5v8jz?@HaD7xEL6RtziT~9jygm(45FA??Pacx1uWBD zW4NXvJ2V_hCnNPN9DTC@?GC0#1;UPNQJ>Rk{yo+L&BFrP?tg5&kc7G}lTHBm_(79g zTD_#{z7tUMhYs!piS@{>y@3@fVuNIo#dP0cf>95t8`%cCTB+On*`My69DBF>8$Mp? 
zgU2RQH%aRNVx&MyH$3)yYQWi_4axav_3~;&o(Y)aOU422N=7ga{$h40l$1Wv$_^9Z z1H&Pkr z`V2Ufat$$oK4G}_rEOpluYo1U){A1c%T8Bn!nIPbt6z_Wxl-4P%$Go4e0=}F6u{Ve z+H+QIV@_*|KXxhD#$f#K561tmAgvwH`TylHnSB2g#X$!d2089$PsE;)%sMi^Nnzob zl^!gl?r(v@k7_i~;-XU{TVEs0`&BwGOFT-J*g5}cQU^NV^>Jp_xbc&&=Ywun^AXv7 zQPVRnsiE2$tpT+lBDFS;G5{#oKpKy}$G;(6vjuma=rP?+R1-F9K;j!tIx5*mXR$L^ zln^^n_2v>i=jZhh@0H+Q9zbt7kxDF9bxQaK532^Z&K{W8u%KKEemw6+7TTZ{;7xGh zU2-)IC=5F#*dis-r07eACvtJdfMw9Zk+PNbj` zq#h6x)4_sYl^n3T^GhtRJBhz9*S{o+b$YWZnT*f(=aid9V76f!aiw)|5eNq9vK9r^ z?;9cdSZdn(cV?Pq-EL4#OjXT`Pb1>(EN|qU*kvPGkM<>Q;;aO)W0lD%jHqlg2oyLC z^p4%$G2|Sy;aG$22YQc4euvqqB>Px7FIT84Z$tERy<4(cyj|fqlKV(7Wj3Sd3^`P* zb3zqRQXIgV4^{cH;pJk=yrqBBnJY!2b4ttt<^t`TLefQ{|Ib5>o2fsqP|ET3P@J1m_1UsmE_!m18|brg>0m8yk7Sjm-GVI9J*LD;F8xTvQ0r zy%27)8K7|}ON|z>-A;ajnI-WwDzMU*#kg({-n*m;)~LSZ;>%ZaoB3t1>ZE(@aIO9d zdH=YEypz)HFB)u7tmDzPB}Elz$xAC}a2vNDIeK)xQC%K6GceP)pz##>-)ej9RF$xN zbc6fq>ZScWYEHi05qaycC98mmG9wdqjdKkVzrG5JJ426JUfZ+CBd!)ekPKydg82~B z<16hf1!g69pf-vs4=#^5Ya)gOJ#sJ;2;dfp9tA{<8F9NBz&kr!InV zRsk%hVxL_Aw}uu0qWpN_q7uLfz5ujLul#%cBY$UtOK&eT0g0&i&}+U=R_neOvYLYo z*$>=W!nY1{%!x)(6a6U0<_qrY0Dy|t@ehB?Y5TGZCg5Rs(N+XR%NBzIv*Rj#V@5X; z_i+JgRGGFF2rTE64p3dSR@>z94qCz4Vs&41sxv zG{|}+udT-iu+XA*pe&-6N-E7AqfR2hRHk{#c?B63zCKMuDo}Q{-?oiQxi%dWWS1d; zILg=t>2-__Y+~C-jAU%O9V&C5BVNFcy-5}5Cglt z?)C38kq`$APIQTb<$kh)Az7S-&v=``Ym+qg=9E z(*R;@evC-L+l*x(&`EBHHRGpNT7C(FK|4b;ihYA!l5WxeFIjCMc1zQ zb97XNKnMt=Y>Z%3h)Um4-EPr$E4xQjOpI%%%VIM$dHK!%{_=t&LYg3;N0ihv21MB6G*e+cKoxj2g1Q&VuA4{depQ`q$~4qvN~5+ zg15QKS&w(CTxKB(&9RkJ=lKpadoxZ?u9#NOd*6Xbu;yeIQmAEsTdQ{_l%UJAlYAgp z4K-B~UYyw{G<~;ma#Fh1=0RlSLfxa>pDj4S;cBmfOqtrg6FDqyt)Z zfnQd(8JkDt2!x;(uNUoPK>!1a#G;DMa0Q{_i5_x}uN`qwKD~W$y{KaLgP}OkI-6ex zBsT-WD!cds^(JQmH4UjZtsQ?`kq5OKhIB`t6jur10>EP=++~9!{IFq4tMBx9q`JBv zjnEulbGvtA*>K(7yJZAM-6+N8t|RHD#pD*&RhUCsn_T;mzZ+`C#X;S=wYp7rB4IW$nF||}Kkr0@|GM-5G1UTrWis}Q zbs|PuqvQZM!|^qAH?K>fRf?Oqa!iJ!ok~8O3`4hg%FPVwJTn~uMq6SAb_5A-&W%a@(;ly$GsCtGLOSz%iQZB)1ZPaq0b5~vBhkG*F+U< 
z9h5Zl)7Q8|<|6Li%NgP2d!;E_2>;H{Ee;~UKSk|!_VQ(|ny}H`)Adu50%D*AjZ{jb z79B6Msp=NJtQ0%Q1A}S^$MMO@U zU;Y$@@Cp41hW+oD_3J1`L{EFz>ZOML^rHo>?4|&DmgF72u@7U92yE|3dWr#RDPn;i zLb0XVV>SKjAvV>((2AN?-87=nKW#Dfr_#@KY}&h-R}S3JYog!S4r>^uKLglQo-lE)%EsFvV(4;%yEKn$xQRKr4~c!X>dOAW@ue zZW<%FQE?t*7h2_-9yLf(EgZ^@3y^g6seyxfcd(GrfV-7x(uNeT7>1o_uTyBRoPNz( z3JjyMddYQf=+NWkCTQ%}7rl#} zsQ^U`MM~U!rT?NuPmsq+5B+ySb@m)}PuI@ycsSS5#(L z;_lwPXQMfnZgX%2S+d}5*|aNQ@{}GiF^t|^mv@2`R809d(FbE3{^>iBXbmT0Nr`pU zyI9wB2!5io=v1{yqYYa_P*py>$AOEScSY{UYN<9fQ3Ou8LO;{P_aMRHY}% zpVIg+_+6|9Auo2R-pH*#Ri0T+H=T`2u;L?U0cqlTx$}-z$yMO>;CSD&9UJgJDqe>a zuoOU=)mXu(rQeBvd-JfLmMhJTpoKP$aHIB}EBXk!(6WXC8B?t#sgL4ZgZ8da)ddgh z2l`(IRV0nnyA_Z8Wi}?A^AyJUZ+`*|R3NY{+Be9Q|h;Xqy@I#>J%f z-oV)tw|{=crER?l8X#B3XMb$Q{KKSjGp@Y2lzPaQFRpuXa}dW^^DxI;fgtc6gY5G; zf2q_tW*-@8nROYjlTSY6uu4st1~l^G_uRQC$J`new=prQ8EJy6^uSv6E+l2aXm#- zCrRE3P3M8^TOB5IPzGJpJeD)^K0(nhh+To^!gX6}4;0(H(-xD9nQ!|V$#wpe{&weB zXTXVp^{LjHj$Is7|C+7$(#Q;+K~Cw~Dm%rmg2pteMJL0>grlsra25=zyfBi`rQw^9 z>{RUDV8&IDkSs_r(_oNCa$MQ9bo8AiurtiwrZk**&HyO-Or^_=$tB6GXS}8c77@;s z@s;kSxyQ&U$Sei@pf@3+h{`qq_l^3_%goNvmQH@QDR>LHa1wT1A+<{h;ir(Oe)jZTPHxiji8^6Jm=z+-SrYPd^u%A!C&{dc=pG(}+^3fI5?&#V!z^NB(8km}c)RM%lH>T? 
z6=qwUW*Fy|avW;iod?p!rXOl|}b>dZ&+;D~ZMJ!&GVxmN#?fZ)@KJYb%B>5gU8IUn7B zG?8>!AsXZ!Lvz;Hf5z2}d?Ts; z0CR&3*aVIjKg3TXTQta>kT7=y`P6_L5cz-Dd-J%O^S^)iFw9`aAhKoO;@C!1DwRPf zqS9&KrbJpaO4^q(V~dJNQi@XAOF41UVyhIDrG-k-cIZedX|MbFI^T=AzCO+UcwB#c z@B4Q>?%N+zL!I+EpXL32y`HbT?3PY)D+QoLR#s4cemG9OzMFzr93e)~MO;WbAFn%i_m5|2-T|ojT7f)O*R*6x zlnvre=d9hz9{n@*#1duLY~MdT0>UkyX2&%uMJSpTMUMykwU8b*6qx7ILu{>%S_X_XOjrW)Pzl;U;YEgkn%%`NoNkB{ zxbM#}0@X$XPuc1 zO+GP-5VxD_Y)!InN*B1%u^{9_$*!G+$}7LQIcPJ$xF!*$;B_ou9Kj4qs+Rf~` zAdVeJ8x%GCFP$1%u$)jVJYP|W?N5lVRR=cXZ?&m+HKTUm@jGXsq8_LBc^P>PkNx=$ z8OjsacVv(wW$aA*E|~(Ph}&b0k)TZTzn3vMx@6u2FC`vE3=?qle&}Vd4Nvr6AHU#J zELz=|*HJJYe#YJ-9AS~o#SJb2bRG~bqFr3wmUWb%+FU(s;AD6q(;T>*B>>dm2b_eA zWD92UNu4cpyrzxn%r;c#j!g!}XsRoEXu~J>!5vKFPwV|7Ow{4kXPJY=!6cIX8}*8i z4cr*EgXW^JELr7!^2hdHW}vvsg&Wu_yXuvd7f2NY(~H&AN89B&gMYNT%%7F8AY*AD zFut>64!PotuKxZ6V}goUxr^a!quCE@@u2x&{1=(>M6umw*v9 zywde|$Ac#~Sbh0G%1m-+yf(bTE&-=(nib5AJRwl{_R5>s1Knt=CvFXCMQxgiGmHu- z&BRI(ysr2T3S3+1p6b(!NB6vIKiS^`QduhL!4VP(t)S=hC9B7k?u=DZYRd5BIpb(v zL)Oa`dXRh9#>OSG-=^<`XQx>LI$0<$Q8EmtqiBu)a78$ZN9?~S(wvT|k|MvV?- zJ-DSq0<*q7a#rw3!>K0@XmE>Hmr)arB6Nw-WdK{b9C&C%_Z>98K1s^DKS$s7mZTZu1}PS2rz9&8n;{j zp-N^))je40q^vSC=ESr$bzdYVM+DZHY496m9BwQm!Szq5VxZN!qJn%f&@38p<2%uA ztS$rmboOQz+RP%oFew|ovL4*Y5@jG>gk#4#v+`w`IZ%6N0Rs?EmI2RQ29TWpS}+zP zu3iP%4+sd2RcI%~)J#%>Wd+n&S3u55I#!r{LBg^nYxj@kY+GiLv|F6@qh0+)tHcw< zii7KHY_OwV$MVa0q_+}NYn#Q|L8$N=5**2pxoiq}vfR++7P06*&+L4e8wPKBBiDcL zz;Gt4n`q{@TiTiwLYqq!CGoRHP6wfU8TgT6^aK&Ph-wIey@^>&{`QMk94epu z48%pDyGBI?agxdMMl-N&L)3IgfNxgkck4plrIYF65PA4&N~st$zTM{@XQo&&v`^~~ zY4}@Srt+PbeBgqsVlJU8y~zSy>vb}S1DZ~1W1_8=3}EkgF!$ID5!``p@8Rh?GHkGM z50)58p^Qq9M9qc;GF@|&rh{_PnWiy!keTUCaBIkY@^zH&bSt`@(K=;?!G&_>c@CDP z88$9GT`96*BvXl%;2f{mx?KR`U0z|l%Bjo{kj3BmzrMv_Su8bYuxyWGrxxFj zdPMW}PTdQf^yIjM5-9A!^b;e8z!|9Xgcd}!DbxkoR#OfQSzJ6Tqm0QdGY)Lhyh&?m z!jvie&V6$#yBbECmW8lL|NR?j?%28-soiP3!A{ri#}ntSMnhqZXNxWn-Pskd_QN9 zHO;~JwcT%67#gzXpcs>O@J$c^!L7Qy{)mL~oa*!|PWkQ0B4O zZcCqEXR=XPMx)E@H<{{(s_(lPe@H!Aj0@p*sMGzg7M%i|(abZ&I~5eW`|8_w-E(}D 
zrT_U=O6TS*X?9pPzekKq|K{xbu^VSIEQ~Hj^typwd7SE*Cxx$*S1UJ}&UsCvlWl!= zA)Rvft9P}SG}4#ixBm!p2Y-iPPB@_o2)*dje z(39Zr{?ILfcpRGQuP(zV>a2>NJlJUD&GfvChA-qwQsBM}&+6XHeiWCYW1em+dhSqM zx?ucreG6UL{Pz6xKTMRpbd)v}R4wW}YhPTKZ~tJ>X+;t*Y3H^NvcH%%q>L#XHqi1{ z{OqyCZjv1WjOEz}KP2SaZz^>6{arDku)n^l-KlNH86W1OP6mBJ@3k0Ez2_T zw0n*dxa$1t<@@};+_q;sHw*9I+YJ(A-bKQZ=07R%l_skDpE!Qf+4$&%`-j7Y zKYaA>AO6=X-=|mp*AEL{Ip7VwvhasXZ@DA|%!RNS>9=&%-y(b3YP3#7v!s-^n!ilU1!qdAKO#a-t$t09mapx* zC_dqr87TesSrM%kI*ZM!B|zEjHok0fv$XBzM3#GpXXe3bD>%$R9Dy1sN0r?p4( zm!F91*th6)Y{thpQp^C+6eEn8L6p}ObpPaEFxi7U*Zz@K5@wTBpmC+Ytw8TY z?Et9*JXTpSEYgfdapsMVRY}`T8dS1rjJXpDErjN+V$u>*&ka$;8naPay2V#X$l1|& ze^3`yHL1?7!?^2)mD4MS@&K`V%}Z!vCX)DA#~5RQYL`IiL(YOS6hJC|hMbqVN%@`3 zL&;r_L!oq*eVu_LH5bm+1lih$U{6#OYLQ0Zy__Tw4|%%ZfGFmBBMRh(J2)rOBRHj| z|AQmJ$~U+K3y_mn^gHxq0oTu04R%WaNsRBnZn-ysm@Yz(UgxpiCY`)j^a&1`7wAo5|{L zc{kBJto%D{WkB)JhjkSN=?1U6Qj7n(bLP$Zxr;{~_y5CxN%X?2wUf$6p9$Xh^DFB+ z51-wBwoHHQ`Hj1cj>WxdTcoRdagR)u`HA}y+ahkvGgDLhHDtZ|o}Zs*30}P_l)v-x z*t_?RErkPaWiI8_{Qk_|e)-;4Z^LT7oQP-<$;erk^#vY!xNFPU8v<=;;s+FU`_xj?`HlMSeAJ(<>W{=+NmF3q8G=m{>w#hQSJ+54OB{kU>jK{aGyJ7EBsUh3ZdYdnZ)3SfWeo$Ftc znYKYA;mFI1ZTg)M=GDvuw6edw9W9oXeLu;8O$!a*WgsFjE7*eDOUGMYn^-q9wJR}(Jp5V(1S`&&HyOf3Y? 
z8y?%QVrAm&BL6LEs#Lna$gw%F|t3|LIfr%6SAJ(s!#yQg6rpr6$JH0}GJXS`_a zL2u9bG>!cfSe&EX4lZb}uO;Qh}(IVK=w?R5_nzLhqNo5RbJv#V)Xtdjn(mo|xX zElI;F6H(f(d|_#D;TV(Vdde}3PDrqEu)(fDc}HoLMgSI1qIbXXx(AjG+|o`<3yb4e zAo2fLC}uam!!azrl}6lu5I9~X_A_?H_RtmPuwfOA8R+4XQZ#M|7lv#xr|1mKPm$I_M))9N}$nr3ho8--7x`mAL!4Pnq zWT4o1!7O}laj4u4#@)mkT!6PjlN-3si(_nD)0oaGD_Yqt{R=|j4+}_Y?1perAMBad z<^6OcQIY#&qTke2^C17_8Fmi9;d~a!J8lj5*t*RRg`}c6y^~h;DTW8t@2VNLRgr&V zb2?+kVg*mcdIE~RebP&7sRHFk&ILA7-gUSNJ{VDm8Vm}I2d9UDkz)ord*#8J{IlYD z7}Ki%E{|shp2Z@SaWOGViXbXIEGR52tyM>;Gj~8h-!(=vFwUe>h zT>fR)vS^8pjt-L3@egYr5&CVSf6Q%xFm`!$3wijELXA$j)O^dmAqO^7c;ul6m{Y%- z3ipz`9e@II#1&dk&?U9|H9jp~|KLcPatyt~LEPp2v z;+Mv&frPyFB>2WZ7r%l%ZWm1;31+}L5LIgB-wyer8IeUuR!Va-0>BTU)Tj!fd7SV* z0fL%`i&oO;k?)A%ixtfRd6bY^vCR$b4``V_g6RPf2K!Wg&XWe3TTMc>Ks($mvv z(pz_WL9FMeb8LUL4A7fkEL6*k~3>?btbgga=?ZR|mI&2_LnRcP~olY~yHBwS(0(Ww32NT{r#Gy>L*@aQv zum-|dlQnrqj~*o|bSY;m4BnDnMzW zC{UsO0+}7;*7+&U5|kxWHWlxc2wDpdhuZkmB5$nr4Flhhhrq49H6r=)PjHi;MT8Fx zKXOtyPGeO^GDimVulN8}xlkKQ24mQqmI^n*7ePD7Qd-7q6)yD%}0>50Z#zi+fj zfSOeBtB*Zkh+fUIjp0B1fqCfkDLIRtjrrAPdN#XshW=^>jLk~6k6qHiJU=YWz;umq zcNEQ2U&daXe>VSW3S)-BQuDA>kssn$OO=NFxTTS|X9jK0-N^in*o!PJZ&X+pD;fJY z>)gU&@#lz(m5{*v#RB}I3a!02(wNs`vDN>+D<5ZsWR!h(x!V^_B)NNRJM%N2^TuZ; z*vjCo<*t|3T0@-cX!dZOCEE^bGk>`ht$=yZ_B|2VBd;_5)ys0eS3t`7yK$;!L)^rr zm!nD;D<`0A!~fSG8TPPcjL*W-SonYD^Zg$>*w}53a(V@OQpJq0OV`A@t);bNXP8mb zD#oetnuN7O_b293V6s;ntA#|+c1^LTjN+XYy<@nJd3c4XTItk2=EF7+(>Q3ve0t}I z{twneS|E<>mY6nw1SwkJA4LlEdOBVc%MF zlX*!OZ^O#y3@_~G6^t1r>vBA>1UI8Pb*oNSFnbPP5 zCI3<~%qHsKOdlVg$XSeTg%v#q=NHS6dOk(x&>#tvsRuMmt>RTwR8Tr;U37%6H-vf* zAXe0BTE#e}UJ__-!?6L%ZF~@C{?LlR0D}7*4G=@z3VGgI1q_4o08y*RiZiZOKKcc526c{Qy@_>lrpDr<0ezVuv=M`kjxSV#aYV^~uqHQ0sf`j)F;0WFvy4iX zW!E&&y(RV|wfp*jHI?0ZGE%ihS^nB#u!^7RlGMD@4ye16@kT zC&sOJdCX{&rz?PyO(nwQ4B%ZO=j-HmXMhyA97g-bPCblwQ#&#u*+P2k*s;QpC-_F` z{c<>9YmvABsTb| z!CMl@sZPr76UNLJk3G(stYi;EBkNKJ_+o_o(~mrd?3fz15AKW3E^&H=yQ-ggKdLF= z&0}Hqw@E74rhWPl^&2g#lS;t@OwA$Kb!0ZP3))0|Yn#O?d1bB5!${pW)yB}D7@<-vuGMu?q! 
z%WMF=*z>%U)C)u9Zx?o8iN#Q*^B)fP>ah3+nBcwpr`J#go_Dew-rIMQ80U+{!QvQB zW2^1KQp-_L2A2=9^R0KlS+O8mw@$W)`v-Ta;N?!Q;PcY(Pj9*4O&svOEsM^+v`%Q? z0VN3s#7?W~wU)a5(#3AY=--GC{}0d2joY*kaYWWcSgrQJh_IjzB8v-?8Xh+!MRCo{_XiagxCRY`utD7 zvJA7>6~J|bYurhFZd5~eeLr&0>fSL*n)RY0SnlLOiY5Vi9wqhR2pv}prpMH~iDZMQ zrAZg*IisfbVryzQ4U67RG)bUIYdt4GXRE9SHNnFxRDx@j7aeBa$_KhXvJ^ zg>)ONATD974l-P6GKP`Vbb_qjSDe2Rw>&4QO^`1T;&PB+>fUH!ycp}8A~rhmL)$na zR%Fr`U2aKR?+pj{M{BiOsx#~0x#f#0UNQQ)sX|Y;YRj!e#d$w)^JK`p5})*zKn^ z7jDV&Z;=D3T!_GeF}U>Iw+c+)P~GG2gd0X^cYoxFWXOU~SqB-`=f4=MvX>7o!u`8( zZlBmsG29>~qR>MEBmmd+Zw>%h@}3dNDl>dzHM?JP!V?o$z)NTXQS3-B^c2G;{~~5( za!z_!yogc+TNkWaVQ3S^c`HfPXV4c57Hs4!<}DmO9yK6&#Gkv3B}P(G61Aroc1U^) zgOXSR2r=YyqS+$YG_K^AdGX>!6?x{EL+vi^jgP2G*7`tM_3a<_2GtnGT@C0mebi96 z+JC6Z{!jHx@$-6N5;Gg%>ypn>ePQ*w;>9z*Jpru6#=~`#J|h54nw(9X(AJ#Hi=x~3Zvx%Brc+w4}9!bZ+Xl*BH&75$#?JH!#ZFJPD|zDC+9rF9t4vc zC8ZOKl9yf`iO&FHqh_@kExtk#>^I-Aflt@|W#WU(Mc4f&Hav~14-ziV?{p3bu1~Pi za;dVu-Q)Z8omqBk{eM6B*mEqcGZgdN3f+VJ2cZ8x_q6kS&wFM=Vb>avw^ zzeT*|m_EOSvBAkYl5o?o?gxdBe~_AK21b`ppF9~a7eSLN%r2%7-HuFu91|N-^D93Y zv~|J^r&?%q5AU^=`S1VjBhvreNtPF}?I z>#KYrgb?OBL$O55Q-C#Yc(HJ8xPw32T>P8WJ#89zAgBI0rP(RDySq2FGqzq<1MJ!g@22mQou<0tlL1)J%AS}s4z1rhIy z$1VlCR|%^%lAdl6$!j7-Atp;W@A~Ijo>ot?&n!@Zo>)5v<{VQ7 zTW1eWKwS9CWrqU%HgiSqvnO5916SZNY&Pi8I~2_09_KvbrG4&Ey|0bElZ>(RMouvp zkNd&q+CJv9W;e%+bCd!@X@DyXyCn@Gi3ANM#d61&MYY_Vl*2yQ@4NU$aoKzBzOGn| zUZ-UB&`6%VC~g-qR6wkAKMaePkOKS6V3ZZiv0ML~-8W|zj3acgRqQzsS2l0gk#L>m z#+A?0HgWZWGjaY%*-u11%FS+8n9Vp5Y6>Hg*>_Ofi_*GbfNJpoq~m!|xlH?U^0J2s zHmW~aS(SO)Jb$a@ah}XCiqQH|92^G4S2jIod~DV}kvp+-fH$llU+p(UMAx3c5_3@>NJehl8eIOUUsxuitg4|2W%lW>L=5B_>-hRujUAw44 z=Dp?E!Fq*aO*NmjFxywnpFzZV{^G?{sH=+@5?U53Qf-&NA>%W%qV_>-kI2C@(RPx1(wm_i0hD>ifYS;v}p zv9rrC=l~u1J5z|cPQq)RfU_XEa)=5IMp*62=U0R1MDksVoT{*XE^&ru2DdsiLF{}L&dH6Vf6dU5S#RG;< zHX~t5SIIR6Q6wn59Dhlrexqi74f%aS54DcA+FS5ZOq1~_J&%@REmSRadlh?ig^dbo zc*lroZ7pZZ7o3;GE=>+I6*iuXOL@6%lnE!9%zk*6ODKi;kC^jsVm7sPQfo6#q08hH 
zysiL5os=ymV|A#x9LWq@Y|q0j$!)(TLO`DjTJm(*LTaiH(eBcyV-Ketf=aL!H;*V4?)%+h{guQBxBt?(bf3ZJk01z2a%gbPf>>u*}lPl$hjgg2qa;Y1YkRI4bn z;)YXj$p`p^`_yW8-_dOAdFtn}hSK0P{H9Sm9RjVO*pVc?B68`?0SA89vuTW@^7=H| zgmZbYCuz_dWKG-2=Tw0F%~_GvRl9f!5iJR6=~S5g-trx~ujO|&8JA-H61>0#3NoPC zSK9QmDn;&Zel$Lghh_1U{AKXXs~Z2d;L(=b!XgXe^87~T$9V=4!EZ2d5J5rlD0Am< zjpZAKQOi_Zsqd#!45)L>Vz+=mw?Lpz8Gj8`-A%j``%6Mjc-!1~NKB*()X{)Z@#ED69y-eQhJBOWX5N`d;>+_L&{*fdw3+T^QHBn z)DDuH@DRdOvxo10TDwgi9-qekL5yGZdNv-#Cq!e1f(7_2es-l8s7l>uVx;Rk=m+)3 zD3z9#ZRY7=skzG~xuq-_EJpQK5`OATMxVyAI5I_ovzeAqFasjx=JyP-k+k7w1>fo+m z0R@KH3~@a^fwIo3oOQ*~59IOuM%8(S%kLX5A3IxNT?gZR z*{{6y{d{y%Q%cO#Vw&VOU-RpaD%r*2Y-h#Yb9=jSHtEwnBJ7oQ*t0LVfYmMZ)P^O} zN);P8akpZaC?l=Ety}kOIR#gcGq``@w_g8h*ZsRnD>#v%!&zzDeE3?rkgJK1hrtK; zE6F7gDvOb7@Z2-_|4bt;s<4&lzx25DBAA(x>!vZzm6|24`dfI(RWDl{FerDz1Y5=m z0{&^SRFSBu;@P9ADtegN7-8VNJi?F~9^B&okH0-K7VDfZOKT*FD(zaMRsYmbwqVQn z+v6tjf!oj871@1lnE39nNh7;LBDlgLqk3~H_gok`Oj{p(|6&#QCFAhC+R(=QL6hCE z?#FxW=EiLBhb(p8+Ov{n;FK)pMWEHmK7XauY;F|HYq=bZZg(G9J3E79i{M|t)wn45YO+O$6 zYcSPD4A-MqNexE9A8y9D%@BZRZTizRiK`EHGB4C(ZCrC}B#^Zx*t#!OfI$TRo(6`% zdYi9Fjop=Ccw35+;^2+@A~93?)wP6=9FOI#Nm$is)3z-SWqhzZMOTD-HJlA@b-^7j zeXc~Qgj7J_rkJ+hV%>g%|!IC?6QDfZtUP9Mu zK}J?5s7EY(myJ@Qzty2nix*_u+NolgKo;^MLrit0#^0$pL0#|x=z(%!QJ6M&JvVZ^ zmTP{?nHp&s0`9s1o6aPIJ@rmzsmeadD%%l-w!4*^D5$4Qa_{wz~}dMZ3kz&W4Xz@#>_I|S6RZ0h< zK^(bcnPE{QpBf!7F6edz9Vg&+A|^J(#jdgv$sl_`>emT60=BfgRho&6v(K=!0%DVr z{79AO$oNpKt1*j8UornW+U0*$&+hPU|CWnLo*NHp6eS$z!K!vL?e1iI52)Qf4?vD7 z07Kq<5D@G0(3Y%1;GX|`CjIH$QQUlc;E;9zbo?Bm^#d`p?6|0rlQ?GN(DZEXS!zMZ zuKobL1!4ZRG>vV?Y3kCHIdY3}YP~*IoCHx}EEG>m4T~SsLDM}fcv+m*rPr5!dC#w3FNdHxxZK#-cndo5w5i#aR+$t%giQdz z+wpYSLFR(k=doXbL6Ud($NhiltldSpdLm4t55j!mjW$4r4CZwOld`X%&+JZQ=c9=;hVKr<( zSwf{!YAZ z2A1xG(BeI*yzymxg-#w$)mC@rbk!?iAu48cvw=SvHgMf)u+NcS#NWH~=)%;jb&uk?y+0WV)ER;tXHF82+#oSX|0Ol;&Q|*r|Ub zAo2GEptnO*wXC#b=;|;!A!G@}m?|eq>{R=MSUW_ZX%4wWe!e=h>FSPJbPzKw` zawB9CYYx#tzL@OKH5N)TqlPHS|Mh0GyYBoDo|Y!nsc<&m+Np3)YAZyM575$X 
zTQ~RX9sT8-^_A#9wnH&P#)7=poOy#6iIa!AIeVy%$U5l1$~!5!!2I2ItKx~5C#J+C z@v{ni=qF(-5rGIN^o(tg7mHD;Hu@U#8MFw-5631XP?hL}?&{IhGH_r-K?7Uz((TKk z{{7Vkw}BejppiKZ4qCx2U~6>}wiha6@?ddlI0T`SC56uHZemtXlaLqIA2mTJH!hMD z#FPH##5E?r@x=ukmH-@*s)Nr}dF5sEbcjj{Pm+aD{!fCL6ub4t-$(W3yTJtj-J#m* zd;p)9>9Ewr+*ogOvIvBKt@mdS{G#yb%D5cFD%VcQ?oxVuN$%#=)yy+enm|Qq9f9&f zi6kiFB!L=BYHYyY*k;$krjIxOI(q#F2vRbo%^Kka4`{9jr66yEHw2B1Nn73;tD0o9{CGf`M&8Y@|Muk@VGC<=WVn|8#Pe(ix7d=_ z(|Q%afj-}%-8z*H($T0jM#Hqf%#~&Dr5G$DsFveE)k^R_w|Rrev@2?7t{pJ ztAkk;vnqw35+b>w2>K?s3E2Q&a3~nlt63|xe>U@IylljWIGg_u(Dz?^zPvr|mn8z% z)=J~^=I^$uUo;t}KIzx%mk-Nde;^|2+C9-~d&z_I4>nEoS-9Ekc&^Aq-&al|lHSuJ zR!xsE{q=hI)?E`sH@$Oz*V3`Q`pr*|imZc|->>d;kS{MPvTf{k-@C{0P@t2-SCG<6 zc1W$uC3a$Ev&C_OE746HCn7c$Ofz`r(JTkfp1v@yw7^b%1BM7vP~pEtyE#epT@-Vb zEEV@E9c89Xa%JFe(l7>`0x;+loHJEw6i*mCz%q>ZPwImJ_?TxrVveJqItjt_wGDbF;F^((e7`zC0Dg!M z*ffN2<)Jdm>bo?PyNA0_O-!X897$ids$O$omKN*q;c+`vz}4jAOpeQW6zv8oL8obi zk5ufN`CpFg@2^sulse>D+&XB=eJDxt04ZPL9-(nJwtnkj?Vv65GPQ!{C%kulCQ!Bz z0PGb`*r9cP*TB3D%jSD=|L6uezzlA$2cW3UfxST4Nuz}+N(t=w`jg_Uf3m_vki5;P z-I)vyN?br5bAUE!(XjZj4~dLHq1tGP<0odRXi7VAroA-P$>)Po<%=eI1-ad343P*U z{PhQ9{ae#r69t=+p52Wtm?64E?4HUBPfQFCP-ZE7ZreUtFzy4d4U4?E z!<>+hDkQ{!ah5oi(=f)^{G?uF%dYA(%~>zQb>8(x+NR=KQ2eGDM(x~>%o>e#K2eJOeWzN9|j+zi`Rt4Rq=WO2Xuuh{n(yf>Rt z6bH1q%gIv>8e)}^No)!~Af2@nibm)ss*hv$6xT{Jy0rSQ0AG`ZbcVYMZ9OK^D0}Q| z`M=-fzrIt89+5oe1-vpvVEmr+i$_3~2L%F?&YVWFDnBF@G+}RR+B#hC1RT$Wv4zBT z!7ku#Gis^+h^5_KxZ8!Utq=-ZarZFZf;C28-F@8p4vMO#yK}FPvH8*HGs?|J3M?+* zjHrqDQK)L!kJ}ge3!Ytx!6vAf2vN!6B}-ybU#1^?CxTQGPip-eHYY#x=aT1_gVYqp z!9DqXl!@qNBDv5tNe}TgG|x@)_an7{8-&G4f!&C=B4cfDG+?foqqG|NAmjUjrkY?h zgqEVo4ECGoH8lI2m>mVPAd+a|AewOp(l=zTGTg9=@{Ke;{s0{TS!bVjC-~ku8?yz zMd)A{oCI!-~!WEpbpo(ZU2C-JaLq*YkPy+EjuDV|3^?7HQhZz&ii2TUrVB?H{1ukHzj zQz!dD8RiZa2#T}@&o72|Hi#k(!8nR-xo(E57Q>ed>P7wOLkUlfMsRn6ap<)T2gslR zqg=AyF{7vxh~CROUof5z9|(kp?ZfLFc@st~jg2>e1I1e@Z{N4^?1OMh{^)8C8=OJG z?M0Ofw}=G0z=yvMWRmUjv+-1!{3Ir|EDfPDvX|seL<@7W0fy(`F*wuSvE7$!Ob8&M 
zXO(=Fkf1U7TBwVqb!wrMdbt9ywvrj7{z2n^wW^XvtD+xO!gYv#1G@e)Y6GSMdJ2K5 z^?@h9{vi3{I0J~hb957~8|6y#T}0F^qld-sPP2xs?@Tf81#>z`Q8d8!91s`Pi}kJd zP=2sP2IQ_J!Yd=->Z4vsEds3_yMBvqBKT_^dLc@dAsmp{?1>+(q-Oi`t+=rc`celQ$$cZ`t!_-fk{_(7iA+>5RA? zw@gZ>&jjo>xaWc`rCSk>F%ChNbV{m=!$#buwYNl}dkwh|f~~tN$=${YzV1|}kimar zM~2YkmuR5me$tV_xYJ{Najq0?~Md_EEM@1o|BxCxq->6s)IMrIRiG(;xU7v&^ zY!}*AYvQEn~V$ zQH9x6C&CPwoXjZL%oIL2ry%lC!r*e(uxrFarAL=swlSEMlPPY;=qr!vLAuwh*7YI} zV!UV9{n_k{SEvD}kwuH;2XG1~0NoV$D{<4hu;ET5m%O5`avB1I-}x(X)d+^2UQE)# zl5X}GUmNVTt3bG$3K5?*(s#(p9mcdipv;5knK1$GAlv1S8W8;awmivr281k^j z2d_bN+9v)})t&uEX^GrVCky4SIzK=7pl#lGdW1hA9NhMW&+aB}LY28oJ!wM+QRV;; z1h-3(gG?V-fxcK)5^tvEFABcui@3o&+zPYE*V_PQLNQx+Nh#M`)8d01bWPU}WI;_L zMQ)O^a~d#~uF@nA&Ids%Ov$eK?moKU)}iOB(m51B)8R&O83xe85c6+~>cZ9}(AJg- zT-qcZC@o5#>O1-?qbg-+pXX{TB7m zqKz|OB8Ha;h=RYaN>QLr$)t zz_%!d&zJ{$^+7GN9y5-5mp|eTs&BR){~1Yn!J-dqvgn_s4cf)Sv^Utg`*|L|O5bABEh+OyU5>F`MQJw;_aa z6{ct5&bS)-XOl|pF6H-2`eg(8a6wNj;~g6}JJi003Nq2NvA#rFr!sA~VVH!uN+0&L z_%a?v>*gW&E+r8GUpcFg`fYjXHXvgH*XhAdt!{)boEevy?TIi9s!@HX!?;}bw-_8Y zJY2Q=V0Ci}T}SClYmjf0*-tv~JjPGF&9==2qq5jA;rF%dMt0l;w`oD~ukXFGa|FP&3 zR8VyE`xin?dv5g|M%A{yVYG>8G%z7)&_2(k8%V7Qn0D6%-Gz|v4X$a57GD>n7xJ|{ zP@=vihfLI@iUOv+D+pPAVp-`L9Zh3YO;gb|7Nuulv)KaLe0xecJTpTamOJmK3{Cc> z7To(*Wn3IEzd@4PGP1f7^uUCFJOo|Q&t&otzD5k!X|D>lY+vr&#~W_qx2|ogPbcFM zUC)X$A&LUZj0}h2yI#)I5vJ$Rdh_ScvWtmbEKJmF+-j;#ngJ?7g*HQ|l*(SpRT>a> zMrkc$p3-^<+wp>Q51i(Dny#>Z{zJU#J{-#3(XP+?A}NXC1{seW=nw6v3@Swrg}G8wiS?g-a-X}as( zRASBx$J4}D7VYka4o8?6zdcAIQbRqA)G+9yHCn?dVC83puGd7i5@us`xAT*38b^I{ z$j@7F;}crRR1@PK?Adz_v195sbr%N0ZEh3l!_>s$pw-t@-XzvL6fAO*)EU%S+|_1{b-_2xM5tlF(q+f>n%LrOKn{lCBqs-u*kDYwUO0HAGnUh zGN7)U7Yi*or4gx!*isI8ug1*Zh0d6I@0hn5B9o~8B|##kgN_+oF#A!c-160TI7WSJ zo4ABDZyHRD>`!TdWSVk}X+}h%Z8AIkys8tFqoh?)n-bd>s;3l?7_p4|xw$eAw*jo; z|B4~|bGveX{xateC=zzParWnTL(5Q*E;Nd3+lQj^Bvza}5QPjr^w5S2dpUh0W>&q0 zHJ)Arnk$pc9t0^$AWsGOYrav(ocy^RG`04;U{}hl?yLl+EW8Y;(YMSM!;aguRUc#~ zcH0D zok!^dz^yn<>!^q-<5KPxMKN?sVJ(cen(h8cy)2s9R0j}BHFb@J8l3;-W9DdxgcM9p 
zWs4=SSDFL>L>4IUYPEsyy9)!^k(}4iu}b=T;++?Dph(beT1R*wjOO+j)&RqekgAt$ z!7`y1V^5uvKriu*mTJAMt;m9202@~348t_9+zo*6QV?_ZfD*;Q5jPUMRv>yW)_Q7O z_zO@v#rr zIm({rqSVbHaGoGnPu;XcL>uLUj#dCxgbQLHTcQb0O9`IhOrfyg}!- zFevN?4giG9qf5s; zG2UfoUs`PwIsIre(>dz$9&alr&*{m~;N>E4rxvcqgHs6gMGa)!3dc5aq)=uSH9u>O zDA;`Z+1!L15Z0{&TmcR? zXH}(6RHUFf37bn~3i`psBkW9FaG_QB0W-s_al zNQ;bUyK=aW`F$;ONd!7#V0dnfQ3qC>G+d__xQKC+OvBws5k=w)(d+`zI2E!r^gzw7 z^)3dqp*@)+lIh{d$;qMB->#gGX=wWZ>h}VkQE;lbvEZfVnjx={)p?fe=pNF6^FeiFK!}AABgaX5Kax6`_f-zMcHae8JZN zz?WNSeaC2%GtU5rLg;mSZ^UGBdOlDj!`JK`AkhjnwzNdEYaHL)JEB|ezn){G(pWe- zC3f+G0ISRujYYz0^s?@O>o?WDyylm$UQKGYf5^7M5=2Yjp?u*z+VBHZ%9f^kBJ!MK zhy*UEuOg%xWIYpq(WaQVQdwIC&4UA|o|LfTS~)O=+g@YgtXFadkb-u%0tL5`=1`b= z;#z}Nu+hL4C`L&XRn}IJsmK3P%`vtIacv!LNC7Vbd#JN0fYSQTBP)oB#D!=1I3ScqnuS z2|7o$Oz=3?;mXUF?CZbrTi^b3g2A9!^?!PrEqUL0+savXf0`w^2mT&O z>gxzW5fWV}pL440L5s+i;0$NJMrD$AqH*}ro-6)>A57ej-WK8X|DmIc+c@sy52kIi zhwn%9YJ()B9(zn$5PE$d(Ph2o@y@jM!Pf&_a%cy>F^_ng>R+fvom zEq@PZ*6-u2H=U!hw_NSM*>JSobyHi-MaoEf#y{HhvGZ(ara@o7rh=!UIlpsM&CUV+ z;Vlz*PV_tS_gw$#95vVU5G@|Cw`&e;l8#Q=E#_DfY zmA~iC{J_??jgMpUTGpG*!Q{pTy+$ocuTy0AEgG#-E_!xwoD1ZL`HyD3V`k^u!ds7tKkvj16zsp7fBxIh%8PkrkWW zto94`5TTlrtSLI2_jIv&A#Ur?tkzNg)oMgmtW>Hre1A%!Q+3`>QjF7RBG*-FGMP;T z?@wQXz|28Sr%IZN5$qUp6Ybazq9f{-lg_uR$T4docVSQ~HkYN26u3mOhI=zx93jc2 z_WhCJwhgArqvF$G_u+Yh>RjD&l+B)i800ZlBqS95M6+EvG5M&iE9w=m(TA!-zoiFe z$eL4kKBDY`5_PHL=a=f8U#_VoHR}9`2eBec+An16$olN2=17?4|0#+2WE> zE#iLpf_kfB+_}fAzAgwETe@bJwKNQree8UGwDq<+(mb0YJa^4A9!%7HnUp!UwRefw z*WVlUb=}Dp;~cH|t>5T_?O0D7+sB)?J^en~E*m{KxzRDgpf5Pm^H|3$^{Fir_6>AO zTefElK4&Y6e!yeV7Ms0;ay@3C{%TOjqdCz*G?dShMh9pc9I1TfMLFa`u{=%u9ENKK z`3tUX5AqM}hdAM+Ls9Vg_d(Utcw`?39RN2w7gi?|ajzN~eXf2&8#)7Tu5&%m8nRo=kH%ien)ag73Ica^F??Ou8@Gu^*EYH(wA zQ`OBV|JfC#ud-7-*K>~KX)1hKM(tW>srR-_P@5>(ZTP5Vb3=Pp;H8>J?uwj%7V(;$ zYFo3#yI)1SRX<8?zkR0VT5MZ?eQ%6xpIckMCI{=aM=V=0%uSJVENo|1*qCnqM5rS} z!+1wpXT5PKDEe|~3^KNSSwuE4(+=qT_x*WEDC8-9vrC+0b4y4C_n9oWQm2n<{yG)D4V>)4CN#MgEcE%r6;50 
zv19)gaBhS>59_-9<`lL|ol%;#Zb?+zn6Q0-ksdqO7^L!Pi~f{BSdbP|_4ML2P=;HC zBU4TW05w(j&=J?3O^7{soZY`l9jWrP5wvDe++antR*zZYcr82U`iEx-FfMm$$=wZf zE&i7oh`^#?uXq8fqT*@(6iZkVWx!8b>{SY!)AEw^Y__cD&^tPYw(~UihC1(Imz?!g zGtW&-2?E&T^Z61Y9xaPg&cs>IjsN6g*A0Nk7j7cAH}^_Q(MPA~R=y1lJ3<6lP;F1o zjWMuah+0oAp}lcqVmq}A7!z7KFa$|nV?nk0L&a2s3TiWmLu7#A-`C8yQ2=~4z4;XY z39|Aw1x$V&Al4S_YvsAg1wZ~4;A2A4gla$+(+i)xAfGcCEKB_z0CE$wEZE)CW0w=6 zj(UhA=M`Y(juPp*X^waF@_#6PZ`O_}+jp6eM$JvJ;awaT8%(2p5umn?*hw$I0kpi~ zNI))M#h3CeYAK%{o$@;Lf0^6~@sEQf>U2;yog!Hhx%X2IaVc&jUXWjIzG9}_p~pN05J2AozAHIuQ2c07u(ifg zW>Djr$wZvsAOR$WB2`m!lO(>=#L(@eRU&T@-2#-mC1gyY|7q;l2ZowE%_w6Qm(#c$ z{lm+O*PN)wG8B-6R_}*4*}S9s0oNArLYMmr=Ne^PO#78=g0STjbpY6IG$@vIG%qrq zh?7|1y`PDBOvbqIEy7h!Phg)^E#JQef|9BPH1sdY1Gk?D`EJhy0ktcTZ zb>2-o%SG?931#i2$K1K_Y?x`6crANT!E@ly@m9p8snB@JgPP@eh=A9a__*6@UO8{+ zMJ60y1x(@zu9v}wj(5qLwc?81%EB5Pd}7(q6&lwQXeL84ypDM~L3a}Y#pD8#*5$X0 zlG#{;0R_8oH)!Pd+hM~r1#?%o9L~h=mGelyR?_F ztFEDwum@qyMluKrZmw69U{Bx5V_5wLP=E zVx4gw?GufG+6w-PYSPd1ok>aRCujuNZ17G;yQhujoD{J==`bdoY@us8={C{B3lhTM z{QeX-gx*UOKrclV%6Z|c^h`mF6;;&%#9Oh=2HXjm?>O+u*y!_}4pk)>7;nK|T>l&> zLu#T8LRhIE%V1i)KY1~UKu1-PzWkpI76v~5H3qH(GI(yEybgUvQhv0k8i9{a+BbNQ zm0XkG7v|BkkJlPUDj6%A78tAW4JfzXKCuN8Bi-_pBiAkGIU8=J8f(g}qmsP;(dcX64Hej2$n4dPi8W@?|hKI~I~ki_4jnv=q~274)DYpRJ@6;Mt2^ z&OIfuA4j#lIdCNeT#6})uriUQflx*>6B*XQ_IR2EKn5+S{?=$>{n9N;*1obe+>mFw zUHS5R>`FKe%r+}4x^$fh10GX+X2i9ue_^lnK^tHeNkR?)9BafhfDbs(!?&5LqV z3hgYXl;a~;qoWriaiXOmqYx?F6DT0e=d&)O`?keRo*!5oT zas9Reakon(0^UTjX#K+Fwze+4_pJBNCb@gMCw zk?loL5MeuNd4Ob)iycxVngC!Ht-@#A`*Y+l`hP}K>J=+?818>|HQz-*sN5P&%p8qc z(hp80dpLCq>{qqY+1;u%x)Ip;=Sfp!cKhSiX7E1^kZl1MfEv0}WGFQ!POnNg^_oRW@T8pcItNstZ;x^pgul<8^hO z4rOyR@#44Nu!5v3ub&^ASpdx~uF4#6r)^V8DN7Z;w1!V?#1 z^c+jzDkGP7kx1kDHS%*7o}qBuh@}@Zp(~70D@pR`h>I6?r3MF|DpAx0!KfN0}~!N%HrAzw!oPRVY1x>F2e^ z9ii6Mq$&ZMm}-uH!R9~w-|iIA zy@&*tgJ8T&_xDriSuDCurC8VKRE!!qX~uY!(ri5B=C)a#;C*OpQN>VVsgR<8?ailX zHH2v;f;Jkt@tggVZhFr$UICJmN=|ykP21Vsryty)x%FB>d@Ajzy0(EMQfN&tQ8%Pq 
z8aIJ{P3-4>#^qXUtbSgw?=t2vH;ec|zsArIfD8`C7NQkAYwu1>%MFKMvy)3!bXa%d zIxc{`P!r<(%X>NMJ*jf!uBp7fvvk!LhYtMGr2M7#z)ImIvg z(3h|4%@%=VL72B)k}d9H_R$Wqr;gfjY5Le40EM*x7`7D#o_FSfTSCj|;)C?0vOjTg z`O6Br&+^|93B=ORN2+3R5+8nSo?^sC4v$`(ZTmyM{h7!5`hTQS=GX_UwKODXbE&SE z;h|B^N&0dSU%y&ncl<*+d;`s9*oOjsRz|Tv&dXiTD!sMsS|#ulY0sE7%Q z2#UZAAW~FBkRkybl}?IAk)kLfNRuWrLlFc81T=;sARtN+RESh%Lp4gAl)L)IuVkP z{T;WfV=sDZx+Z#nN+tU{$6I+dqMl#>GWwSTKV1AZm8_d3R$&+)1)?k5L>zh&91_h4 zhIn-U@#_+>n)y%3gAJ|x?I)mv9{>yz(A=Ce-fs_#nzE&M9(WBsxJnB#O8*+*-5Da` zC=KBLj={izzr7C4*)*C|mxq6pK~n^=yz85wuWCVfYQ~=MCp9&#Q}Azf&5TS;!hDZr zqHN;9`(=rEBNdDk%3AxBsbZ0oVmq3DpqcqItP|1432PS_6LGc$_lMTj>xeYkyX1x^ z+DfMq`H%u49aQMFm7hTAU!&d1UnL)?@R^3Z=G9lQP3T!JVi zFCKm;nY&lg+#Ry`U1k+BzlUMCOc;|S%!y{gcDl#Yb*JU4BD_q$NBCgYqem`_=L09{ zBXr-+fmzo^zhI|ye~=i7{;H5*YszP;oxD#TeJVHwzO+1elurA_#+q&VOhq}~%z%m%#} zZtS>bPKhT(>r=s&Cud_KOpxI&9X(Jw>@bUReWOu7x;9m}(uKtMX!-TZD*xf(r@l$M z*=YIr@Ezg5o%^C`ciBG@lp2ncU=4_X;Zu`(?{Q$sb{gaS4LlCl^(SFz(iDIxWFx}v zW{jLz_Uk2M?F6Fn;3CujYrppFPOFLhN1Cdqg4Nd637lJ1brT-EttBoo(fqO%{_PX| zdqJhU$ajX*xwhVYr1x@m-ihbC7&dHje__Zq__!8p9p%lH{@EeglSHI)|-1T}|Otw-aL z_z4`$Aa|KrDE}P%#X+u7eYD|_2gp@0b~sIvHWZ?IvHxC2N2eJ(4{Vvb{8H_}%;|zd zD`RU8S5^;9m;AEbhw&j3F>aFB3F5LdAy${vOZV&6xS?tYBJdnL4nzFdQre> zjB{(>|NC}+m|kW+Zy2i zq`Jb^iQmjOV)B=cDVgY1p*!0TIU<`1zWQx_~8nK4yry-&sCIQq!pZ zMlvPMdn|gF6!UV{?rYL=j}i?EdAH|IF#~o)DUV_&IOp+2%Jl~&^Fz--#t4*UzX0Ft zDsHSo)IPHz7zxBpC<#{s55^aLZ;HRUl^dedEYXYYsXSh;R966PgDkO4G1HH~bd9+u zCn1e;Q4dD7lFT>EI`k2sbu8~;4am-=c@{V;*c%5R1c@S#P9o;f%(ajP{;<<{@bop~ z@!vhQ74(b&m{1iUX;p@_MyLp^{mp`BSb8zM51-b=eGqlPj=8XB4J4)*nW*G$ z{Kxt8Z$FbHV3|M3LdIYUbhL*Q3o~x8S04+$ly8>u^xzJhQ${H|&(%~O4e0i71_8Jk zQkK&IW0Y|P8vLQj)Qj3Lr9-IK4&|;yVFl;e%ka9`0=d>Ff=By>@5EMtf-n>ns!<`& zThQkF(|`?~<;827Od-W|RZ?1^wQBzL>s_aTrLMz%%iQ&zt#{!e*J^@sXoqh171Kxx zhW8%n=>es@-NEADzTaPe8T})891G~0r=Cy;CY5h59CeTqGte}WL$^q`$#Mody z_Y6&&KHTj|hy_81#01&(gl%x>B0H}ZJmn|Ay-9LRXG1X%!*9r;qw^-$fB#(Itfc0^ zBegIvqnJ(HG;z#7#%iKjwl-zoik 
zw~UNTp=9z0Azm}Uta7^%_G9!|VZJ8MX^Q~4#3z$zi)7_dOTG$7VPxYs0+bPDKZi*$mvjgrtImPPtUWppQwa=wQ$zSeo>Wv<_=$pOvQ^m=+FP&g@8{f`$tSCTzHit-_n|3m{ST!7GCn(_5HwM8!dU zxFGJ49v`Ig0#H!rOesX|G8JN!KUJ%Q>5vr}bht1UBuk73ewQ=-EE(j-UY(_EZ(JdU z2}@nQ6qVK}XvyEf)T3WDYTe;Sa=>&h@JENoRAj4 zNbVEeQ~cdj#xL6~kyJsB>yqL~YJlBj-EIAZxsq)*6849wj7!RKcsR)l=&gzxh-7w@ zsbGorBES+LdYgn{H?}{Gu<}~0~tjQ zK&Z-`TC^gnQk)+WEwGy7utrqsF#Lr`tl-Sob+?b8N9P9;(c!lgtk?)iKwp z(-~J+?Oty@{Mf5*Xx>UARI-miVUQdWD&>#DMPeVZf62Dkt!_yjX_>0N!gpRqP!nFq zp(fCyMf=%|KO3D7iQ#3y3QiG~7os`%t@{ZGEA^oea*{*i&IP0BiUqI1Yn zg~?U{loYwgQhF(xm8Zs-+pu=up}7?#OxGKEukLcDT?s=3JSE+d*pDzBe`gUNSY zYvpBbYC4CwWt()EWk@xQ2qt8ZPXhGYb(42kZM#lj?zM2|9jGj0D>`n&3$q!t*}WTd zhb-H}OZ>G<~XSSYcl`Hx}4`WY^nTBGF9&=j1L?@Px3NsBQ%$(x4N% z{G5vAp(G*THZrlVDKTZdWc5F=qoYCMyCp%9h_1@!n_D%*Ws3 z3=)gVq=T^Qr&?2p-tO_}MD zJgU0`m{#JdP3<{#HbDHp0EoO-16M{2KmcyTaZH}cs7E3~E}UVZ6S1LwkwR{8|DCMFF!N|tuGCPA(dY*^OTy9 z?1AO0J-GcttsBWpJOfImqQLo#ueMZ&X1q!Ypc7M)bM3Z-NAn*Cz2qGFbotyr*zOmD z^H&Y7#|NjqLQn;{P>J7MQFCk`E!|+@d}VoR?ph@U#*ck?LZ)tjRs9S@3t1{Lux#ud zYy|dT8n_-x66A_|52gekZ!#UI3~wXrP80%$y?V^;H(P){u+G)THPC zRQ!%W~4YLK9Iy6Z4LSF{fjfg-7e1g1CDFmasMlrn?o_zP|KBe85l?Q;xvzW*!oO8BdGuZvYp@AET zbR6ARBI<)|DyzNDyi=pPq4DZ~adCEj>9gA#OB~7?)l?#3p-^o}IIbmRjU`Y2 zv#?ihx$!q}-cmSFeMkp19fCy3T)oaM*wrk5g0W+s*f89+Jrw+?FxBJrp6uAy;h`Nc z+%%UU*)(FHP2FnspSBLsQ>pRQ7P*u4$AUm?Gasb6DUm`3FGbuC%~6%(MtXd7oyMTP z1{m=kb;uI1UkG~-RzyZ_cGW>xNfZry2zsqPA*G{28(0%dmi~dr`Z|F&r*w)1fV#D* zsfSt1v&Tu22ISA!y|J_)HGOM#Y2z*XPnQpl;PsvX1v~5Qy~%cp8yG8Y_j@!`K1H^m zYC8evO@-`!T`kytR9{X1j0WDa?9_>e?T zK>(uRL=}+Pzw28r0m(l$y&J;4H5ZCsKTME2+KN)Atu)C1jJN(<;+bZF&1Vp0 z9eFMH(^_CIvZVdaocAH)i{Z71@LCSyweT-}SF;a94|@hv7L#bl;QaxXk&$0_rfckFH($*Ma*tBy#9Ux$@`bEg43l@`p&_ma%m z^^AplP@mP)4J+p+=3owV0NN9Nzd6x+NXp!H>gHPTy?&@^WIns4rW1^Lt!PC3pwzpz zDrCwJ@>TrSoK}KAF&Frdf0~mIKxZN1sZ8HAh7K>8PX-RKZu9mZpwv@KaEa|I_cnSF z^!5NF$Ref|k987;R{`WDCOR2iT6B%>$t+BfC9}Ba!&qSacMg>x!sd3lzc@J~<+#}) zA3LClJ#Y(dcaQ?|;E=8P2|}boB*)fGR=1j%M%M-LqA@yX^&Cg7o$~Cxq1&l}` 
zp}W~}_zR3WcyS+Ev?1=psP`tb+S@(zqwAe_i3*uy_i8=$5trb3`V<6m^CKR@q=Tox zy-@2Oo)4^h&wzG2ajGn_`&^rtZG{a=ht9jW&?1#s!7stdoC9`dW;)G|AzD<3BMlE} z@pHSQ!WC4Gw|j*yOo77`)sc+@%a)dxu?H%acX?#MIMG`>!W7}J}h$z9`MjCGjm z9^jOGjx0Vu$PGI@#}T4cKr^*n?*vH*K5O^1V65Nh7qH52fOgS3V}zGk|Uj zgY(dqje@dQ)iCN@4^%Verag873|fFKm^@{$2w65KKDusOPft8{J!j`%LgJwJh7I6N z%mGr*&r|`9a-O&HGZ!L`WZq=zl^(vu+DKoW|Nh$!w>w^=whrCo@eXs1s2$%MW1Om4 zoq%G$&F{yRu+5Uf17^v7gJ%u+G$ULD!-OqvTi<=#s!d6WTOf19PZWZu_ zC!7dX3f<1x&B{2P>UFNBMv7~($6qL+e#w#{_)%9Z8Ak-(5ja!6{F8FCxRL6ox4 zK#gaJyQgYJ4aWE;QOpwBW<55QijNZD)!gnuOoiJ_b?+HyeQHO)Jm)Qjr-#h6+h3xV zj$j70u@pa_V4RtRF*N^AjxuWZ2g8xT_II6(XQvY$VMb!!3h~Z$&og$lGaeaB76q3H zWnYAgP&(}~ADF%Fh$-<xaOQO5-C9C7<$aQs>7d5>vS{IX; z*l#gJGr)UKF~GiF+rjurl5MQ|BW6j~{(2H8O=3;>_n!dNOP*(Tl@VnNf15Kcvx_nT zK@*6ZXPM#}G?RIhOV;jBq`d%@Ckhr^=JD|#>eZRL?DPP2sz!Aj03->q_jErCb&qoZ zqC>*6#GRzYBpVOWidSUbm3IJFf3g&RSD#aWcE5mwMDZ@jrqOs@puapP2aulyNT&{rHHo^bb+G*BS7(1rOVrd(hd&Q&4m z3;IWXS?C9X2kO-r+D@Y(F$wq|b>uM%w!Z7<8D7fbKo?bL!6J;3a5g< z_(7g3XaaJc!Kx#Tc!4wo{$Di%2z%Z>b&yIdkP4aE3MbyD`#Jsi%(yEcidIS*}7rnjNN zIG_07D??ZIjYzdR)T7M>U+X?78><);@nEFU=?jFw&s|#t4hnH zJVbg`pO9NqIQAo1dnZjz(m3>~Fv~t%h67;8=z3sBy$p(6QJ+h@j%(2 zeFcym67p$)*zMoWJfSC%Q{TgxJWz9q&_y~iwrudqqrpj58`4Ys^a!J%BwnagQmb;Q z{2=-}#(lM+3tirrD#Saw<618UPH7Y??a`p<)+O^ivb}r_@{V1LUfb5|f^QYJ)nF=@ zg{2XLIC!N<9`JH=_pt|y@{I5{hLlT+DWbv&dbrvXQbq-mW>Ibhk6if4TM>I|=gGLz zi3#jj`*Z9ww*EK|iT|DZpdJjB%#g>w3w@Dddft391tKEavQpZket}?@?itaEONmYb z&dCyrOxn9-MewXYN_lbJ7k(9Z^_;bci!s4GpyH_?NaKQc4hlb*;v)B_Zdk(Am6jgz zme9iy&~);efpG*<9ZU)Wx(+OHYy`WxQS~Ru=>(aZP{Fp*Wyy%botwX-IF%YiVu#_L zSkWzoTXJ$IOZZbg_E8Ja>a*l3#GhG9^Lmh7cNJa_qEn;_ViaQQ9YApyh~dCORAqWX zh2R-?OFuV{k2N}je@}bl@+4>GBDsk3?k92^#P&!C7`lnGCuDa14$ahQT7E=fC3dkU zq&^~#ZH&V#`bUJr|_PAR;{ajt!*$h!e`Zx}Jl~ z{~S&)@GAvb1D24V3UN?UDsjy#9p-<@r2#6DWPDBA*4jLbR=vWU@kI+Tm3sj3Av?Dm zu$9@88_b`>0aW_a9*90q-g<>1E#iTcH#8TIDgnibOyi-!-Zq|;LPMQLGg&eNl5|&x z;1|SGW`fP?T;eC-Ss9(Uf{V0HO4-kzF-C_`+lx^bnL)?-iCB3N?JcyfCYfxegO2<6R`Ndj#pXR<1tHtt+?% 
z=Erzo-Z1Mzy4s8}@5h%`tu)>}{pUoVyse6wD^ktA-dmlhwc<=_nnrAoT$)_k*6Mv{ zrp2B)u=Ani@8-YA%li9%%n8%@YKywM+f%cC?@caaH@eebKj;sQcwX6&)80_uv98bk zO?`dnT%`{8|NQJemoUipv6^IDBex>B`#mSWW$%Cf2lzYnF>H$K9-2vvf5YmG;YGY* z!e%U9_05#ya^v7s1WTcQuxwH4s0RlJx4AQEc0Q`)N=T*<`I^jMn5ckpznm15WO{n- zpfqOYC8^BJaA_RO1yb3T_oZIs&QfP^|NB2y;BZ7&(;ywz=9JV4ZpI7=sSR@-qIyca zsEMR~WB|H8Wk@y8IYkl7M(PCgfe&81V>~kTtJChcM0( zcf{Q}6!dGVtu4f={63nX>@Yh5Auh>pBmxH%fz`un7$1>Ut}XKTQoRL-g#+7-2glyR z)hN-)2U)*3=q=Y-Yl2bzDkTa~BRy89+N(UWZz27a#OM-jn7c3Ij`P7;uJi-Rn}~-A zVt6{?X2DfLjiD3VkT4P*4T^n-m$-X?{zCkXLJZiVd>t7-h$Z*Y^{`NgQWsHYK+w0) z-(2Zex$D7(PEEcGlOT~=PKJs^Gy5g{YsS}GYCplayZ{$UGr%W5a+X9$yEXOEEiH>qdgin0 z9(Cwet_9FVYDa^gRxv(GbQ_WJwTwIAG8;(b+_p6SyPi_PBy+2?2WtwbA*|p0*zeV+ zwG=NHVs>m#L?!b+o=&d7$H{aRixFi zTE(B05IsUXBReS}LD8-@SD=@;4WWW$6v=Bg4Cb1O!2YE?PqM5f%j?n>?isX0w|6NJ zqgRqH3vlRbf|sx*I2&}GM+CwY@?+nCg=>7Rk9qQ*6!JirIYLJNAr@Y)g9E{q1_&{n zd!-YT0#hW__YnJ-6mv6`?VqUj2!|R#GHs@KK=PID2UDka&!dG&fZxuFo+n%i0dz-Bn5rZk;LKG$djFM&)=?4Qq*a?83ssgE z<>X`Hd;)g(&Zqwe#zqYg z=SQ@ZFcy`M2idV=X!_o53w^2)_GcCmAU2(M#1#HGrE8k4e#CWZ#1B&-PNWp+md^;u z_{m8YO>TV3wLkP_ERC^KoJj3d#i9CykAGnIsg+o|0S3AZ`Yq|(XT4@=xc2@Voy?hSSL#kOi-KGTyNN-lf zp#;E|2I#$9q@+llBYxyTl)roGhT)jKlZ$V;HiZf@X~fGxlr`$uE}^_UXm6@Ggd?>y z0LA;;9~}n_XHOk@$nnY3vxuBHJEYJ=7f^M$9YP)mwJ8EmK-BRT6Bs1?{f|a5wxzXt3XymY$woT?dj*>wacqw7`OX0enVc@dEfFkn2qxJ&`U@#UY;{?mXRjny40? z)tJ@21tH(x#79KFd+6|X}jSpICIPI

%jzb zQI)cA=w%+F(RA;LX_t2#FeV#CC<3<`I6>!CM929V9@lR_yvH_Bv2bgUv1NRo5os9+ z&Ka6C%3Rp-Tim1l`~tA~T8!=taTPx%Gf#>|j`)*n;x}tj8em*maH80?B&5sgft~>R!q2Rfj8dAZnEG7`_6313NH5}5u!^^ zlcs^UO))Hv|-O!H` zqf$DpuUKR2OKJ!R-e*&GSQP(2-11EZy*$uOPotQMtC9lShGD!sp$7K>DFjP@Aje{o z#SwcV4bG}X6q80J8{#E08WoTxwoVdC0EdA21riwnK-;p)E0GJ6n?30AC2K>m8gp*H z`*K6OJFD{u(EhXJ=S=W`c!Ken(lr9PU&YYl#I2&Bha#f5kPnYrB=tPRttXZwMwK8m zc}yj+05VYL2p4&QuLU=f%L?HxrbIF3g@R`c+krJQ(rCrFKKz^=>Eok zBnpi&=4v(FX^!tzld06xD|8`jeobKu=cMgu zN)bFYkb=Qb*f)LFf0zQ{tW93jnZ? zT%UMDv4l-~c`^QcGzb|=KTv+pMwA#58t25>_SfMjyU@}&ctEFuJYXf66d2M50IyB@ zy@J`UDSqkDw~6H+3!e%S?+^va5`_V&ND6S!8s;~i16Z(c`5A|~sK=a2mt7-q$!vRX z=I3#pde^p{Pz+*o7=(~T4ehiOU`|_tOf7nTGiRxx>QSy91elgEWAPT&C>N zdXS=~kfdys#|glu7-@QfDN+I$IiPZWFMe6Ihzvxkna^lJh)Ksgd&U=LWt@wTsMFu1!LiB$K*C3JYi6u!F^iTtik(U_3Q_MEK0P=}J>hSWW z7Uqrf@h+C|B&Ll!C6O~whJoH<3WfFf8M24M=CC_DE_XT;TycrkmB<%lX_$t*2F0O& z*LeQ*JkE($sYDhdW4>HGMUt-fX5xxA=>q*C{XG=!q~Mm!d;`dEkL-un1L2a! z+D*qgDgvT2o&52M_)CH@tG5`o8D=)FtB-9~BQu*3y#HNh$B=in9^76_jCt&Qq<{vB zn%HFzXy9@jmN$~j>bnGcbznzVEl^{vTu zg8@Ge;M6LfsySyWC0@e+?pOY!D#6h#wu<4PzCq}lD~+2yUy}rX{>Y!7^q>n?m3hH@ z2itKDZKJbHB2_?l81a@Hvnov(CdfM8<`MwUl>pVwL?TE062%;jFV(?UQlAz!L^0K5 zJ7I|emwPHTU8^ZSA*AyVf9Aa%DHzFZ+650A3R9c?&PmFN3h-wZIR+3fvF!!F^PW-Y z&xWst59V-=kHhp1PpOC=+0+$`g}lk~pvmXP#_?3xTavHGsn~osubNP4br-4s9%n z&-t{LB!MS@_fd5yb{k=QKXnXj*}I7oPM2z%6JOA5B=d(6ox@}$unQD&GAb2$G{(fl zJb}c=AcXv7c(3tqPXpInK$1K7)}*Yx9~`MbYG=CRsg>mS&0+qw7q$`TfUgh-pAIwG z!c^kwoq)wi^OxT5f5>Y#3YBxGXcbAo+E1U%Ggi}eJC)&^mO@Soi1nH6A!WY$iW)Nt zkf$3W?=I(Cl}iCSnds*T4S_6gLk!r|{%Uokq@)NBOPK9a?s2cHAm~Ae1`}rYxr7-a zi+CnvW!-y@V**!!IU>LS&+zbxywK*6PtSo6T~JO%sxI-?+TOIYz$8PslavB<1P@vr z#xE~b_4uVdLej<~4{zqU*kU~R+GmR-hBrP!azGwryTg+6Z|lc)GLOSJ`ciQUdq=+R z_XfDTjv(DFuC_l!&y}F#Lef@AiLRlqvTe50FR5ZwY|CN1A8sscS+< z9A3^m3|dLg6}r*2SGCE=&&_Tli%*rmZf|AqDxc-n^vsB&*tTqpK zazJH`4m9KOF45N8Y;)av=8XSZy$r0TeMDt*a-jT#1Y0=aC%Q;e51ONb*3r|`leB`G z1~u68Ng^1dRY;RiN?LHxQ>;z*V$P<>#dW@?ZUao$Z%Cmf(8~FaP=brW!Q5sT3mhUpP9oCEx@Rm?kG+5uQzGEm;!fhZ{Q~#VGPN`fiH2 
zX_XQmzMu}T;2})%(<%nEIBW-~z9i8DVZM}wh0T@)#_EAjMbW=};Y-?1M`>#c6c8UE zk5Oa^s$chbKc5nVhA4cROp9$Mt%BR}a8+S?BUcX?a4-xeUT$IY0 zzvsZ_v>9h*iO7%UR~vB|wJF5McWitWcY#O-t`SV>=>1K2jgsmFTzs=JPwvzt;Mdov z_EU*isWL!q6FOmbl(5;aq)yI*M@opJQqa)DbfJ(Qa0JDV?#BRAuLCWMgxmht2aw6E z5_%YB!4eG}WkU33KkAE1b-e!jMCvUx(OtSodEE~)fXZYMA005FMX@ois1Xx@8sY{ zj75<^Jfm&yofnu!SUw3fXxs!ZHSRo#NdgEi+9wm)n%AejT37Ir_j_?ndE zXwg%rdaoUKA8YMl?!=v>FVFe&=gH`nx%`35o*jn4fJ5?nWOajopGZu^h&aVsQ_fwdpbcv5lO=Y&%O?3w-sF%Z#7t z^D_?RXaI`6o0yk|dIUaQbZmesyJ->g2Uo*k_=6m1@udIJ%#>j>#;;>V_e+zf{l7Z` z`yZOD{huGpV*O7|E&u=VY-7wXt+tp40-Uxdv;R*#b5kitW@MZMs7N6`O19QS3`U8+ zYP_CuM2V5xXXcaPvAgo6wAr`TFW`R3F@?0k}&m#*glFvujsdc42Bz_v0R%F(%2!wVIY>dw`AIL#j0jDFJ_Dwd^}7hwVwB0|2)dfRPu^10OXD zbbX?5^DVVbvp+K^4_CQSJDhBGTbUv@Da)Ylj5+tBTBNzq))P><+2cLu;ZVM{Et z`wTmak-g8U9#DD;shg&e9Nh~tk%F1mhA^SD%j=LaGM&0Q_wDzt8q2fe29k5VV+5J+ zMp32Cq2iv@%{D%oOxSk%+7FTr{Yf^m=m|E-WD8~r1v8Tc4lEH2ivmE|Nu7}|@Go6R z(i@oOO>V~AExCp{P*U%MeE&~!CzPOY85tQH6@wG!2#4^?TLAeb2%Ca8G9ck2vYKd*%Gzjf`muE^L<|sv{8r&C^y!VBy8kr zWm{?ABNW6Kg&r`L86bCJb^~)^W&jEJ*cQz$`PFTOC)8k?eVU*^BIU`$@@W)LFbs}w zF@EKOG9lCB)Vnl6|MsSCnnr>sUT5<+A8|L$!Pquj6%DRl8>TSu1k1q=C z*NJjLn;@YRsUdTYECeh^-uLtj+!@~#fNrkW#VGu0vsa0jkibljYQ~dLcR^L{LowRu zbj?=($ki+Su6A91k3WfUN#f__)v;g*;Dio}LdvJv4dUH;g-q(>)^`J!#| zQN=%iM5mbyuudidgO|4e659G6p@2rQ+qqlG<5IBEyvKC*k*xi%L3>wfbRk|Zj;fH# zONl2u@vGaaRx&@U6-{5YMg&<|SsUfJDqcz;;$`7j5M>OYwXd6O7au>c8E-|Qaq$IM z7_SixLAYcl9A*3@Ld3cI!1#~ikyp2WZL-~X#xCq>O#jn%84KB;%=uA0d4SBQEWsbx zX_m^46hi`9=cdc}S>$aca~Uf0q(0AS7k7LjB-R?oUa-m>TaF!Q^tmBKFnJZj-2gIX z{b53QU_?KYPc+HkDYzwF0Jtn8mU2y;Ut_@}^#$Va8?i=*ZW{c09Dv~Mis3Jj+C|HQ zJ#ggB2RQQ%6%q%60y*ilBpMD8JP5)9eS=*4s?d5pAucpEz~m~lYeUz?oD7F6cS3A> zb18hZ2P>QGT1i1lcmw4SOYm8+{rB3ZZDr@)BtuxVKeNcX+=Ki*b8*V@v&oEG;!xnS zdQZBXxbHRWcL>iWgS;nD*iteCasNC58JEn%7?g$QCcWW=A$vW4fnczGV(b=R4$~=!hI&HUT1@QxDXtTX^+I1gqeeB6Mf0~7f;U*` zF8flGtBfb2+I6KM6>-D~o{x!f$*aMe@v{p(u=V89L6t_pbmYKQjIGm|DG5j>axFaf zmy%gJEEv6}5?HQyNv;|l+bCP+l;AAUa6uai^YxmG4ekVSY{IwZ{wqH-@rQ<(6eUS_ 
z?$i@{?gbJ}F=T%L_fochoW@wiHXD;?J2}c4^R6c56rCajj~%#VOOhnhP8m%4L?K*f z51Q_UcbDQPG;1ZwixUQ@wHv&+deeB4yj8M^W`9>RQ_`H`rHJO-5p3*+55FYlRtI_B zvX4PaYQez956dUDqDSZ(5$N5uj?DaO%1 znsmoPOpXQbVM{aJTFT!V`*EHEq-NrbDK zH+SoSsSvYUk}yPoer<}`-Y$pZ(20?7cP_E!4iHWQE<#+h1^MtixU!Cf`yp;>5 zt3-hX?3E%R5>w?8F1-%v3O@toq#|9oPcYey^&TIq0>EaB<%AP&5$($p7~SXxUd?&( zgfvW`vZM|$Z^sb{1tn)UXWGOaF*`DQ0FNuQn5f=Uguqf_-d2o*VeOlyuk_+Pw*PDi z!|?rBBG!3k)4tpQ$j0DpryBW>i)_3RjNfaXlqPKWy{V5_R9-6{-icob0P*_BZ8^9NX z$}-?zb0plUWKctB?5E{bTe-1MhXPR2z@hXT9>to|+H_oNH={=MIaQVOQ~$zGlueDKe?jT!rIB35GPYTdDliUN zB(yfF#ex_v%|>lAn_vPoQ{oJ}diipEGGfPdE>bDRY(IQWMkcxs&8vgtgmX3Cp451h zB9K0@C|w2cn`H7(JFJW2g@zd>o4d&=yGSfo%ftD!4Lia%vl~ZgxbHrz$+|El%-x01 zcT}LA9p|;CH4P#5eHz&=B=avnsn zIQTp^r7Fh54PBi=ihR{|LQSU}OCiE1QQQ1PWXp=hox9DQF3K@Lq}Hb6cg6?Ci^ywR z9B~}w_N+SWjtX3NBlE1%VW3?^J3w3oMKW2PUve77(Of3`Cj%>rZl~Ij;+f17*(=Xj zJjY?A)X)K1$BMwk+-I@rAAYfZL1zn4Li6s8pxzNSE30kmOGZi5i4=+FkG|BTf@k6e z3!?PnW_aLs1=Qo%ZSMXU&$gJXb@5!Je+I(CV%Vpha~j3%>5CJ%#xIlB4U(CuuEi3;C^X(=O-yL;SL5fVjV=mKMu8{|8Ti=83XYkXt@Ab{>J8>V{zjeXzDSyw0y7kSw4?~MV zHpbQw8(b@9-!pgbl7US{S(X~gQ9|BB(#+AbFtbtDvYJ_|xI55X_D(JyC@* zhgH|ERc-ok_Xa#K-NJK0J{*u*rd_09=Wr;#XDu?O8G*#ev?4sa!c*vLFc*vyJr0L+ z?b=If2;ke9=rJy+xUd~L7L>8psl=0=6T5b~I-NRZ{6mHseW~uu%fM%j@=Z z%(Dd73hX|A1j8(*PKfiV9nDU`hAxKhoeWv5V)xAEKeh7JkC%6^u0^PAGv;am6j6a% zxHfd?15d%0IuKHHHIG@g;LwXcBW1m_fsA*h|?X*@*A6yYL*hS7}f! 
zn+kLCF{+j~SSn0^XI@fAXgEr8POK}blyT&o#aEG|XkQEp+J-sudB-4)*F;M>JqUMZ zLi5hhcTjy-RXj)U?k5R@?H|K_+f|g=3J1u9or-a&^2vOhvLv{aEhD0O+;+ef7r~8` zbw{(7r-@Q-Cp0^SwvWLes{YRoMn7Vsg`G$agI{6z)9!Yj?+a?pYeBlb-C+tN^lz7h z91~SWBkNH|;(O0R%QN7_6~djjm%M0h;iraNG@tPU+w-X)U4sY?*>Vc%qY}+JN~a@n{ZrsJdgng4 zD%Y8T*=2j%Gk{v9SvVYSB_T-DN?u-mR^2C4WI3tz>0MWr`H11RTVoT5gW`~>_JqCu z3!mnfbPv;bfd<#H;SBb2w7dH)qguEgra^<%*U_U*9&F<{BVpFrr%lB3Ju?c*^lPo@8L7*QtgZUTLbd7fjaT-imzCXqD3N6Lg7}3h~;`iIwT(%N!=R3D?i|$@kmTmifM z!<{1U(K_!!GzhYz;VXY98W&0ZYoC%^0hN@t13Mz4bP(okx#L^2(C-avPR6AtcN}=5 z6BKQVNAinC>wo<z?@)!#EcgHc}f;igl&pO4X)8@b8rIzMOp_;M~ZL#!k_q>K||l=4k(jM-2P z`x4J6b8|k(ONY$5Q4PeZeT?@!s}5I{!2fug-5XHSt<6%8FhRSuc2}&X+)tww5C}dD zAfs)yPe=d!Wbjq?;t|mTs***+T^;fWtT91U8U5v#x-)}jq4_)^I9~PO>XJ_x>}PL zB~BlG;~KcsZ{X9)3mqlhk#z>z?twl6R5`UA z!QYEFF0Q=Lv?=vk`_Q}>F4O$Wo1WeC3cO!n*}HM@d7qo@vhJyB%c~xalslLsS@&N* z>C^k=;|WijOub!3%f;*J+<*1z3`lGO|Tr7cE$LaQG{;=iZ8?{hFJiUZiL^jQ?Up=1Iba&8JJe zpEY9R*KHUB`d<6eB0pY?^cfz1BufpYxQWgj{vP)DCj*6NXA~o=%P#k~;#Sw*+Z|#= zoO`>jb~N#hqA({4TC8!ss!H~s{T8Z&_8f!+M=p_k?Iqn)&t0k18fzcAB63W*lRefx zwtBoV@%mWkcZ2g|KcD$sw4uwEoodSm?W;F7F0w8C=VJ-LV_9;3@>u&fZ`?JM|M6H1 zv=O7b=SC)r=pH>YQNgKR2lKrZ=yq)c(f0 zV_8DyyfsEl;{cya>a5AVE0#?dd;8nwBxSD}6jtx4y@F?>pk}P*WQ<_)!5X8RyE(n% z{KS+SMl&h}sh$0)tIjcCWc5SI7$=veCw=Juk~Hcx9|2$PlK2>e(Nskjk_pJQyS3Oj z3Mu3cEs0c?Edz=+W4_n>{lRub2YrU|z^ra}+IToDOxhX%fgM0^R%J}_V8x_!@)Hpr zP2i~TaU+h9fNqbD4crt$8i`1HFwcA17pNGXyA_{3dp1_t@fA9GEi?8fRq0#H)}Ab7 zfAV{fkFmqIR{vV8r^nm!a$k*knm83VPL&thDt$O|$Kw7MRj>T&^o{D0_8178@_chO z6`y1GJkHsu>ot%YX;SO1anD*_w$DXh+ijDpPUMn1?&gTK536%`rCEZoa))B8wS3f~ zuu6B0ohSbB@5{xT?sdq?4&Bh-={*qEB3kx!U`b!=@k&vNyT*(Bkl`eaAG{5FRqk2G zCXD5Wv@0b2wO1V)7mW<8a~9uu)jv?uw8T4V#ItvvUeE4t`}Yqo&K^`Dnme{b-ZjxPx<*XMQ&qZV;^!HPUvv^B-_7t z-g*Cl(?#BaK0}F>)+?j@1?`@z6{5=WB5lh=C8EzJy>bbv+{W!Qj|{EcX0DsrCTIdy8XuaIs*rsZkuXOR&JZ#i$iAE{E(XF!FHGIfBcIrSLuvds?gnbaIfpkcSV9NIdVml3-gG-|^Ou#O=(LDR?(^9Ln61UGu@YW|4Sib0jk>sW^Z7NS zrYOIv6FV4GBby2mJva+NcO!zA?=gFPWLlG@2QwzKWsGPZYO?5M8CtQk_XSYmH=VY=#Wu33Mt0| 
zl$&!YHq`@zDZZtIFcyb7Uk^G;4efIpMFdNbyMr*Vi^2`4h=hagtt}PLvBSyD^}=K= zc^9k4pv9_oJ^d=Zt0~*1!{%x2(L5V(n|SKvWk08>JO@SGH=aAY{D~Cw^d+gPK;Q%SH}(%v?^l> zMe%fUx}!(S>^X$we8c5mNv=_o48BCkA~6-1q{6%+NX!lmxu z_|W}+m%~Xf85CM|ZMZo6NtQ}ka0rgg$~r+TZTa7>h5bwl8LcG|)4>-gbGzL-m_y`x7}cQ;zc1e5A9>pHRTm1HFBA%Cn! zkt{wXUh`05VOe6(LI`64l_lFEmh9bbOTpggthcFjV|uby5+O?i#H4{H5CIG}$L74Y zm`~*sOr8&#BF4izHk<)pM8{+Bx0?*pBqZk^p@wsw>Z{#``K!z8X^BAf;{>*t;q2o% z(9E?UA;;!jC>5+pB7*iW_8pqcae!!r0FADBFv5H=_^hVuHh?MRQddT$ZZZjw1CPE1 znMbVQ%XtJZ^Wt((usw%8EW%=mOAbD~URM{X*~u5r9&1%4R7jXMmc5?{Tr>NNeO@3r z?D3wcbiT{F+StV#E&QxHpbwI?7OCG+A}%Jw#lj!H+{BrefiZoiZyKLgncKjdx!4E3 zs0axM0F9&K@k7=a?k$S!6XrdqydrGm2eNUtHn7Kj`{(aHXs0;YY8Y_k z!|rYtxhft&#lvrEIi-jQY(F~$%UB`TlD~zip?>&?xLa~w!5ZE{_ql~x+_v}Jbh0^= zwEq~}2N2W%qyFCSocfLf8K_< zu`TGJB+%6UV}wnRqpig7_EVF^)IAe=v!#*(n+{6E+F%@JZ5t*BaM+NBIE@n3;?z5U zbA@0XE2NCqd5;?X63NhtMavV7mof%HaY#s6a#fgd^sD767_r>W!b)leARY*3)mDev zOMe2!@Y7zEeK4XUCY_*03{hrTNAL_=ALAW-1&p{dYYg*40ABdxvuPAiB*yW&TI|B- zqlC9I1&-aJ-(k0uC_F`(WHFCAXZCAT<{5(sy`NlxkpQfnnjrUU@JFG99fw!cOe3FB z%3RnA`bQC*8Xqr)Of!eXgx@v={aFon!yu@7QziG*||1qR6QGW%et} zLB;(>Vp*-MImbyuQHG@3Yg5W$t&V$>+wFqATSTp_lP3WC0PN^ALwCZ+(rOI%TA5PZ zatvQ#L4(YO=hH zlrbvM)0``V5+udh6K9fyokiXQM1Rf<93YQ2nA~iZxmtPdLIEb$^4>uJG-PKO{-YB$F+Dim*>|Qsj5AL)|e-*je~O`N_UtIB)HU2aUs<=K8Wk z>&WcLg=M#XAoEk7A}It-f7bZaKn@dL`S{UBaG%iY)zVnjIE?wSAescx|A#N z)O3LfOJvwVx zy%m#`lvJ9DXVix0br4S1Js4qZb8#k%Mw}~+Hp9mePe2kIxN6PCq?-cEu+P5&%u!_= z^u7ksU1i&zGPIP@@-(sn{8cFd<-;o;JtDs-ny1<9YPjhz+xz_=^J%o>SDQS$ziS^D zI{w&T8we$``JBv}2j< z5NwCKm!`(mX{~nC?!t*p-vb>0f5BH%x?xhO3%xmt!UBNY5jm8lNeBR$Cv?!Go5VYH{!5qdm%jfsU6@d35^bB85g?O z9G*y40Xc;H>1_}Cw2mgcIgIE3P%*OtQ99<_Eip$wm#}?sg1%2hffD10_Rb?zumc*#6lSclv$>RgBv^B3kvKU8)Q3$KCi``Qhr3_0T z2ZlU?)33ZQ(e(-5<_Cz&Jy5<5D z7~#_V@@mls+@&g$A`_q^PW}4#9TpalC{YT41VYB=qj-qHd9xVLN}VNq$m?oQqGlEy zp{MJ6_R6^hvLNk-qaI0vNv_22=zY}qU=%TbiD;85CADIugvvGw&zF@%LCV6JXy7D- zi`!sWexxa>WvQ&`45~%7OljKeIPw;#mJbOUq(M!1BvYIuQAA~+xVJX70|4$if}gcL zsBv=w(V}OOe*6y zuo44n#W~E*idC(@cXQf3>`n+T>ht{)sP73Rk%SDBQyJJ0 
z7ORhP$$@1QcL%qt5eKC+jZSFpD&g5v#?&0>vZg3+V7!HgxiCg>am?-6_6(B~3B?>> zhC)$_V3L`Y?*Z<`*DAP{dPP*un2~EQvA_?2Hg9vTnDOygijeS?S3_MyoNa%)%kwUC zxoN_TcU|ev6Jq4NVA>Lt*MD z^EJR2=h)eCA97z^=MJ4{({?~JX&2GGJgNme4L_RBqo{tQ9{3gG+b%td&v&23@{%yw zH`z-F`}xd5zVquG0W{vtW3NC}w#L+xI5I>tnsSUe>?Ojjl4L?P2SkxIhabg>Cvc|B z%@f#wQq7&0I_I8**Z`2K2T=C9(m9OJK?-f#i9s)GXj%=AdtJ|{kFHG%jWK$)lv}86 zhVQdn(r~gN9Sva^j1-LYOh+!z^hDRo{?Zb#N6$0#60Fnv-=6!q$j9- zromt;QQ0eaPB&(Mk|=o+|6v^be4heQOZa8De=hi%c%t~?_Z{**93P0PPn8w{`BTW+ zGjJ$Y^miF_-Imj(4*k9bQ@@5rV?P=3|(%r)~*?wxnvC zsZ=NVhHBF-+Zr}THu`l)NfQzIJBDA6LyfEs+WHLpL0xE<5-%$8zj}bCUFf%6zPKMH zv5f!}Y1L}3p0hnVOxIB6wSNCge9d&UxfGz(>Be2dH#=4@A0_}~d?e5E)wwzVcYRW# zWF_urt>BVZ0X6@<`xwaUygZ`Y zb29NDq+QcHf5kP;U-cvYy}(95qjjfb0H!KZ;$bEAj=tGaP#*Jr6aF$y#Qfmmi_e})zbAcGjjMo#jHNU)9y5-r=`;!g|8P7m#Yo8=F~WcEJ%_X{(~e=erBvut zi%|$FLWVJaXE_@Ab39LU`S+#i?Z>EHL!N??XebKF%r-m7&6gO%rHjnCQ>7&Bz)uxs zWtf@J0Z}bJ%Awkcw`mY}N@@|rBC&juqs#z#MaxWAk|Uc>h!bccfg5$1E3P#NFdG>* zzlLXG+Zw(v%*O!m$${Q9d+-?LUt(36s~K_02RpoyG0tSQk|06j9 z_rni@9iD`SHi@(E+;LC2vj5Fbd$PK-t>*=bhD=?Wq6Rv=?pO?bKK4rCzg(6SzBlP1 zdaF6;tN&cQh5z8wTACIS$BGh7Rie6a$p+e`WG5Vod{_BE<-#l&>l9E~FD-Yw7v;4& zHmlW@U*l~xuam%ra3_QxMBXpaAqG5!w@Y;Epe5LzdN51D(hM{#Nl4&9=2;c}zu0^0 zu&UC&ZFrkw&R7^Awt^sH00x4Lfr=t3-6#r3DW%ewqaa{mk&1LlDkxz9(xQZfprn!_ z(&2mlw&Tp)E_|QkIlkjPzBgw6xWSvf*IxTtSN!6PL!hbc0Y^fQ^1z@JIUS|oVMw1S6%Q+M|0J^q6 zdUEJ^`;J=ZuXrY>h{wCSj&2k^RZbb$s6g6InS18tP?6av` zrl@AJm6YsMyTcfx?obrGwsOSF4M0T3u%%xxmhKbP3oQzH5*4kI1$u0Hdk%)-FqU+k z!;Nyk*G6W91K`l@37|>Q&H|)d$c24}#_t#2viX4W$JMXB8^HlVO;wD}!X-J38(_uE zq4BAUMTs&EuH4mx=0{|rGjTs|!QRwwvocAav%_X6l^KS3G9#90bB2W~1Oll*WXDug zlB+eL69Yw)gI6ah$evcRXukTZCO5(5NxLi$ReBNya&I0&_d9GeCZSd%Nh=bloRro` zY>6!Q-~y`{T~UC;GT4nIwa7e_gwaTR+`^x+BH30; zOUnkT{}3OS+8bOVhUd|^3J8ZZDGY0gqCh^CSaox-vU>*CGaiA&Fla$nlam{>pu;lt zTdT{%yd8(3ve|eAkM^napMftkz%0%AI8RPlaD?h_@G?A~%(x3y%u19=9L}kNZ$WJj zEqe=+86+Vigtwo?7)3?arC(Be)?V|E$JwNfD#fTC1Dm>Ax(Bc#g1Y4`EG&|S$k{%l z>cV213_+MgDcR0;8k*V)&xue>ZIS~poUc-3+}q1@*dhXf`DBxs0z*SDUT-0IInn!w 
zxi(G{+kxK@`=t?@SXylE31~Ze<>fg9@=0)v8o~sIiPX!#d%K%i^LQCYThbz4VPF`D zGIi7H?UfA(WXa0v1dag%*xk52Ha;B9tarrA->z=gll7=fBv|FEW_zeu*r|%Z*#8^xK9{5=69z{!M&gE)=28Qx6_pSN#Ku?oOJA z!g<=&Y5g&`4Lwz82p%5UNVR7y!TJ0E5?@dLwiClV0G*XW^l?dA8aF>_~3Frn#Kzhd9bh z?|zbB(0&W0$_*5UQoZ`N5VO#^r<<}n>X6bok5s}~qP=u;+W@QGBZGH#d^b{PRzAF=yT?IZ@&5s_15`{chsi4g76TfgEA zNJU?aJ+9V%U12+$I{;&yPh2R13r|g2%v^S*Xoo)}y}Z4Uza8Ga8Q(8LkwTLisI@HRMfk(y$C` ztkE!)mzSqI+?Pkj%oZ)e*sKdy*VfhwdrP;<-pB^}iF}FyZ~Qt3G?Nm(O6J1lwZXNY zcR>+e?oGptiD>%Ef|5}xrtnCM|Hka7F0<3xv~?@qh-27-7x%S!?KSjwR?g%d%@i}A!x}7H-S^%P}4x0kU^h;*WvB` z`qciTm4#A-Y3EQJZxrjs5&w-a|Kz=qm^b~23U*lO{^0rVLukUIBd3kfYOk7_M|3MQ zXpi|7_fkMzS4_<|#61ry#wcvAR-|~7#2yJW?qNHFF+4sNkcCXmMVN`2KR?%Cai$Ya>lR1(V6HDk1WSz*X7 zxQ+tsO1*wH+m1r)?6G@Ch9rHGhqAQn`w)l)EKi?0#2cZ$!^Kug1s{%p6MKPt#)sMp z_E@56mkK@m>6j)tW!a2-m1vanXdU*#la3rJCSizzx^{dBjx!(Gh{)6xt0Uc;x!m!9owxV~wyK_RL`(w)0v?-Z#CMkg#5H8}pAbdKd?q z(6v=e#~z1M^Bmkp(1x|#h51(AF*y5}x9AY}f0)Ax#x8$KsAjS_s%c?tE8Tj44{%~{ z5Qn>^5>#;;P}8`Gn25B?px3vUT_YyZ1=8|Hh`j~=#^mS47GSVPE#Zg%d{PkaOa`Fj zJkiy?pifFn0ub7R5)!Pg<}I2WitS1gkB<_4>g^kU-bH2uPMZA(*yUm@NdE)OiWI8@ z+g~jek{+UCC))-auHf)D@&qDj6wlCY=q)FedVo8E8q+rpnven?5@}Qej5W)_;L9#T=Zk8TVh` z6h~PudYjciij?#2cDtTb?6ImVl>3bem?{*T5H*Sp+BLVCM~mJgM?l+h2+C@Lrf1Rf z2>tZHILZFtxG(E&)$>HyL!s_D`YE$#($Du zzo+n8i1JY%xj&!8@Y4WdQJ(6bIXx+uJ3?w>@y(F% z0sWmD-Mu~{z91oF!`ez_*z)ofJK?V*dG*JDopNmrYuIOdxNlHixJ!^oV8%&f z*Bp(1stzkoArz!KkeJVjI7cNPG^zy2gpCR%Rmfg)6YTvFO(1amixTzIDhYfzQc*hG zbHs+vjOw$bFd%2spgCBpc^9=+RrB3wSsVbHv!rGa(aa>malq4dF;$+Vb3+WS*9?-1 zE9mDZ^fiB2A8-I{UW#^xP%c8zYG~;MT54(@<%?%)6{8MQl|0y(Os%XpRo(Kogg$xr z>FIcA3?Sy`%bz)1yL!dETsdesG%76+k3s$m&? 
z0re!~SW6ewmq?709ZaFqxO$uaL9Z`rP_byOMun(CpShE>1gPFfZMzMq_PdYtpYqXq z$3b?;1omRujifJr9y#pd7mYHC$8Yq!68akylQT=07(+^I5Jodkwo*jRNxlrO*Z z;eoB;$Y{W$&8^@0R25xTt)ejsR=MPxOQHrIOf>1YDyS89lQr=-X5A{q4p9wEq9eMG zTapMAiXz8xF!Mi3wjUb(h>xGY8d4p$6*>%dD(mxROkV=gkWz`<9=_E}+_qQnbfJ-& zMw6|k4AR3h(E$6n#LRzj7ZE(_hU#=ABlXAz{zoMKhRgoN87TU$7*lH@$(SVb9?(_zBOdQ-?L|Q8yBLBm7sfAtIFBPSvkoa$&QCpD(2X|w zk_1tuCCfX2KolqIRe>C^+sJ-YxV=aO8Z{esF(*z9tkzHVfCk=(o(ZZcy&v>0z)NT* ziT4_8Qjg6+NL0U?v2U@Zb0wfq&WX;(uEcQuCFEhXb*GzAT#Yr!xUCx^UVDLvba)hU zb*tjzXpHFvQ6noO5m-5aUS@Arq7g!55&*l&w_0HnzUxj`f5H6(>W%SCT2u;ma zE5w{Ja;RixCD5Mgk;}~H*yVGozb7^lP2jpdSfit6wJ=|J8{6Wrk0|gbHX>2V-&eRs z6`(iMpE=CjgG(+@;n%D@2XB4Eigmu;Kw3048mQJZ=wj>m{IfRFJ>)OckPmGn-M1LI z5}lo!G|(?K%dKY){AX}bQ<{Kn*{iOexMK@nd>4dyEw9F2*Bb2aAz9M`4-4v*{nT|w z2MEEpA8d6#>7yPLG)7@(JDz@Vwz8s+0qV|q&zM4<5=lr^Z<0M)qso*U322So4G!j0 zas2%GGg%l;vk=Q z_SgJv_SzKyWw;g3VtjRnazafbH(rUL_O^qZzqwKaIh@Q|t+OSGsuEOzbUvkc>q;MS zjI#_G%ZqL6=W_&5JooaQzxZDXEI_b8mGhN=6u-CuTTm*Kd)?-dt?NTDMC}B2vd*ogl->p*0<}HB-Bcpdpb6@N~QnQOQ=nAaE@j*MznRHS^Cdxe^N~F%QX6 z=y)^t3cVgFGR^3ssy$GGQ?PtOFQ^y(D(%(*j`LFA0E|HBHr;Rb5 z8VM@2K^x~ep`n4J64qU!pLZqc#$Y@kg_J%IFi|o-&QxQhGB0K4B(`)rMb1Bao_sIT zqee1_4dTxS4EaNGgk}@Kp~TxI?H)qWB0`{P+R=~h;J#sDz?P;G8j~nS4R&?2V)HVt zStcWS4w^(X+ZAu`so2vG7=Alg-aAtxfA)BjJoRG z@PXCUsvW5jVBE5il*2}h7JA3U+*HCKFTnqac-ye0uDbKboZ!`_Zaw0w&&XRF?G*mf z{*f&^V)Aq>kI;u@%!i9`IP)^_+eMECcaobUTZvLV$&w$J~3YQ@Lon zTT(1{1myeQ_OgEQVlIYY$GC{}WyNF8wGW|XeQEP>B}+MVb@g^L zNXOHmivl%LJntJ75yIsGY2c>HLaxkEw zOv7^k9r~U*OsJQZ(>3D~?U+TN{W+oCDbUf2Kz^q>>A0abB~Fe?YXK!{7hAv>Ithi; zk{fzKNL4ED*-hu**h*QX3fLEH?m1CnO{j-KP`9DT8x3+QT}CRwq%Q(dXKzR;yxKT* z#~A)Lkg3uT;k1h^WPYTrW23rXswKK&zKBcED@$6Xt>h_s+y$f^(Fi6bh;@)iV?_20 zHNdqmXp!>Pdh-vLUdJ;o&Wa;@L`!r{Mf$RF6JA$`DE4wvDaHoi4`Nrht|l5_?hqc5 zdc3G3M8jY$NBlJcevfSChVXJGQZ#?U``G$}@5wU+BunTV;qXA}mQz$L?#o73#Y}|3 z)MZI*#uL9OX!Cx&yDhr{lfjJ2SP41MBR1 zZ9lhDBnf=(x}S}(12w@h!*(X-UexldY=8yW3I;%!iQh~TnG zw$}DcAh}iuGGg)sE3c^lA=J%LWdQWq^N@?T^Btq0Glu{2mZ(AW#Qei;cIsFI6yH9? 
z%BN^;6zUGY3-1jvd<9T`UiKt~I$Op(cTmIl@0mDJ>5#al=s7pR1=OFiapdY}?lmmEQ?LAOZuTRs{5(WdHx3Z1VIkq+ zxn0caJ8L4Cg$|Uh4@DPxiT>HB4}fiE5A{}Oh4OA!=K7O(_J7uCN^z{pZA!NOY>W}L z4B;uk52-(f($4&jY!nSC_fqcah885Pk#!!Nj2Xatkh;=}~_P-r8P+^*X~OF2@c@DwYWbe$X> zv6JF%wTZ|Fvb%pc?JVS!1-k2z&rOUQlW1J%Q0Pd4&L$xFrxO@IFgUn$(TUjjhlSTi zMxiD%6&sYYe#Dn{IwZiN9-0z@XzP)vBG(HH!5ls zM}X7<`CE!AQ3`#f`=H7hXz5w8rpLy#4kzE7052b3^)YKV6aNIVat=HJ~WNC`@JYp`KFt}~;BlmWF4k3#JkV;d2rkEqJ z+QIF*IFBu~H{_`VW>P4xDBRb>9HRG2bI_!5B~msjS&CDWq{`I3@U6a7ICF<#$nsdw zZ~CwcZUqv82>s4O)PnW6DHN6NV#}(hWM0%<9MR~g;&)o^BmaP*wt&)opNbA_9iMv% znFaJmVEzzAUN#wM0Q==ds>64T4qJ6v@#6}GUNkyon@|xE&J&iiwCq_@Z@Bza$5rqC zMby|GJw(Ya=2t)p9MS;C@2Uz82@kf^6t!^+c+m}l#?U00TbH!cJ2!J zaOsfYNj2&!&QuG1Y}qa{&ghU-ML;o%>vN0M%%mI4m}6FkC?JK-C`^Lj)oa~mCFx_E zCJ=QS*$m|02{9yhR?^cT@+TXoO4tjUYk&~Z@p6E?q_Gx+8#geopQT7s%mcOv&nsbn zk(2h#C=%fhDN5$dNe;FJG}3DpPJ3rmsEmEz$g5xJAen@a9OhXQ15nVYKul12zPeg` z+|Q-LWp}SzY}Z#-nTBU_E4qa47R+5$HQ*X8B=4_kC0Z5)G%rV&Fz$}95Fsq$EWn0u z&(v$d+{sdQUJ#381J6;X)zAU7X*|KZ=~Cn)xo!xQ?~9EHX@sD#BHeN*ErO8$q=Q~l zw)lqH6e6$z8}{Wf7!F?~Cl5g}=^*9|dK7IipfH+tEz{;L=<#eZA|PqG%9Bcio) zUG@YG0WfWejXe}mxW)Z2gM(O(_=y8#cu?haV6i9s$d(}-%s)_<#Tzq{(p(VYbI>_u=gh8_Ir^KRH zC{pqY2uLR`=i@6qXbYBA{yx|wnO(ImEG!IaC2N_wLqj~Uji{_|2g0lgfu(Y>CYl*~ z7ig0aOzJ-1nIFCdTA)Amhx7*)fVyA#iHnxGm0KfUk;NpdBhWH*Ed4Z!y~HAcSok?b z48hFf{Jw0T0-(9cqdWgqId;g7@VTJY4r!uEtFs-9^~l^lU|51?79kitfG%VHJ}3eS zQwDUqXG%t6j0?AD8fK7ZIx{e52Zk$lEYnBb_AT!sw_kTICLBsP+B9&Qb(8`)E^owrX*&PfB+DCD}fJwoEkxlPpsj6IU8=Z`;jmY9j}5c5{vL*WYWJbxYpL))UJX^w+j&8w^@qn(SF6@@HNJcwmCY78GcN2c?AM)wI>8x*@Otv zeBE)Bj~g@}nQ$^cwQ#$3jgl9(Z8MsJ(0$iVtiN{S25A9nW04YWz8fvsYUublDr#6R z#v;@-(^#-hu-#V0%B6;_A%X^HX8@z_?;s+D1bTo}SO^GXQONNmm|rRuGEk&w4|Joh zZ(>8Q;b$|nJT1ID!GF0}+GP;YoX{A)5-ku(J5|Ob_7UbK*ori?L#hd0f{w%Ct8e7g z=ZAiAOY%-?xZ0yu=Vp|=>y`m^=?JYdB%cRtk>NX(8UUp@Pn0d+74rV2ks;Y9bMiTFkGa!A5K$&jej5RtC_ zI`JE`lCg0oR#E&yir*hanTZKIJbZ=2@F8}HDo}Lo2w%_PEwu>&+KgAaej}8LUrZ?| zXv=?icrZ%>F@%HOg`?f3>rtgMg(XN&HKyK`G7OukP?z+v9Xfoj4k&A`TaUk;r%@5>+f|uaDpekgUT>SkI8C8UXFtNB%QhpeOX^JYdM5 
zFuS*s7mP`8niXQ;D$-mh?rTRiB<00#*vw?$0 z0Dx&j!ISj=sws9N7Xf1MNVWL`beb!;QZF-ShNb!GX6ChMjyvu zAIK;nq9T!Uh=L|*yM}S+J{?e9SWHSB=m?uFQ;iGFT~P>3-phAl+mHZ9B=t5(?hQ%k z%)J)x)f10~ZaTrJp<~uY^7v1Ns0N0S2RhCJxJP1>3cMiW=PP?FFB|vydmn+O+n}*$ zWYW!7X86(a3cuw>Y?p)%BiV?F70f+?*5kNx!m<;Wf64ohMIvrJc8~y7eOwPbLKPNf z&-C<&hH=J)cHS;pNJ~>etmi^ru8Bow?WC)pL+Ln=$I+)t?0pwW6ao79IQ!bj*TA9q zv00#qvK5sT!;ed>;ShRB3MxT zjXxEb0G}WeGV)PsBjgxi)=6xnT} z0QH%MXTHNWE~#5S?wx}A`_Sx;MUv*AO{D@A^)}^5`TE87&(4j^jJuAwl{gsHRC25W z1|_?<9FU5A#Z=V*XQT}f8%Y4VNd4UekjPYWFNqg{ib=ujt2{EhN0m2R>0fhcR zltg03qhn0&BEYEo+hQYZ1%Spd6=&=U5~@Tu(DV4l;cAjR8Pt0~{sOe^j~uH!=t}4n z--q(*uS`N~2$DGA8zM1xR*S_SSNkvqu zGC(ciK^1w@VMQH{ZhIhbIlIsKh|R(lgkCzg7?a(h33FY@*vFMnSH+`F<{rNs1mWO-Z*iM%Q3S^{>kXeF0h-cN%ac(?FKqBZ`Z$GRSA%B78wuf zIT2dR9J1zx_Q{aj1G&uLZ zf=+(}nZ*N}8C8);09_hG64Z+5u~7$`Y?@I%?J>lMtf*Fm$WEDlUj!B=$7W;)#kVag zi3Uw2DYzp<#56E}@c z{x>Mf6Q_|*p%ZxM`-Vj<(z@;vS$GJN3PJXQw8#dm^@6cb3hX*OZ?SdaXxfn4UL>J& zD0c;{`~peMk{yrZi^Y|9#4#$GsE~@ApcW`%Y5e0h!V*zXYZMbG2l0P;1elRah6S7N z;p-{6*)3lMDWTwYyEkQg5VokQtLk}3y*T*eg{A!qQcDMYL0R3J4e3zeJj5>SqRo`#3R^5Bjw`5poFJa#E zOXUG%C!9V&Q!l!BseqIY?Um^8xfTU`*;O-a(+^4#iJLJ~SYNKc-hNOQ%%VfKEf7R; z!Dx!yCUJl^Fd|-}FJV!B7zM@?OBuX4hh9`FPO$jF=RGaIqdx$LmviYFRga~sm13cOn~t(3SFHUi z)wugb5dNDRRtTCkd&48(smduhq1G!GL$!&tQXO{)rzs})7Cr&z%Rw zl5i>IVu~o}VD{~7A?p{eHOomSv*Q*Na6^WGo$Xme#R8NV45#@??QB^*>~7j^aa*gR zZ^o-`5jKALeZSE&s332F7!eS`WwzcL9H{&%rU1u7fhQy?c6Nrwiv&B^hooW*-`rbG zt-CjP!cpNQt>RqTg(sVtz_wCF*@Ih}y%<6Jj0&gosbe&grlB_zx3y1Mc^0&MKv5}M;Y1*V_@O%Ljkv?B za6SXF?IE`7=d|z6>57WO!MPvgrjiOC(Tiz4_u_ew$NnsJqxCf=~dBy+{bSVk$|D zCe#47K#r6MEl23L33Eoj6RARC2@}VzmxWXJMct;o9b5ey+Tkv_m_s0QiLxj;NiQX{ z&ep-RhNwEXVIwxqkEZ>Uedv(?*K%q26E~t<^tOtm8KS*zoC%6j#@-4Dt*B~D9(2h$ zP)=4Z#;S;rG=skf1~w}p$X?|NIn!7Y&L*Z1jsUGj%JkfG}JgBNg0Z!y5v3z)g`z!oA6B>_5G zZ{z}T6@C%JH;^j=xlXmVytmU95DCtK)+J#{DGIU$4~`%;AjXJlEu)d_5Vr*aljPr4 z(Sa-ji3)|{)TeM%E3Bt;-v2h1qG#9DXebT!=}AK&PaAbB_W3~WRg`UO)RU82qdJiS z9%ROohJQjQMtuwFoQd8tchx%v+$&VfTH)U5>|)Bu^6nqPEZmM5U3yt?xLzUlredD~ 
zvT-Fl)&^i!EQ;!>UxxFN!KfkCMRtk-H~QJS>!mJ&30`#hBTkJEovkC&Kd3$Y6TP6H6O$AcnWw=>!I& zT^otLZksHHm|!H|vQjT@!fbZhRD=~w+?;9|`(D{{bVLM}e~{O_f0%TmIF}d%*HrGd z6`66RwA^Y{U(r6z$GcdJvWw40Dfj)qnd+fT=tNjxuD3gfOQOt$-TOQ_DiMIW#1_wY z63GllrMX@ixDt!(TKI4U33C7#t#;(#VFC$n>a_pbWxp2%eN|KT$^!=?wlKFD`9~rd zM1&q6z+Pc&elIwvp)b+Uc!-!PqE^8=@HLLlxE+CftMxlbCPwv(yF1jdmuWmcaaBxT z6hq+t<;Fwo)|AkfeDKJgNgFE>JNLd&F-2rDC|bYXfxLEj1#&9h&?$gP@bytfdJ}bj zeCrvTxK_K$vG_lvXT#$$k~Gt%T#?e?IT=Le3bc%mh%Ea;98cdo-IlBD5y$?-HK0gr zPHmhm!qI3i@^mGm*;BCYmz;nkEm`yqACyL{?|HQ3WV9iI-HH$KS%C5b-w_#G&}FS0 zXIP>j`gcI!@YU5*)HO0Blxyvv`^RN(FKs%xX{Uj~9f_@Rn=DV>S3D58z-SJJwo@-} zn^RJ+{nEOzIKnkIH@CcvTuwU7ub0`!uTa#%xj9hUsudXXm5>rhaf8=0YtWZ>u?%LC zX8!?{DB2#NSUcimkk{nU@bGYY>f>F#9LqW++6PB@k0HIu&}U0 z>HIkil)b~}k@wF9LAQHF5=_shi|0CNxjD*kJX7>bvr+-lKBA7a4AWsavN0xaU%KEN z(LOVZpw9~0+)+&T`6^;uu5`?X)F;IfrKNiw-0bY^w988@Z4Zz6DrGkJykVuA#)BqK zePiS0J>^oxopqY^b(#j#^BN7_@mqVYaV@bpc5M2b#9D*T>pN^(-JI%w*X(#4kkDFd zCfHim#WVA~ltH3tt5~C0mucgo;GDcOU(-tig$x$z$L@|2$*iw0w`r;Mi}zBKDY8D? 
zI@#`=FQ>ucuFp0_l0Bi$Qf6ZDUZKwV$6fF5zFXSAJEt^owt?@fQ0LD*%FjBk^;SMp z;a~YC)VVw>D^i3nrOHPL7U?T5eZnr16QjQUw`P_K{*c=`(`V$1(Sf zOpx}Mew$<3L}kuOyF7DtiSr{5{p8WLgQk(M)?72SpRba2RM9`8azbAf3Scs+N6wcB zK8QYFUPq4M8UwYe`KV^9pD-MZO$6E5_NK|Sx~Lw(ayp#oK#AyjkCEi4v@Yp#LNlZD zqu7{QC`a48PIzuB=xuFz81rDIMT6XoS@MqrYa>N674v9d_XXZ)44R@TTdBbl*ttGK zY=L*5hIrTKS86ihJO;kzU1K&r+Zz^p#oAKbz!zitZNMdC9`fsB{lQ2LwdC-U2V`F-W$)`OJ3Z|upCIndwTX8WraN7vq#nYT!# zmL$`}b1n~tOmhsL_JT9e^iH5uYoV!G#I!ulK&ifnX)hSDvd`mlEsymlKT89!f?d}E z_|%@4ufUyGJ}w8nr)pFt`y^flubG36CDB~=+q=&V(N*JjnnigyXvBgYhrcc?`Hl~2 z=|&owP9=(8%rC-0Diy_~OQLoNq7{X=A0yxt&M;+DL-5#WcGie@U0j`e9wmW?H8dlq zlQ1RnYI5RURhD$Wu0?l6&P_( zwcB}B;*TmXE$K+KoVLvAe8*zBx^F8uHYR4;ppAL+>U@stOTVaV3qAk6TK9+w4@W#L z>4^S;!*8Yk*kT@fe8;Pe(!SbFvs`UHBwb4{<<(5A%nG|YvnlpfUBgE!zVzaP2gOUZ z!?p+)?`WCaQ{I*NIM&AAR7`o>!)+5zT5oC7U2Cv-v{~3eqicDkf!V4VrGblzuQp~> zauv4B+3)#WE5gmHuOdsEdvBeP>b7eKx!+qwxg2#3tP^{xa^Z;vzcS3F+-yUXTHOvN z-pI?72@kmQx%FxPptBro$!sICgJN8GGidF%I+rihw{Yc++I{!7!D0j7RROuBde6h0 zX+5rm#H3V9J$3!yXVUfY&6nM>f)BLEWZC6gkhul~|1#L31t2EBHQ)SZ!}&tlW3n&2 z`Y8McTkA2A7`2f+$ZWt>42Es5qpP*@tU%SHF!QGN_7bp%ZhiI2D+RKjioJ6 z@Q)|eR#;ObHy`mlXe4(VYq-m_SXPtv%z?c*mGk#j@`bsvUzw+iwb;OqVGPKw>{ zy|O&3U70K1t2eU)fB6mDZYp+P_Wl|kkYtMOwf7m8ZU2#kw_Tr=x79>1KQ7kc)|k;T z^4THwqe-+m5#iiluEI53Y2k~>+h$LuTy+Bw-Dma7NtXifwfOhJC@izv)Q}x(=vs+F zzQ)6K2k(R&=nqyZE7Nf|V%H@AK}sE^vCz%9)Uws}$-eM26v*SKS#!4IWT3eQOo~(s zs~Ih^i%`K{xNExadQ^nRyW()7CS5hn^SHRUO?&j7o*7HZ5pUqBJ-YTpi8v~;Q&4ZH zH1Efbx`z62-CqJWa%0;(Z92P`NAzon!L=C|Gb?5k${Bxk8?K~k7`ZjMQ* zy|C-;b0CT%Bph$S*hMCsaU4G)PA`9fuF1{O_fgiDjjcXrVUY@sV^(g6hiNtCZup{` zYQi`(J#R?0Ga^!-s5A}35fijg-_7P$g)e!b%Ogb7ADWu7xhpZ;nd0^+buJf8f^=@+ zb<_Goh-|}3O&H&<0o}%nM=?a&EpL$71s^L112>?l8yfGGt(ILnUq|e1c??KqkbXJ? 
z$QU?dxYCNJWXF5F2|mi9r6&B%*1)(8cpX2|xp7i$r~2Uv#zrl-#9`DzwfIW~;QJjw zwkf}^1!aUQqD61s%p@=REOC1v^_&2N_+s#NL?kK_O89my-4GoJHf3gqR_J^#EJw|M z-?4u9QiW%lWQ_`j`uKDK0RfJ#NM!mv-pBn0S^&V2U$^|nyO9u^oN{WiF;?wLSVAOV!f1MY;_StS~E{Kqxzf4T?TbGLiU}u+x6pJIan=p7-gY_$&|%S z$%*`eeTYS95AGP*@B|fv zIh)ltZ{92hu;om!J|RU6+a^7U@fRLqAdB-hjaG@G z2XuSRc9%%74NIYkUEYTLJW{4l6{Vqas4v8u#8WwoGM<@%&RlEKs1Fd;u4iP#{mJCV zXT_S7BIW<+k@cIz%BXoS@tNu;HXz-e%Snrnasv}6MsO_9#5xVl*znOa&$h)Zepo0a z#?&_@Sk5oz>qSLHt5NFD<<5J07>`WRyEC4;n?y-Kmd(4)E=RbS-RqQ=9?{tI78UZd zT~=O*3@j!37-RJAt8i!cO+I}+y_s>Y2T}~$({V~or4n;B%Jj#`rigU+(viH>*$Qp3 z#atD9e(!TXG(U2kcXbp&-bHEEb%igK9%%Lr^w)3n7DYN51xZ4#1M|piGOF*zUGG zciqpiS*awQbXl=b1iVgEjn%a}PsQx{)8uohX-`gOB(O_p`!sxcE+ZqO{QUgAGgTq% zMv8YP2Y)yTa%Iw(-x7E4P>O3it5%J5@oGIPlZ2wjr(zt zuXy_5D=N`6Sw4Na+DtTPmRV;nYY_$}Z&)5Pv;9rKv?)30?y`5QJ=~mTyOoA_cj&yo z69G{TcH;ve&BV(%PAuv>Vr1`oWx%q|w7x8P=mz_d?= zaG}-kj(4OfdnY_3guk?`%-7d#l|L6i1u<$;0jWYM9~JnfPyWSHaGNR0zgDxko%e>UoihIhEbq zt2U>;lwVq+6!#YBErl`rSDI&gimqW>EJ47R+S;w{?dWiD^LU*&&<}Ey3#vt-4Q(NJ z;ZhbnN$wFQ>j84KEiEmrwSAeioBF)lD}z9XNy|v{T94-24Rdc6?m$4q;+7WaM>SJV45E z2(cCq;crzidDyaG9TKWJ*7W9S`oxJ7i_gWmYMw z=Qq!Oz4344zXXSnf_A_jqK7#0Y-YA0SPh`L%NM?SwalZT6SP+J$2P}*XlgNC_3_5u ziOy{sk*9V3$jJ>e}6^1Z-=nI*zArv zs5dVat$6uz64aw*L>MOm%cpbX{lH*!SLDlekWu|La({nv;6i-gLIe})ChpLN`sobh z=H@1-k0b*2l4cl>w#ff|jX!^LSiitwYatmZQo8jt{q5Vg@nwKuJ^R_jiAP$s(=O@G zo&B%V(O+@_zRL0^D}aLx#gKY@7!eubg&8|un=A^2)~vDq*&7g3SHc&rIeUJT6nBw9?K3XJ{HufyQj+U3*SRk$OjH?Dsy= z(b1nypfNYzl@Pl~*w|=%*s2ixY7SSqnyE-pgvq_ zELltUN{rNB8$2zJPq@O&-SBZ_BkO=N0R&v-J5!p z#~9+omA}b|1z9UR(2F{GzF@oai9?!4O!`Q~i|FWlxfA|+Cf|RVxg0GKy$$+{&Ll-e z9Eb7@JElg2XCNl<{WX96)H**Y<-i_a#jVQyWE@0iJg<0fzU%s|zh3hFmy{(q8TCO) zLFP;cM>Uq#;7bOCl*JmZRMQ~+ICy4aX53qpg{nxunB*5Zxwt%)zhCXIpFR);nLqs+ zr=#QLkQ`}`wJYg>CSTUsFr+2JwwfU{tj|GckaOF;b5Z{O{zROeZ%+Gd8Zba470b?p z2``EbJl1OgE9DQNv`-g*R288Dh}1iXJ84g_z`=ta)JzIR5U&z5?>#dPba(=Vk`#Q# zzS*N$4mry-5>~BW|HyMjuU~BZe3x6R^Gkl1ae>2Wvz829W>;7u->AGo?Lxp0Td$mq 
zHb)mzdzLkKNKpH};YxoE+h4ydT0F!tDDnU1OA8Xy^aA_-$zZ*7Wr)oYCy}4npZt@Q zmvJXPW`6k46y6~;(_h7%tLo;usRm;IcqTK^V*GQotzJ|qdaCapJFw8}A63c3*Ke|# z=@kz9!&h?Zi+w-pzrBff64P;tH803N<`4F4Id>{`r=sLG-Q!^aPgWF`mQPC0zNPe! zV1h34?B=&GQE=yHGigNO+IM2#KJTCZD=~MB)(On#}jHSlfq`zPk61m-4o|W^;P{b;@YH9P-is$MamFCh%>$!_qi+r>O4S zTAkeIDl*?UJcs$H0e`QtCK2ewp?C6@UK>TrfmD%epdUs5P_n z-4G%=Q0Bg*N9*^gYn>P+kA~<~cL&A9If7kl>aYk#P4>(QB=lQ+9uw zI1VQd&we)qOVmf^pYQxWTeb%;ZGQj#AMBqFR*^s3V|=83xP(LSx6OO`fc;l`^Fl@K zFrNRo8v7p-wnI1svgQpv_w=Q#*Zy#cnMwZ`64`$YiNpNg|6U@0zR7B-lcvSi2kl#v z&m%JEPfnXN_xpG? z?gu-=*kNnRq#p_{DZdd$&|Bk@_J0sxm^-!z0&pqCRWLM4W*;h_4 z%J}`%lxdSkUh@eYb87p($&1E}8FOmhy6w3>yxVs>I`ED;&#Qd5#^ZO>+;z)7d8y`Q zwRb)7Y$+)jD;8aux6W+B+cNdn(V}Jn2@Q&@en%El_3`h5TQ|qBA^L28{rwaFa7MNM zuRpR_?-WN4(W3b41t}3@hNx}-^=Ajo$wRdB{`xa3c>$w{`1c=KEW52kwY&a$ML(a5 zz05DHB#z0G=c~pUUY2$jxs?u4nHr$^z-8pH?EZA;%pZU31ngolpaNHTWaccD55Bem z9->qD6_#l*G;IF5@~5d7lrv`_J&*}?w_0@OFG2q1VR>uI+efCb8(nwelTSY|U!MU~ zmx8w8J)kZ2L3VkE$J$8EvVU~j{<`9~Usmo_@`0{ZzTXo?2#GjO;1bDR)t~7G-I7dL zIo|_>d6jPNY>i1jyzTFQep-ne8yN=8E-5hY??LI(6w)nvb6X)n^eWCs^>AUZTX~Xg z8NU%zGX48i`nfP6PC=>T#yzM-N14`2Ma;E^8QVTc?Zh8zzFqA4^Di)uHC|#;g55v@ z_?p}BrE8=W;ERpWr_;n*_0apZ6McM=R`8hU~$IF6gzYKb>rk0@q-Mj!m zZxLj;f{VSFCF}3s<+=I(lOQrbY{X!%I`K*CM%lBcPsv8_6`(WrXEnDjffd6l+g|e^ zAfapR04%r+MTPcF%o^gKS9G{B$zjwyJ)4d~MLfodH_AlKKg@%+tLv+k)u~QA5U9I0 zj)U)Ec|EYXtAM8B<^G#T{QaiB|9r8Rde#PAY4^-zz(oI5A;tGt! zZuEfwvI9@OHoxtscg6bZgbn5Q{UDa&aZLUg2YJpNfBedQ*P?&q*eTl~gCspu;UM_! ziy-o07L)t)3w8g7*}Ap@$^-(>0{9d{d2g*cN<=Kevk?#Yc)Ss4*HOn?H6n%(C;tBS1Htztz`k#SuISw54@jTr zbN1@I6U__6K@+_KjT6^WCIII9n~(m6@tocK{@EW<`h|Qq%7A`q>@Wl2x3o46XfDek z@WtQXI_WhYYfApFv)9f8TirB%00{7|-gRLv@ZR1Xf1>Sfao!N^&p+S2ax8tW5kBsE z*?11Vnhvw+yn9k=0eg`(=cY2USITkN~W%e1!(-Q+881yTP zc3gQ_o}``rYlQa@eds^0bYKcSuF}DQK6T?PFR@2?x$b~%-Y5f-raHR)r$Z&NO9I>U zHc``>^I1ksZ_Q{KC|}rt%9@7Dtuu{+}FcCzkIPvYXDbX{gI_zh3t2*)!W?#*ef$ zVMJf8U72iOSsRCw=!A{hyaKg{CcEckK9g88;&lg)- z>hm5OEQ|a5!VG@fvfzZwsXwYVTsp%eW7_jhxM0WmU-cKC7#}<3@u~G=#U;YdeDF~? 
z^CX8^7tb?f6qk8RmC0Z8VXRFvVyE17RT;~gbQ>4uE|6k5yxbBt zveidnIP1(3{l#16T1;Wh6T*Fo2D-71M{fcu{(0)>lPff5Ph&lPkAwPpS@rWa^>dn_ zX89Hl0qpwoDLEJ*!Z{k+*VVd>{~q7^rl+wB7A&wihK>Fw|BbO=zX1zUlHTz2>C?L$ zx&Mih`R6;2I%5}e;SfF6*4hD16-_;*RGDZ-&Crz7!0RXAYg8P;3wrKRrUOR*bv}^Z=d$*#`*JO;*Y*Rl@&Uv zd-PI%d>qs^u9vlz$5%(2m2r#WoS8FM^!HOE_kr{5YR_PgeZOgkL0Q~r!C#56!-Jz zr1kBYPM^MH)~bU?;|CBViiyII!alFOqOCV5_&l>9sv;;z`v8xm}*8en!a=5u@ zH$|vkqqW3Ne3Af+eLg&XPbmsk#Tk{U89Q9Sx1eKvcguwum~~uQrW|!s z9Gjxl3()wrdnx#8`e5sXkilI=8=%@8%hNuS*wFytrt`l#EOBPuFC+?Zg*d1d;J8 z_mE}0%mEN+OoH{nC*`fa7FCer$XWXy64#kHglfPjuH!Vw32ZLnB%9v4B(Kg`j%7-h z`m9*2wKreu`)T=GT&}oU9&`9AsCZj*Jwei0A@#mfH((6%FYhyKyKVM z%wCJF^0xVWLFhK#R-1`RQw+b;v*e53|;3cbZ&yN;#T!Uwk& zJkn$Fi3lk>jmBGZ7ez zoKDs+3O(#~`d9z1(fW&7tW8&UC%KE;CiFvy<})OzwtdtU_E7sJr#MWt_~LvM(|N(2 z5qb}ctBF|g5th9%d3WBosE8(1g|%-vJ6cxAsI=gW-W%w8CB{p#SZDZn$`_iMCcAQb zp6Q0pXBKW*T1!#RZ^u^ zaT~>Y{zhfT2rfSP;6;P}!aES;CR**9IJ$7&c=Wd1s!*~HVYkG1xj<(&h<-; zq4A`1-`?Ci`JCv7g~;w-`Ac(w=Bc=-U=RN=)+Dj@D>Qz|FvzLdU3ofW(jP#*-%N*c zBo8#%g|2pcO17$Roo>gG3G};hTC*POwEB5vm~%+K+`wYp$sF7D;#7QPJygc7!kX)$ z+KOWC_3+LbG;FbP|;1<8YipsH-R=-5XL)USP?%=(HhrCvq zT{$>L%%dA+$6Cq{U$IQws;Il(x-&GZ`OaFC5v+I15;*f<%+OFYF5`|46bX&bYEn2r za=TG$G={Op=+;7Y_Xa==oKm}c$Y|#LqPF((ug0$nx!>M$C{3_1 zcpAZCb*K9XdOnWWUr`Y}5o(X~k0f?nzc@#%wEAS@vDn+cF7E?s{;t^ARZ)%kp~|a! 
zKx3o*Zwenxt@B>cz`Dh8<(G+^H74DK)u7Dt0ze;^?Il*WS1=Fz*g|zT(Q6-vo3)(L z{5Y=!3)%0fN*u^+>M6L?H9h=Mt zYob}&Tj?$S#ZsKSle0Tp-tx-_%U#AZI#E04?&wp8J@Fvdjzv_;a?qOgL1hON4hk<+AE#?n;XYnJcik7VEnEw z<7&EKGd8Rzidi3=UtwgjkOkOInH{P1WVo44;@Fu@amH`;y`0|W)Pj?@BVe~@HeU%g z%Z+!ojoSOM6)ZrNRdrt))P94U1mO90;P>?v4sJ=1oYt?WcDqA=4@W)k(fPrzmE>I9#3ou=LA#Gu<9gTpq9A{qgeR|H0mS z|MlF5@53Pxa=9d7X z@8kJ)eIEDy!{`13zK{Fvr|UA(`}KN0&+|Bs<2cXroUi{~vz%7Zh#y5ZVfwk$p3=KB zo?OTF02BD4um47ig&DTjGj&!NE@kLAhyFEh z(iEBTFkVzyWJ|jurap9`8OaZEbUI3hfq{G4Bmn)$pYdx#n71(1Yk%0RNncHC^q0x; zg*5vyLsq9K*46QM?%tKfoZWIgaR!Ew5LqkCrcRz;^EeUv%H>dh@7}TFy?{4&stj3V zJ#C76nqfE8BF3zSCj4 zx`Qn8o??D98+E-7R18uH$ zc4BD8Sv$K7Y7<&BYKmDgSMSX2r)k-`xek_pMh!xO$*#8Qd}n87Tl(2%`%IB7N&>BM zh-5vQ!%H-y1s(X@rfrmr17sMc?_4PiAd;dTS+zg{{M}ry`)@`cahmhl0Jq%#5AO zW?+yWsXN#tHG+RSTcjW#n|Lmu!PI^zvzq`e!;vW*y)GPl!`sn{FH%-F*K>RYd3i8y z#quCDP6aYA7d3B5ZV^HPGkg-HD}eWBcsazKiquy(d1xFRSt6|E1b)4~5Or|EeXXJt zZ6<}ezjnFU0MAtKp8M=u+mPIi>dqpYTA6Nt3eHasKUoxh{AZZpDEoBBUCMsSV&L3W z!b~nEWTrTG10;yr-1F>C?(?h83qVtj9Ej6pqtzI^YcYzPe@;Fu>7k-~@-3Rpi*oEI zh{CmYBGt-)K`zZ?1hrGAZ@y2mE|kbsS49(k2`Yit9H}xAsEK=vHFNCVWaf99%q3NK z*QOY&1c+Hh+qV_U4EGvO*I{Bu@p&EU##Dys02!}D9K*M0IA-oFqM zXQ}j3;+t(g2Koj)5gfdRI))BFind!n$@aly(IAdb4$BU|bzPjP40=%>Br0n2rGe^f zrj~o-^c0SdhXlq*tQWM_%V|56=Ai3kKDUQp-;A4uweAS3g=_HKMF;{bUC3aXZFn_~97+nb<91)A!tZuu1juDwf%5w#1z~ zb_h<@Jn+Rp(gWMa{{x`(_>%B8+m_uwHuc`V&{@0YiG1(f=M0;4B4k*;y}5F)H~U={ z7kf)xjM6q0DwFpa)>ER0j@e0yr|Leg6baqN;Nq_cTu!zrd9sHoovuRr(Kh63FL%H#QRmp7^K0tID!-b-st-oNQ5Y>e>g_cKq5Y{!M9F>g~Hs_p?dzSY>^A zStt5V=Rz<!V(J_rgB$29?ee#s~pMOM`HCVIB>Bu&DDEg-a-S6E%PcR|ww@mn+tA?CIp;TTJ zThaA&&6b07bZGalfL6M>5Bl{zq>O3Ng)!_iY^*(cPQe$AH#tQ;EbToeO|V;7*LxZ*`DsAw^nMXRydWWzckftf(Fwj9mdy*Cq($bc4&*ZI3Y``2kX!%AV^ zTDe7iw#%Vc9;76W))m!sSaYs05Gm%zwCd=|*IMaA`R|i3eC$?z!kdb&Qa`i-|JhwV68HJJfcn=*NpIO#*nI!M%=)a0KOrf8NY!|M z9R`7yy-0nfNTOKwT%0iKw$I>zVqRH>7Y>rvi5i-D!&rtzp_3!<+Z@RrsWr3Vfuq2wyzkO z*8+@Py>Vf0L}xwK@%P;>Nr#gciui;Z`S(~S=eX->POQe})Z+xHDyAb(7`J4+F`M>L 
zytWyQ=A=ZTXYtK5hlK44<-j+TbC11}d$#L=SXyGiz!wnbX2S-jWcouuCT6$jVrYgH z^)&AWhaFHK4`R+^X@L-n`!ovskG*t&oQL5-pFu?LElR>Z%H6=O4@TbviCUO*+>V;x zvM`PU`U^ngZk|e?43UA-*tClj^}b^TkO}`Jm@zPXsHHOyW0+M`f()o_-Y{zkCF<~0 zZ}<+UxoIATFDB2TW>By1+N=%!Sg@^MQBOTs+@|cq*W~@t?r3c-_1DQ6;xuv&dF!lxBvmov+UN?F*Lo-+vq_q|rx?P z8-elgKR^BxU;21c25!v%!C8J=>YPVqhY)y4Z0?%tgoWZ8z6CrMvmcENSaYlUE2mYB_KIJ|Pkf!2n=`Fq zy53lK#i6mesI+1!!@3%*x@>S<%X#Fw^yMb~^E!WKq17h0rX%+s1oZdL`v=wLN8{8- zYx5e|T(eXoLt9ngSQO$l2%;_q{O1IsoJvQu{(#*7s z`oYWJx2qO|;{vj3JlmGl-gQF3Cmd5N=hd8hE4|cAZFMcnba{NSB)!$LA25t#+k zP3=^%Ab58c2pOX-$xg#<$5>OkF4Dh+3=v_&y`@r$R4ppCs$6eoy)^=uR=AwxZfMM# z#IvG5_BpD~)cM=vs21h087g-UJk)^z+a0}+&#)+03Qzc-Ql~+oeJ39x*=pPV)}S!edMmA#gZfEv&U!4Pjfb&PEU3bld~uQ_NvF`(Pq<*JXMsabRD)GtTBxe zaJ?mdb}fU8a&*YkM5Ev^@6m7p&rAc9%CM1nJSs_j{vRGfE zH#DY+fiDl4^Q)b4Ha%Y0dI|c^(yZTX9PEB8PDP&FEe7*!4c2+~Xs^d5E@0tE-_EY$qtNoh6;3<_d zBudk(CXSdoIf|*qUWf42z0e&i?>stl@9T=1b@5pID3e7aVkuxb-}0N96a2<{btp>s z+d`*yWn1@te>>O*h{VQAs3gXtsJ4#2!h1sPbd0Y<#7&!316;-;#IINPGcc_E{*xk8lq95+_0uw2vGUwY3Hy|a z2+;$kb+HCwdS8Rt{O8U7jCNH6;<9cqY-oUTvjL#es~y#jbaG|kRq*_&Yev65-FZ*_ zYt)3VssgFm(!EmceqZ)$ers61hk!;q6%-M>&WE4cmyS8>lowE{7#J4i%gzOhr#fO# zovxS?cx6HLhq~ZhrYTsXsIB;-HuX^u!T#~mB@K^vP0W**2;J)yJzB)VhECD%$oCHv+2KE!FlxVT^!3Z1q9AO6>a(<*ojMu1S=<8acAFjVU2te6SRzgv1hA5=+A9y0|Q_H!g?{ z2_h3$`pXypD;bG<|iptZl((GuR#B^|X`HT$D;wqm69fftYU9eQ}Tx>P= zK8 zTDdxKBBSD&96JnBSANBq4t_-2e=XkK(uu!)ZvHXJgL_CFlQO=#4qR`!UJ3G@%&cA0 zb_xB@>$V)e=D+LglLBJJ`JgSHaZdeP92_aVyHZ%CoE*1cef3!u#p;rJ0P)_&MAH=0QBr2t23`Rh`!f*RYCI}u^l^OMVzqB_|FK=@4fd*2O{)}IR z=kI4E?2lhF;Vg+r(eO2DiU;b&cgyTL?iA;yJX&_wBxYVe5Ykv~*UN3s5_x?oA;5Fa z@qWk*FFc#Y;34|0j%w?=jx0O-grt`ZDw^-efJ>%#0h~SU)&b!yKlzozv_69;UTh`3RGx12h46m+Q|bNJspQx@@FuM)@+KfGeqZcG6tSp8B8%m zRQD}Z6dx&7kNXoTA5bknRI`8Z%wl4<^%uPiY(*}?1_HsU(%_@%1TEp@4W_4oq<#eF z*-iwZXZUS`4iz4c^R3}V39HL#lo;=MbL^Kwx@VYK>V;70CBK)mWF&{Xb-e?{=t|E$ zQ+su?YAr)^())LF@`VVf@3Yb#^#0R*a@N@K-|>Wpc|EyA5S=#-Qe@mLY@~cvFl7>> z2??WRHdUkd`036wf#O?`{eFzoug#U4i+dy_v~y0Io>YeeXt&~xg*p2_*fFKh9>JDy 
z)W)^_%oHBxO2(K5x%12g)sj5CRpP@4QgkoT*SPMQ23}~EmSDTjreoA$+0{$y&y#Dt zl1DMgXW(`nt*(0nOD!_B3x0t@NYNS5Fx?fWJOMA3htMv?7mY2etweMMV#gY3uH@>N zkgKzP{FejZP9#p)+e<}E?{;L{s-bxP(4x^0R0mA!t6%agpyc(;^559EB0_yR?(VFi zp%EGa)x*7-?GKV}N8J)_#g+Lp02BHcHlaEWO5u7(+(4fJq!hLxb0klKs0=LLN$MEyD$pjpvd?z??(3?lxE{Ae zYv|=L{-AI1i0#UcQY)M^v3MknaXl&J+iv2_`>`^oT0|pX=t{+S&ijj6yqvB_;^xmX7x)6L9s{?sy9 zg@`+EpBpb%RdOBXT2k=cyXdEC;7Wu@#!& zdPTenxl#YZBXhzd@%qIayBio^R8V4FR|HGNVg=(DoX-!%qC_Gy~cF z3?zI;%ge)NlypxOxDWX$DX%n_{05J+Nb7O#*B7H!EtidFw7k5XJn6i!hu)eBJLHis zE2$2HZtP-~`jO2kzf2mF!x*zMsP-(@d|jI&gL3tWBy(*oV5Ms4@YChD^1WK`d;lj( z1l12(ml6_>Tza~h)(U^zPK>opH*@~^>U_jcV$-4rs(MOq$XZVEUD)>Uz*@^=1OKj* z?rXm9f(msbb1#M&Mwhp#;mQWUuoT%WEFoXS1p#}PD||8xD^I(|`*2A*8huj_->#Z> znJqqQB5>acviM1BVGi2PV)M4i3g^Be^?3<#_fnJXC#jPkzY&G!&K_Vdd`VSvxQegS z-KMCiP#JW50}9+{ryp+A9EuTeGL13A|I)$AI@G;D9P)#mCLMDsO2>8|D1l${Cn!kK zICTpD`S3f~5F`2wBp%5R3>g*(xW3IH_P@5IdwRnOI5r|09BQ;%nRPW0sYZe!oM*K_ z;dv99KMu5b$o0q5iOgLJffr|{hJ`L&y0pFw$$uvh*W1=4oWyF7M-MoUrcs-ggP=b0 zw>U968|#B$f6HKrcCJ^3aSe}OvsL%ky{?hoheYQ3@Y#K5k+tP98{G6gKP0q+s7@A% zisohd%)eSU?uSW75fXUh*{3_cTM(e6{NoO#7X|RXVx2Pv=OvYl7Twvh8R8)g%K<-r zY!}{NW^(>=c6XNm{ZFIx!1L?ydVf@Lt38GyM&Y>;58Ry=kp@p?nmH*uT9j6N?C-GS z+=asM_PrXD09=L47BvhEejtlAK%06i!`#fKRl#p}f#Q1(>m$9?E}+e5N}ARS!tz;E z6oo|SnHLE)^X&jDLcI4h2RWavs|ze02EVV>sU&3gQ*)sCApOrzj^F-W1xz}H5Dppkl%D`1Pt9Ri$%A%Xtv%JF0W!EH$G+xiY9Cw$^3!$L zBeHj@=Cm_WA;H4@rk~U(3cvr-WMPfMJ(~}qw2>6E02((-3d>(cC;G!a$fj1~)h?8jaze@hvD3^#^KiN?kR?NyelntZxgd$O7uh2`GHbm z{!}OXNo-FtNA`fpfk5dR7aIZ4d~D7nrhZ^7v%}|OaO6fL9sR^%K~$z~<8e}22-%*( zruFi;MVB(EP_PWJ&TJHH&nzKpxUEeC;DDK?Rr=NRbS%R5(dsq$|O&ptZyRuP1^WwQ30%OB^OxCmyOJM_7!?t?@Em6S~>LT*g!$9t+^ z(twNkIWmUq{#n4La@x_O4*w{t?_R;c&2s~VjdIUamFX<-K?y8*J6qeJ9nl3EFjL5JOI$72{1>v|U(F>%uYiKgrBplT}=>wD1dt?{- zv+npwk?pqDyiuq0g9VlFn-(YR(!LQ`bnl;1sB3$d70qqIW^qP_`~1KPV!9_<9sAhv)Vn zSslFb{Zxp%Hi|V?lPDNvKBGq3*&01Bj#6S_`06;YT^EcRhiNK_a91@jULI`A)x!AW zk@ku~ASk)V>>~11HEU?hO@|}Ds1qS%i_7pof3FF7bvBw+9ZJ+phl$v(Ni6-kUt^U2 
zyl&}7yqK|OQw8$kh$ET2xPsTgzHKi;zeUQjV#MU*E+;H+65(`)<~~Z|n6y#DL2UQ4 z%p47s3p#8Tx#$ea$WX+X^S&)g5kI%-i7-~O`%LN3S=|+#4Y8`F@t>a`xpA2pOhvRT zj2&i0N!FKL6zho~6yZG$i`iAkYz%ompyuVU`u#%*^fN;-4TYj?;f(j&NELluMmjTY zVN0;!5wu(cKn4WmRAc!hR}!WpgFYUXDJPH!3f$iAm;O9sI$*@7t}pzmkZSW-Xbq?M zMQqyCDy5B6ZbFU01jb|esW1!j-WbJ2>zg8Urb2nt)Q=qz$Ir9#ROQibk#w|Mace*S zKUvPOUl@>VJZsZaS$IcvI@>%EO8N_Nn=h967=JBCqSeO!B@6|xZ%3(Bhwf1y{Z(%I ziPQLO1`}0tIfup{1e_>pz7ux8@ENyerCy9vEC11N1vKw7rA^@PgQH@7ru|s-pn-sPM)IT0 z2Tl+Ly762N;Dbtp1~KC>Ulp@sU)QoTq9@MBE5eU}P!w-M-*C%M_E~32!2h<4>(An9EO#!IlOQ9lm`W!cP z(A6=ZEEBNap$Gl$V+gSDj$DTcOH*pL3mV8eZehT*9~N#n%#D#mT(}mM>-v~#+54R0 z*2&siv{qE)VwkF=1zh8}WqlEUx<6zB@Sg=-Bz&N**f8#xytKEXAjX_$*_8qcMgd(t ztv?KGF3h?)b(nwi#Be(m0A^S%Wc@rKKmm1L(jiGxH>27;j z$*#z|kL-Q@u0-6USvLI?reewWoADXvHo0`k9?3q6X{iE4f0$4J#TY*RPVKl#9#@FI zjnIYJ3G1qmBR6uS=AYAPL433j-0wrxW?79Ry)`i;J?Tm%>16n}1y$c@Tj28J3=qnn zyMVe60-)5!GJ?oITy*s+B39XT8^&b}LFKis%D;dx^$e&e5DYKF^cBk}MH$CaD@xN% z8@`$f9F`o#nCQ>&B?{nuusT;AUmS9$@&_{{LrV%tLzSs<6cX+kKR(nG2eI#*HM!#_ zUf^qx-MQHHa@`t3kkt*LC+8v%?fp1&E zHQeQEEg1*o)E_RP;Fp>K76vS}0F=t5Y({_Jz0+)e+t_U5Opf+GLQ~-7!^;r;!pe#& zQee|Q?Gepv{Ftfem6Q3i9@wkojvYwn!ry-C{oY7yT0>(1V`o9>IyRxV0kY4xGv3_C zm)2N>doK*C$$GBkypy;@sY=A^m;9ku#{iz5AK0#2cy?JGfj3bhRd>xK=f_n@VPAow zhT+BDb@P(I*$1|FQ`)0Yk~9?ypgfg_gRJ(%!rFs%Z!S<)oUu8;$+9E}5SaWOX?e=P z6h6lJRX?#>W0S&>8KkGO+g9v)Q9q*a3Ha!8jsX9K$oqHhep4#S0VEMNsXgntRqVSh zNBqpCl#p^iuE(jC#Ca6gO^AK&>sU@|@Bp^!(!nNCD@fW&FID>EVCwlc)IAuuGZuu` zwG&O-BprJ!Lx6gNmOD5;AkV$#zvrGXA~0Avc4uwn4EWWIFL>+%*6Uvi5KDCGy}DHr zz_mWG1P3gEbmiQfX7<*+RXhMy2x0k&oj0ASzCX4>AyD}DOnI9#`yst*A?M!@k0?ML-XX^3xp#~j8-j7hHs z!^(JQoLMc=Zr!j%?uJ%+Q|wJ5NCTcw+2QRdaKFe4;>=1Zw0o}QYItx@d`Xq3JNqe- zpS`~&U&7ptunx2_{p4N&6!QKhbJ`CNfPvv{n?{f$la{EVe z|D}I{>^vq9*gW#CzYoHCgLCzWI!nGdOgCqwl0;fi7Ui}fLtTCb_oxQbrnz^Ea>Xl+ z`DQ@pmYuMJ9n6=MoQT}J@Z6rECyLc^0%oV68!||4-lIQRcH=z1&XGNJ{9;#WGnbZI zC(u{#r2me%`Ig>o57%b@$Y#~(+(*AZ{lxW-Jqz`gEpOcQ>>t*jqMXe90djCXAKt1D zNxUenKw{4Z5_^><{fVLN5|_y3F6sM3$kqoss>)J?N!||@%Nq$x7^G7)S>j2b(d(l- 
zyy0+X-4!Oln{HI~evR@i>}M#yQwkzY76{Ro^vGwfc2I(H(ffWrZLyv!7O}6=^ zcqMqBIrEskEA=oQ;Z(4M{Xt5nCT#7#@aY6FtmGxmAxCU9@trlIiCk;kFN`4RwN7a+19z^NQjy})?#7|HQ&L~>C>^Pk20f2_ZUP5yW;s9Bx1 z7hR6RODHcT?m?9-Gw}Cwr}jn`>xHvQWZU%fK=*iDf5Y(W3w|&O6BH5w#+HY-Uq!_S z){2ewW$fyDJJU;>62?+~)h=HYF1w6u^xdoE8M~8B=lkVeUST%SRh&$Yb|PnKtkBD3 zT?j4g6qh-BGO;D<%0nnt<};}vZ}c-sYi}2P?H;xrZb{3Mt=}9M0AA4*`-h(GLRXJz zR^^4kuIA9C4GI0=c8#IRTlZbB1LnV`=I3p3Ha|U8b>LSX0a69dcjgyKen4g?U=glh zLwf_c=X~3V{dJIGFK7HG29%@gbkyRwOv?Y&%mtcf2@o;$&4J0=Jb0&sL`&^HaZQyK zxtdj~$q^hI*)@}W>o1|2j>E3i^&1UaKgKN^Mqfoo-6URBJDN2}S2E!)Qjm!LrR~8V z^TC;F9*Xf^PL$qvwuC6}m}3x3xWX0Vz0}6C0}aQ-o@X9>a#?xtV-8;Q-M!g5ooH6& z^z2pQ+JsIPbS&MsZU)#KwxvGv#>ne}abr!=4v}Ks+BB(w%G@V^aiRHP0(d>eu3^|3 zvt~%?`SbYA6MMq1pn4z0Ry}{_7wjAR1Sr(gYAY6Nu;5kSL|Xu=uQzV;txOwGf?L@o z3{FcuBz75lK<;CbXTZJ?mjF*~>lak#UGUMR6~9}xBz5&GkA2xU&a|y6pu?suy3X@9 zr3}jR0}a=Qa870VCE6Xw9)rO6U@G!JXrMj6{Z0Vs0kX`8t^FLL8?9+0xr7UDXvxCY zzE-qKkM`km%Md_+hyfH&yzgwEiwKhuo-0)`@sT{PS`^eIE|!(0C}ZJ^I7ymZ{APQk z|G*3RK=j*kZK4P;T0ksPhZe1$nWQ*p-TjFVh?k2qQWs5BtlCk_nh}EbAghXQ|1mR6 zp&H#kiHg33Qg|K2Y;^SRdIl?{o>g_Y3n{HiAq~)Hdu!6@Mt4V=FzRQ&ZBEmLE$X9p zn3gwAfMxZyEaU0lJWK|4O z(qQS~gikw-7r!AXnt2)6JbJ~HcgCmncP^U@J!%~K0Qcayk_IW1N$TL*n>TADpE@ae z`fk!msckV=GE9I)kU5iEh6Ap?dPz-E`9Fk z$%OsV2$s_-u)as4I;;|RU+cdOj%S15cOWX>IiyIz!XN|id-;84EpLe9W%FbTEY*(! 
z&UTclB9s;^RvlLoe-ekcOJEZ@@V!!KGRpfi1&|ZC?jB0lfCsmR7>^rWuP4ZQsx*G$ zBqZoE{S!x#C=XN!JfYW@hGxQcBOQyA26nMvObyu#1yJw2V>z$Pz9Ese`{`gDP1>R2 zT+zl83OEB@0F8%4UiY=-W(Q0grx#%3J#DV`1gYI;Xr$6YE76;lGp26=h?O?(rRX=Q zW8OeXkZ2NfT%}VQ`qlg^FR&C>OCG}cd8lI+F>74;I)6AI6#}p%_CYMV8#F2it0J}d z5*8{97w?WQaQSB>93TUy12(6CvVW3vIrSsOOKy0#{W%a8@}HyF5^fD_OnArcy`hI8 z9*FW`5)n%~7Blr`NFy4X6*UoVG}57+?wM^+x{-L$Ul()v32GiJ4VmbsiRtX>#^#3S zfZ$p~e#lKAsveQC4BX`V(0h(?740_g{I*+nhe;>}No(>dj0rnnIqr(Z(ei3sZZ1i( z?xD(74kGt*M2f&n^;?AHN42a6Ufo=Xs+YCPOQvTO%{<4<0LE27Z>Qb3$(8PHN{urM zK6K^N8sd>@Z?Xk~A~O=0+f!;@o!|M+g8oBvlu5F)$nlYem#^Qzys%n#crY<$s0bPR zyD+(*iH7}!bXYNqu&0F1%i&PX4=bUEAvL78 z@WRhUD40Wq)vYe8a;a)qY8`O&W+FkzzOxH3mcZh(?)N?7C2-1zb7#I{v4J@f=l1Q~ zHxyR>(k!$Da3i&J@yx9zS)N6-GLJiVn>yJ9P5nYDqI$s@&BX053fWz{%(sQ|($Upc zmJJOz>TPY_1ARtnXIlPR2w=KJj6aN*jUPu$H4(4Lj6)l3N{3Vy*P7D?sl`>l%bO~$ zsIrnqs<}(>H2-Rgn#^vzOYME{&i z5V>@A^6ECW_Y%Vg%Gq44xwF z&)sA|8UvDB5|>TrD2UhI(xxoC8;ICd3Mj1L|KSIun;nRQX~*0ZHyi^#^`fjq^~kS0 zF9~VqDVSi&kT-LOG>pYa=Ypt z#&xP^S*FynwBc!Z2pRRz8O+<9+EqcHlX52myKbQ8xvX3(xDi+JMGUZF^_weaN@=7$ z1FK1X8y^*JlvX#;2GP>xBz`^#xxX4FB<5Z8LW|?V0ywaMb%GYL7<_@~`l)LYBz;zk zEy_&4B})m{He)uw&d)Lwzp&NN6q9d_cbpjXjSNcVT#r6*@e7oM+cAba9| z%b#aiZ?kbWO+rW3CQ06Jwgs*4jODW)?k)dpJ)(>?XPkbsk&3|-^g&#?C7650XwxXJ z{nEG-cUSo~i=rsXY>izUKydqh&bGKgdX)vKrV6&eOlG+87w!84UPnvEgq^~#$g0t_ zHLbP#_fe~zLUHzjLK8oq$Xe*l?0$J@s?M>MhpsL83#P~n5XdmHMTheZV6I`PzfQ*; z+qF$(qy;vwRX*Gcrr?0;ROE+`Rc|9ta4pt^$&_-hqP8^Gd~|eQ%V~03Ymi_g5-OVK z#clcs<2H3+HHg4;BCCMA`))al@`#96`_i?R3o;&LI5+E&b8LYee}=1Ir|*@$5UUY< z^?mT;015>28OZ^YwQ;Al!~0=ot40L=gT;dAYe@6lCe_yeaP}14`w_kKZ@rU%%iS11 zu-T{X5oy81&wvI{I&$7U6d!wK6MN@tyl>6515Kd#VUt0O4v?^aOHN?I`~=T+q6aTK zw@FC%ni?PBL~3JI1xg1=s{xsj)89=7b99m11ZD7BV&##`A661N^sli0bts~$u(Lv9 z?2*#fgg7BBF<|}tg?12~`XHR24uQoqu(A+%jwG2eUr9aaJh zrK}@BBy^UxmP>*UHix(Ir?IcQv(Ad=>=S89+BIW%%|DDsL&hqF1#d6B(l=+XqT9%V zCx}8=_1;4{H-}$tBF}PZ71e;v(3oxJI!q%W^$e7&(*lo3s})1HyM5n@+^i>$CRt94 z7QDt~gXEO|8SrbOeG&Yped*XN84DAMGM^0lX{He~n2&peDiNeUi@yh0}D&{fuLkL!6T5zqJOP?i$P 
zf}wk=>StMi3gLgft-32h+AU{)(1f9~K0|}KOqK;6aujafdO7R6eW)yD?4RT@>0n7b z2OhT-7)>Mk<9?_zLnGZ@nYo}P3~xuKBMX<*-b$RqyVlM1nue!Z=Fg0Ad34g#*0J_9 zv!FHX!hC_K+8FXKr)RX{G;5o`tk5XT?W5yew%zd}` zP$;jbs)7t2s8zQWuC;-giYpiyYT8$a+6Wy&$DB|Ud!5xxH&Vsu!$E+sCYtQ&U}i07 zN?(!&NcSmebh)H~=5cSdLY&120eR{b`Ns8?>1Ou5*@qQ}BI@x1MM9ruN}{zy;GqdcGmMyJtV zTyS%Mhr{7Y)K#4-G*&hdO1N=I3(WxVi)5jvNOmc)4l-P=RQ@8p{CV@lyiwyxd7s_q zHr9y#}wqMUBpg<3ielNKyUKMy$j-|%)l4{#k#9gcXy7!Tul1WfiM=n271sl|| zsD}Ikv%E+9{yREj_|2K($~i7E&?y# zp>|VHSa0FMAOJ)Vt}aF!!etL!P2XP%UW}&3J6EGlr5Q9uK4ffe{imiQ5nIL2yl^4k z>VIl5C48q3@h{9ytGCwsB+%@=VZ*rN4p^(XcCLEK6^hohN1XyC`+@7i^JBMcRjA)I zll5F3;mV^c``4-TSsfUqBSP_0aogXHY(v_V2J=WSFD$m-616$$9YrHF2H(2Kyq>ol zr-maX1N9=eZ)V{*Y;Ecqgvbe@AF(WBmX@fU`Q1_r^a0JmJiEgkq5Xl%kn3cBL&aoj z22MeWNu3a zIO)D_lJ8ZQn~g+{72&u|lhbV>`No?yP_bT{YKG)QrqBry&7iF}>23BD28ZqQYhJre za=_cKa44f~WyA=^?Z7Ql13tbBtj$w%REBrW>@tHfX)17*f{vZ!)+vIQW7pq#Yz)CtD3&7iTbYSDT6V#S`7hWN z(hyTJFPUYh%iYf|pcr>W7tLIxoE>x`d>Z-$YRGfq_BWP##&oPVSCX2AIXd*#4=wZb zRXe8Y!fh#1S>c6K&<*~LArcbzbUgU%t~5Y@nf^xcg-ld4SU0?*nTAgFT~NHn?Kj_| zS2nlFSSozYojAIB;?h-s!MB$`uF6kAq+pC8D95}#`({$bR~mpHOz?1tDn^K!*i4w~ zlr(#+ogA}yg7x8rhA7>LO($Ta914d&^Bq#~BWsl)Gj$<^E#cTTT%s3`U;Tnc8KV=c z20V`;4A_J1-(5@U@pZ@4+^SQPI&tJBGe6j}ZX~0;yvIHXqmc7PlxrGFM@PXQtT>;m z#%W#mVqOyGcwf3sj$KtuH<|6!Wt&}4AzjH{mKaUL07ej!iddfhBOSbahdxEcaq`>M z3uu<)2OnbX47uBeO_;IZ=@)X$}iG>6H2tc*7${7)+sV@w60zw}X{l3*UrOlb4@< z6p1}V$6P=y-oLoGZuSYhh;Ou(bi%zPvO1r1^PV6r@!bCZX&@F^H;4Cubx7e1od60b z1&E<5((AOXEBz7`2fu(ubF|cD$xNn*97SRH{S;g!`&~J0k+XQE7N#SJ3qyK;C4(%B zc?JybkS%#>Y}HLnItg zcz=hP*ss>3b-y{(3*hE6=GDxTmW3B`C!>kFteNYO%KWIBy(465I(}(v_&F)>)aV<# zf&M@iAG-@M8on>RcdYTP!9%+s)%IZ3TYq!VaOJ}0geye!{G@Z{)G539Y7l|7jOp1I z(46e$8Al^>%rm5W%FrfW#jVIDUa+TKxV#E)L3;mxD7_v0ga$Es17zBo?v93*J56^y z*%?i%{{H?^$DI1+@?Fhu#m)H9fBrUkc;R6kmbG6Qh$2HaTD{rde~seZb6 z*xKhe8{klh1VSRs?xpjjZ5TcyZ~1_OvhpW^cwDX$Rjjn+7ECAOB~ht;5u}kCZpCru z1sEgd^>LOYoi!BJIKp-TVA0%j5$WH z3NEN)HsGGNaZnjScU`o}pwS~+h<5POxGg91;_C|^8I^jwUweK`y!iT=<8s 
zFG|g_-cJzzX=3$}n?u@u&6sn#4={FP42HnG)a0kEn4{a39Z5_@@sLmNbp8V2_|K#C zRj;e-B61bHV8OtkznZ9X-IE`35Z2{z?SF!M_nqx6NJB{>OH8mlbk_Y4#%nxBK+0^9 zv}lt^nmRycPyl@m)7@ja4w7>*?WP4TOUQ`T($X7HVF2e?R$~@e2XA5R@EDBR6mu#J zm(sU?*=(0m;-$>Exd&<9jh7rH{f@Yc^vLp}EH*^094#_fW`5*Gc-JIK_bkrC7OFz_ zR_8n6O{wcj8yUG?dY3tvUItZs+6DVk^Bp&tQ>W%C(f|zXCZ6@3_{9F%Jk{<|OlkuL z-&`9uGj?tyqdpI?kpMv&OA&GlhOHT1wmvqDGM=T^J5Twkc{yOhD!V>zLEk5#9!xh4 zlUw;wChoWqj`>F3Qp}$sgnQa7r-Yp{=Dl?KK{e~p7cXo#3C&!~IEw;1*Qw0{UOuW@ z*+MV+3U1D70IfX(;%#PnXtM2*su>$AU6l!dWjnjY7IO>#p#3Gs!8iuww5CAj(Xe&r zBfPnAORi?O6E{wuCb4PYA*#UYEsG%qrGw(e=QkPuJ{+83Z1Y&xu5GY&%%y{PwWEC+J5qs_Nv~7sAiAqlNMpP?XB;d7`_iHO zUBGqR(YESBFd(bJ(SePPR`h<6a79yAb+F0*VCE@NBm`fs-!jM>iv;uB>-EE*YtVUS zyJTB#hz8uz4@yY*EBuc%bRzMiruS7G<2RN{6ExSCDk>p;SoKO<#JN(T0jKM9^6tr- z=@TGRdrJpl9`VytCZ2Getdv0~@g57{p}$VhNp!SQ%rW&VN1fN#m}LwOP|YC~qE&mQ zBDIzl_V*#m`yqVDLb467{i=K#F+;jGccNYkU~Ii#Ge{USwWpKr=&aYO7sWis1KXKK z9;1CsiUDUJh|?*MD~`auKQwkpkO8|SvjOWKLrd5+%%&3~A|yYZ&Q+$O1tA1-Ji z(ndV!)O6L?QyU8Q}suwn^CSZ`x!If?^bk zUUth%DQBlc54bge3(lZl#Vk8yZS~lv3ACE}gP{B}eo2ERv9CQp9}Re{^SF~d2v1g- zN?^NMD7$BHV5@U)-7KXMK8-0z?9%#3Z@O9~BK{-?&v9Qm8CbbS#%qZ7^322Np5O8B z1rugG)*YjWf(mJj|6(>pF`iyZ4+j8HzvNI=-Bh0Em_t+IEN^De*qx{m6c#6A^riNn|q=8UbW#uhAPe z^3%}Wbj&g7DlOA%0^H=8KaF9JQ1`5p-~1aj83VT-A*E^p46oM3suDBaU$Eu{y07Hu zaLy0QS!iZaxEsEr5`#Sj(}2!eLp5+fGV_p0CWUKl$7g5=%D)V;ZtQ>$6JH~(z9Qhz z8TjKl62^?4oJ3_tw9kE0RZH}UUsS7@L((yWneFCoba#&;qL6}A!o-4)?iA_65aw%{ za$^|E5Ctl56hq*=ge*ul#Jrq+=`kS1-iEAfL-JZ{+>jVN=tOyFGNNg%YlUk_nU^Fk z2MFOAEl%BEIcxs8%C@635sv1-e*M$K+r|00%M@X0+Uqv6txEbK!7`-WI&vOlrNXpp z-ECpYik=;!FRK^Ua^4(6%A^c65CS>vR5WS40>zH5@&QXQM&BR62OhY873V)cJ+h~E z1k<}VrwD?;{I+4s?0p+^ta~qb33UVFl+$*_J+VT;G&1?B@&{RCKIudRg$U~wfBKXm z!T%1wGQ1^EaFQW+FRLN4IElqB9brop@Cw0K5uZ5ZmQv8M5sm{W7K1nv)RyI2hvwCl z>%}<0(hvm*kT#3ZTydCT4KDGz4zu*?-=W9d^x^#FQW$BdjDxn@s3=bcUdc1O3PwRt zg8MBvMI4x*tqy%Ld(2M)-u*xfn`u<^tU2D>jqBHidgbX*gw1C(wU%FvOVsy=(AfM#H``VL z*GT}Tv9vWU7s)6~GTSg=(vw3}5oAAxkdt9(MIqyk4lHhjduBMV0-28WqMpbropiJI 
zyw;ZCDknOZ!DWT}I10Owzw84wUB}<2?fVnb?MmieDTP3Y4>@}CtF3F$j?*6gsKf5( z=?zbn0Alr8y`eaE!F^C2dVCoxdldo=A;q=KG~j|JUYcn4l<=vk@wUq_&+Ts9AgEEP z&C2rUZP4{t+H>|s3w3M@U+0n11~TQYK|vjvFSp}JlTy@TZ3q8RcJZjr)$BY5XsQv6 zRw`8?fDtFLxaH=fNA+&(w^LE6)i|-lq>o}k1nCRx!r=0KQ)EcJ^+4nK+jMw3#1G#s z2t?UmUxPU?B(i9{aK$!%@sq_mT6Y5kIIkXaUT~~wzryx80V1I~g65;@b5S-yld~Fh zlIFX6TXbQVO0@h_dj49;L?6b2V_;O=PRt&+wp+WX!S1owdloLGG8LG^Fo#^ z;Mw-i_DPH8ev6IM@Bi{><_Jbe6Zb|fJDHA8+W!yE$@P{~IA3jx|6c56q2^(GbsZH> ziZT(D);viY6TIgJje6XemIz;KRL)ymxP{WrYfDR1>iq>Pyja76ti&}sY1cIWU_28n z;^QrdS5~GMUb>sigCsxiV19oq{hi0qcEbpl+R_ck8@*Xd&o1y6zfsFOks>SM$Cc-`M_bG3OPcNs9T^~3hocH1kO7qt{L0a7q&^vKbC3W8m z;8tckyNjZ2n7`W6T|>DrTQ2O_55XY?K`r#w;Ez%t0Y63ooq=w!C+}K^!E-)G8}~Fl zwmxSx<^s>lRT5m%fA6UWPWACrr6hDc3tsG&`jMKLwnzHs-{D|9|X%JQm&M zZ+vzEg#cMw@T=_=8_PoFgD66gw>Pa|a8V(naTyO12Lq!KMGQ{g-kRz&)<*X~!2hxv zmcaEo>@$)W;d@A}DR!Rnk5T(0oyA8!cv&lGkH=S%lQ<-c!4@F5PrP$_h4A9;oTuZX6dfeIbL91Cxhoa)^kdn z#cFioR2Jff9HQ6mioWgd#KU+DwLhiN`lo;~20fE;kptVviFv_rGh zkH?jJ=Y$Ac3YRH*6|ETDSGDTv`KLSz?1$b6VPd7^v3~|;ODwy;n%xS3BYNYYY$U#& zby)2%!0%SpuNPrZ$$udPtYR2X*pA&B^vnFu!?j?TO~u%Z%$p9V?(e>Sqh zEwLs*+Af9+2S2jqFOjx&Cpc0fW2ZA2rJA==2ro?U3GS$Z?zlz7j5S@IdT&1QB z-gMT?rm1-UrT2`l)stRgs^FhpeAZB_Mr&1+evdNxD;Cr)Hy6J6{wuDH0lEU|ZRKHn zwKFtpO9M15L}CQ~dM>V^cVk8xG4-Bk_RYHU43?pQd%;Ltfv4{vp}e*}o`KwZ9>iDo zJ1g8uMTkpXYG>^(`XDZhzq7x!^?93DZD^VD>U;-M88A$lk`{0|kn1^4CDz(?Xg2y8 ze6#KM^k)1BVNn19BWFrtwM_McNs#@)B@MMm` zufs;K6kL^ure3Qh1d`U`J?e?qZhy`_?bbz>Q2c>%xeG4S^&X)5hSbh3BI7DFN&;4^ zqHI)H>Fa(BCe#+w;&L*cWsVC49!z@|9POzWiUoTccU#n+^ICes=4qia>8@~@z2cKn z&xbRqzAwPD*}OXPLGQ4A=iN==Ht}Tmew4zrC_aJ<>womY4fu}vL2{(FT7ba$OCeM* zF?Fyq2jJ!LfA3xQfV6j!PK8SpK%&VR6A#4*feZEMW3tVMq#01+fe$+I_rT$>1-Xub z*?LaY?}WTH>fXPF*UiDB^%R=kIe2&_#J@tG+J)p35pwq!{b7!w@jigrhSJ?r@aM>l z?{>nnZ_s*y@?br&`7;aLd*?V(xMH;WoY1!+;?(QKOnf@4$;Uk56hcZZ*g=2d)cau? 
z&3?+%PqL!}E)S6~CD?HG7siLRNI?LXPQN{lajv^Sq0t9FUf(h3UH$u{_>ji3`eQ7g zYZ+qEZxvXf$71tJIqded8&gq#V-$udePg_2|5jQFT~7*n4Nw>5%Qy7Vp=OBZ8!j=+ zNapnolp^ns_e3J$AoKm@^wRTy;uG%UDb(9|4yR@})6;apgUiQicV9xdBVXKidLv!H z7Jq?)V}Rn6OgSXQbW4uC3YUbPp83z+3W2rpU(t-Od#EQ4mX__u__*Lv@@un&?K69h zU`mM6Di(ylLj2s0Z*s`yG*K0mF$_Ck7d6ju6l7JP@=B%K-1rx7SD|uHSS&-A%TMG0 z%kiG`=r-_Z$+9V36Wt(f8C*%+Z7M=OeCOZ)V19tVv2w^M4ThU9;LYXB?LgeuRM^f= znz2qtdh?~41K;El!TTkeviU(}|FOVdxrx~o+888g*2gj!Q$6d@bkhd)7^}4p8OP|$ z2Lb9CCVyB_mHHoJJ@!E(NpP3qfDZ^XnufdbZYhAAc~*pfBPDV z+rRj7?f>>W|NYv5|J02Cmp}4<^3tjQ&u-iL|3CQudYb=F-mk`1oF7A|tpC2a4T=8( zJzD3+~+lppV_d${iO|9t8NssG2PW?+z<0>6j8FtxMPUOeOylx+G?NG)MdNaE7s z)rGO4RD#T$A#Vm)QWo9e0s1TqE4PSO&4ZvzYxQCfELgmJ;@CFUo+%Fy(5JE#sO@-s za(qlq;Mz`NbpAewp}+gyh-^Mr^mXE2fO`AhD8ERIsw6#L3%X11PYH}hMSwxNeKVL4QWVZ+@*U3_avn$ACymO$GdExK* z?J%vXPuw&+WghMCMhi#&3p8+alZ_?PWhvlj(G`CwH0b}L>N>!&?)SHf5~(CAm82yz zGO{A2l$|8Ik|G%)dnFo#w(OO?%HE2GL^kn|q|B_$%>Vt>`=0+f&vl*aI?nNWpXd2~ zzn^j6_vijB<0r$iy3mZ&5i2?RKuf)Owi#lCiT{|n+AR`};*Nn7ABrUE3(%(=`|3QV zL`G~(R?YcLap#(XC?^*Dd|wR7C^0f+XMXM$*9i>R%k7^_go=s)+{8xp@5_G8|0Kk4 zLZxExT4~AY|8Wm+^A)i3j6D$*p zdi)e;KJGIi=rQ!qu51Yc>fk+d6lrxi76x$nl8Xq&*Y3`K14VcsunbIRbDVm!){@wk zdS0V?wvgICRfGDn0`KwdYT)SmP%AzFSr8E;|K!d)Q{2aXN_{6L=wxi7W9wWR zLdq%d=XpHlF#9rmQkE7ZVs=#)_HKV)GC%%;y8rxW@1-mVYlVD>R1`5Ux-DIEf+$Dv zPaf>7G9i>Jv8lKZIFBrd= zo4P5d;i7%$@17xSt;`q0-=JL%H;#vROP|mJvi8QlJ^|w@6Iq{K>JS$p+8(;I5qnaG z>mrY?%Ddim`%nw|fuA@<0f^4QBOC4g@MZ7WmWG9dWMv&+keqbD?SmLqe{Ooo5d8UG zP9CmKP2v)PKx1w8?-%}(F~lzE-yiTU{DPdd(YXgaPB?lJP@5!ONd=}%NRu{q5I`d_ z)R8A_F!n@k;MbSDclYiCv@jpR2~h_ z_!q6uK;?R8#AkgD1TFy-z!Cf37^7XTOVJVaFY&>6xWQ6Mj8>8?-XT71Xs(`u>>bln zx^)41p{s0UM#cT>w4P0U=nGyZctqB&jK%Ps_a35d!&K3dB8qjQp|n2XVHD6ZrbRY~cM0c%*xt&%t0$ZEZ>2@MV(H zRtt~;^vus`%ifdG2M6N^jl_5he8XeR#+}%I{trNTHZS{SGB&yp6nPq;iS?y7F$7|P zcVDMkMx1I8;j6O#y>skz=R}rhhrgePYAX@t1%A1<4tuhQ?UPE@Zekn)vYi+5@cc^NdO85lr8IMl?i0r*#O$2Q^;{`Yn4 ze|w^Hxge)kXQkt>up&u&Lsb1CuK6G=`er5v2PQr9aXvmFHaNFJX9XgCi#wp@{i|6J 
z)Slppo&toXGrJE4lb-tMF_no$bA2|E<&$YKL}ZbUmbMt_4A*(b@N5Puvd7~Em{}_{ z|9p2D7-&r~jZf*+Wc84S4*?dIax@e3tfkkwzuLWjescH1oIyD;I84L3tyh=u{b5ep zwnSG!$^b)Zz)=Lv2Zxp2N!D?%iSFDEpcAfXz-D6zrL<;TLJP{jwza#!TSt>=0MDZ7 z$G5}hFyc+tGvrJ*A4r1&;S3Rpjb$&p8%T`lJQ^@C!-qMJH8$2B9CL{n#1J2eK2$@D3L$1GVD0u7ZR zh_Stw_6y>X7Q(Po#^VhZ193x5ZDra)W{L0&NPVz@7TUe1z>EvFd)$Dmeyn{hRiV=C zEv$Ch3&8UU5~|hBqHV4*;gP6vZ6qQ9HvD}Z^ZPqE_0NyGxsdkb4StxMLDbmC-h2R~ zw`80qXA+hQ#nlv2k-yf0;JMRE2ZiJA&AoPl46m)oMCBfwmPh!f6xcB)#&v4vIk2hE zaAao-^{6i1hHz(T-9c#@R^cESQqwT_S^Ry@#Dx;Ip@pA&7(;v~7`+W0DoEo5$<^>%{GZ2W8U;q3HS0#Yru_x0I#{pYnFBA}SSR=$j zqOiEWoZ(H~w^T&5f-EQcKq zlV!eJ4j#vEb^q7HW%}RqNKe!vMItzrYkDUXpn@r!Y&8XWi{LZVQ8G-pU*yt4T!9KC zO^)_w7#nqW3%LFP$(QG7A=_$t`o{=|ADL7ncPbMH)jmW;N{2MKlKwG~Wwc7I!}X{y z{($L43$n|65yn=1FHmG}P#=Va$f-_Elrm4jguxU-&iVhp%!_pYQ=waS0;@Gw+PEe& z=d0r<+i)KQ4vZZ^914b!?I^8OAFhp`)%qHvEA@zxTev4(QBC(mjc>GDz!$ z&JQjhx9B1ZR!Jj>xfC)hh`73&DD-j5uHW27=B>yz9m^$sBDQ`O+Cn~mOtr{%MiSbT zOJBF{kXkn{r3m>j7=IL6w{DGS(XsEPIS-~)4}kBbg7Btm$j+PgnSxd)Bj*jlXUAvj za9d)~c0k4C{;dgh9%w$5CvM6eSMX#m7K?5dvpRM$2sK{d$T7RJG#Ie&CmMe2_nR&> z`}bqN75eW@<^tttKS}z%d}9_<>^gWmJJF1YG95roV6=Rf&7N6Qk*9#R5<9}*2v!O> z*>C%7qMGqomjFkM{*z_Cs~_jOEiSl~-LN_R3zQl!nHZA9JjEk?9vRWCYvUqM+ak8@ zx*6+MzaZA=?i_aB+_y0EHM%;ISY*2NAY$2E z>-AYVn5>iYS+TtN1HNiS`$zf)^P#o25Of>|`NE6UNdb`IoC0KtU4e7E&YVZs>$bB- z`%WOrv7qC}4XV+1EKVI9g>&wKo7OXdTX@E&PKOU8jr;&7(fy}?vA)`M{(i&My7`di z4z>9?evrZ&xz8%1vE8Q}-Kom@;}yu{F^3Y!Uzzl~ykP%6E z@tC@bmk?r#eKDs*XENV>8axp%PcV z!q>(!ZdheT^dY9V#v~^)M6h)7`|eg-7e4KM#Ke`mmxF-^!dmaV^3YG~*9POm z$2*R45OQjat~dB*YKz%DdIwmJUi|g0Y4|a|`i!~5f2N(L(U2<+ z{^!ya@;4pTjK1*iSwyIf0RSNgN6EkDC}hZT+8Yuk-QtBA(*O!m6md54wFy?vY?{O{ zc)Mb@4_7EVLXnB}9HiaTyT^}~T@bqe=_%B?LHCaP*MF=6H}M1!FG$8ezDAXa;ekEW zuT`G>1~ezX6xm({|GD?fJ58I5_C(DwMO3K-;iz5E^Ix0P(IvA5fr#j3vzb;2pW~}} zLyup3L@MRUxv|K{CO3YUEEL>+-7D=Pb#DaP@dq%oX&=1j`LOoea+hkNR{5jDAhcaS zk|_Xo3B2I(gomZoB0E{o7z7Y#<>#=s#)wy3_^zS3cD{dhg^oT2IiQ76$n`ZPZlUKF 
zP}&8WM!w-`iY!u9OYq@RU4Mt9`~?b_khWLTFpwC9s$Y9Aya$V^T+m?TJ?qL;Crb^76tPKu^qc`=qJ-$&&CzHGMVRzSgCb!w^7v)rktJ&U`V%os=fD~UG}$>Ef*lhRs42Gc|&u_NxQE-(F*$>56pI< zl`XWAWT369S?)pz(r$+@G)nA;Jbzy?xis!_6}l%f@O5n95ae=?kz;~~4+9uFaKi;_ z2HqM7g`DU}8n@mlzS9Lxbppd3Gg~lichCV|{R=%Vnk$x(4w^D-9@&dW6nh)ci1w4C z@;lb;sCyvjLhLG?!z-a5@=W}TLPDi~U7X>y&AtPt%Xn|?oanvB=H`0H_211AE z?XD9l7$FsP>zdi(fJ8mIDn)eM8QtWBHugzoy+v-XG(~$MIr^075oe+>EUFLYJY?ko zl)H@$&NCACzHO`sk4Z&)^UhO4PIRt+^jjejL>>siD9K%E9_>tdG$H)JDes=e-yH5? zy_T33z<;>J!&kqHTs}{gWBc$uK!qT1pBfZlEQ)7YBM{x zDMno(+E<|S^^**Q=hBFr%vx#1D=HNX0+f4-REJB4k0ePs5j$}6I|f*~-9pXDge6UX zU+%+yS?Iq%c|M0uT!Rc5*R$^n6`Fv|@FODH+i;;TM{Y^koQ~wbhDr-9%ll*MbDB^) z@4@nKb1zodgdBcP!%BP&3p*6CN!C+&`jI{bqT-+uH3RDJl0$fuLXotX#{K=#nusT$ za`AZ0x}&y(nL+jUhhx`6B`K+T5@fJf^OjhY_(RRIjX>x6Gpv6BiKh0(fRVvri8~8s zB*k>+;o7P@E0sw~V`U+ed=DOs*}( zp4<|@>@~dm1R{49Csql`TlCgMb{Blab8)uvBgty&A(a2}iZ7b&aTDuswZiob8D4J3 zE(WMI_x`w=j`aV0nDZf7Vn8`ka?^P}WqBbRx*{L4m%!sB=2F`77iI#B(Ud^XTN@oN zjDRG1VX%0br&;^RQN}o>;O+^-T{iD9o9~&#M@m#~KwA%g^%A}v*RgqJMrU9NQ0Qb@)SVa!!(7$T3Q(QaVl*b51{0|7@#VLDxhoIwV7 zemi_SsTl1175kU^8WJ9Nl*Nx9oQBiO18Bl#CdjzQGvtAw2zrD1o+jv%4?29Ij1NZu zwcM+Ee8tJRlPmhhr``u&cyW_9x{h6iVcpofffXIJq@FW={owYNzK*-nia|mri|2kT zr0vk`h$e{c@>%EYu8G41B>m$HIjI=ceD0PkjHY@5#+H^gt@L{IH;7>y0^$rh%KpVaKC04n&`M@CtM*5BOMSUG5ZkhfEQ64hg?j2sAc7p71ab0Ei>kH$9 zu`-v)Ij#PFDebar#1eNj4kw@j>7vM1GG!exOGGwhcB>6KT^4~BWVgUOAd&q!!>)^K z-UJ=E;PKXzDtRdinJE4AJJ0ai5yFOp6{ zWaoya`6R4Rm$&aeQA4vWYJ)4JM~BDko?+=@c+P)HgSqfSi_>=hW(+E-BmL0i{)XuB z%gStMblMa%$p~Q8u`o=eo&rYp12q9zz8!orR$NVlE0Tqk1vaC@ zK#}&&LQ4MDm9-#nm_(kT2OotLM5G)R%4{(sNsxU%Uw;#a2(`RcggK;xik5qK+?$;JHMaYxCVkfudt$!_)@j;qZ<5ktpP2ZRAY4+iEt6S8LUVQ zp!6JuGxzGJwKZH8MYiH<5?td&$2!6%gd0A#Yd4eiG@h&btZSzdiRl*^BAc9jx-#5Y zs>|d+IpB#GhyiMj>#%z!jtD^uj}r(?$fWLhPo!B>Po!`q8vZKNOf-c#ehDQ^tjRBq zglTVm;~@Jdqk-9_+HzkFC(4}eVbw*E%(W^Axh8Q7QoS9o@yLx(e5i@r0T(wyVbEQ0 zQ&iSr!etNFxP?xKW+q$1)P?!2%~K;N3815PQj`fOc0D`JN_G)lq?>Qcdd#OMK2{Lb zF+bixmRqJKt<>%933grC+aY^K)thq<529Xs~CXGb8ACIWrE>&pzYTG|u- 
zM!v%uUlb2|{+u=oq#svPCdt_aB0s0hXD&1qbIpGqC9Xx)%jSzCFJ#ZK4qVA8=#;z) z32E$QXVh!-LI*S-mOZREEMOFbE^ne_zv^{)h<$J%F!-H=|aO|Gpe6*%~^nyRID7Lu$j$s-x6lb^^kIwPMO%+?F-xI<$;W<&A2bEvA z9TX2#UA!G7>-Is2>ckFKXVqe72kY%ZocqykZ@4eCo|Eh5cGz(X&JK+E-Ct?fd$f#w ze7+C1=pr{=38ua}(VhT2uIRYlf)Q^#w z_0boANjYU{LiS1>CF2Ko3bLM#^+&}Q2!Bh(6&uEd>t1e?fdAZq;RM!~al;vweyR&R zSyAD)QhR%h(d$<^d9QTII}MO_@%EP9(Kd$9_3n)GAvIGAOZyv;)&g$p_hmJ@OGs$% zB9+f?Jhl9`4}6>-wzIw#g#Ds?j)`9MjUtV~=#J=Lviil;6}MvNKQEArVe?rYRiBs0WSAZgQ_d;D`LFSv(4zi*y7^Gj z<$&s*>^wH#p|EepxW<3wW#e9d<*=;iS!2$lz>D5)LYvrQRTsa~z~uEEiJP(nmj?S5 z{Bev`r0eqwhfvO#$DP0Ss=TLWRb|$aZ$t-W=5}^pz*YC7FY7FY=YyXgyyAC4+6^$; z7AeO}2OyITieH**4l!7MgqZ&450`zl=bxj0XdQ|>vMt{N9`;dlZ8pn?;oDQWy9p%q zCrmN(PBjmrtn;7j=v_1&J{*w}|9N55;z5+VJI$Nh>|9^ueD`?2*N@u!YHG9L$=c1r zs~t3w)eM{(jH7Db8Y}X=+G-kg?~CEEH-7u#OZMd)bIk`!-pz`x9I7mus%&%mJn!5j z&i$}pxH1GIzQbEwinrOdUH9_wHC%Jgg@yP2eVIqo$CgcJF!U z(`w)4Tkr2S*R{87gY2B-`-pDO06F6$$$^J4njw2(JLaS>w%XJysawAj=l0O$VBa}2 zhw|&orP)pT_B%swG--KN3p5)4$jjVgc$V7peQ_|8$X3%hT4)}Nrd{QRifCI+m8i~2 zGHql?x<4!V{=u}lT*|BWtbI8p^JXmR%(M5_7YM$6eki9V06mTHpMue`Y~5<0(1)C| zLv<{_wWYp;D^yt3_R;$iNbzkQe+CQ|gM5QWf1OLiRuQS@@G9Qju~PROvgE%R-=~@2 zX^M7OcCHg>>8}UN&P^Q9uNu_T_yT%8!?K}}y|v}T)~+*>7RO#ThA+icsm|{9 ze_5aBWqM;HqiSQ_gGBq-v>rQ+$`&{$g?Je->#4kx$fxD-y#j$eUJ+(O^O_R>!09)* zJ=jvQ?)|ctfs2^rz*Nbi3rmgqF@;+!HBKf`W|vtF$o z4n3iL_}5>d_kVJnv2zV(kZqE3#-KKdDVZO%=)=Z0c z0~fWB$M_(Uyf0rb=RI_)`w{|fU->To5X1N{0dhwi#=E+284+QU*_Hi|!5Q;$JH2WWHuJl!M9?677^rL(-sc>i}$oQ26LtchN@Vb!r-9b_bcp zO3!t!vt~Fw9d%CnCykLyEk9TB-|9G!lJW8uo{nVBpizL{Gw8eS zogMUVH6aJ6%fz$woqcNve$m@Kp<{l?&dbA_gSHa`F$%oQo%iHqHv(*UqSw>yiMLG6 zsBc{*oD!78S}Q`i_SiSPYIVuX@43xWXC%8=R@>jta+EUVqB2FZsWPbN`P&5p+Fq%q zQo-uE<01D^DvcXP+H&0Qf1-|}Ld_e0(Set3csoZ(Ub*%HBsZVmi+q!Ac4m}m4mnV3 z;WD7}kNzjZclGPP$=rQqX5(2MqsR@6;|;7>uw)=Ukeos1l?C~=R05aUM*CX1O);1= zPSc>=V-Jqnd9GxpF-<#*zuy4kR~f3gQC<~tF6a{bfGsIs3AuipMVo0SFIHA*veZr1 z99@^?Ba;lKXSYPLB=C<<%fn#9ru}&S{Ul%Bk4LgQ*I##rxIxQja_p1ALb_@Jnz-DN z=K`YKDVe8PR2^*tVf9M|Hkdtw(K 
zCv3Ls(q$QK(y=fyeI)sbCqvP8m_NTaD&cIiME0g5Cza4NU1lkjp_q>6Wr}e$?CSn1 zHTfmFk)myu)mX=PVw+mLnZgP)02C88WelP}vUM)r+fx@4px@e`b@4vabvODqGj7>% zqULc=mLIvK&Os$0tLWodAvyi+R)W4#$RPpZ6k_cXW@2*pT;sBHazCnW4z{0G;MIr^ zrAnS*G_8EDpu^P4YTJIJnTI;*F70iLb;D~Sr0jn_)vQD48N`kC0_j(K?yvg6OmS&n z6z;dsbF?-wXv4%r+akr}#xOnP7Fjy-?Dlyf&5e9%2wlQ0XFboG<^6n?HyGyFpK$7E zv2i$fc2!6AH|QYh(A|W1eHkp8`l>}+RXjTs>^fD@5ra{#7uoACtOaL#HC>5%**wCu zvI1Y;C8eqDN7PFGR9U2Z%*PT3Gp|yc;$N>muqgo9GB7XT&Hlo%-KOS90?RW!+m>H5 zEUV9-A3BT=&r``+otTswv@qqim~(mT`?am+xfjHtv9PARd;ZXFG4m|7Q>e>ii&pLl z-nV&VL0V)sV?*->0sS8|gKwoayfH%8tDLoA=PiNk?!52b+;lX{LfsbrZ2>9MBPrE$ac@DCyN zPv98xf}Sro;_^*NP#RUG6IX>(o0FcGVR~UYsH$^x%@)&qNu23RiJDN&SO86L+UA!1 zS$yEfZM!CQswYj-<^?F1#h#kM;qp|l zS+tW zZc)=s-2h#ylja4qceEXTK2;FAP3!QdrP}owWr#%njGXPwf-0U(A(o}96P|H=`E^Hn z?t)TIOKlszCTtziH%(VUi<43FHocUs7?C~>O{i!~H_qBgO??L=uKF;iA+53V#kt`= zZS`u#ZMxT$&GDXOJ50eTDQB@v>3cQ`{lq;lXOOw~t$HA}fX~h?`nYe!`&WX==@ndk z1s(I|`@42!)%RMPSRa!)lhRWHEHg&Wu089?m8Y~}d3~|LbY5XR8I^(wO+8%8w@D<@ zPWcQQ42U`BwA+3%8-tZkEC6$%#^~7P>rUL?{ILHG^f)^4?WOemA^lp^?aYj(8(K>+ zgnwud9JO%!W$NE+wu=kpgxYD?*o-Q&R(JCv3V%0nw(JhP(lnk+GA5P#_kRH|nHBd+ zbA3fx3q;|_zcrd>$p%` zZ4zA5Wsr%an$WcN&^LD>SkbdFJ+)No9$_ zCltQYRl!zBIB`IQg<38*YG^7^%nQu8>ZgjQiW-2hwFy^3{;1V|y+)v?u6yRQUGZa} zZB2W-G(5F_wr3cybZUYGb9iia_S^+8o&&jS$epe#a~qKJtl}#guM_SV(l$+by46(@ z!yn^-jpH1dOnnieN`_XvE7;Vd;T_R*PNAD@^W+v|)1Gqu_T12Kzfq{$TgDR` z=?0wuxR@5Z#jmPe+%?D5b~MQDdr`OcVY z1+{XVa_Y<0WmH>?SEZ-qbqASXLfRs|3ZrXv(f)_ED;8jrIiHC@)!f+Twv6d8JfU0$W0de!67^JH$Kt}&HaC{YBn6&V#}Zco zkl@|M-bnlAqyt%J@bOiRmB5_<=KDp6XdCHX_`g2hYm723>(@e(+;rowtnc8IO|zwz zE=9zle{p~s8Ie}w=(_BpO2JP*vSC_kWio4E`G!V;VcOecj$B^V^J|^es13F|_C_uh z3n;+tSzgz$$bH|WXQM{S^ZGpjx{qi=8)Vz+*GY}5J~rbRiA+A=!#~@UQ~b0&NeTUt zP3K@cDw903Y@A6UT%FkTYZ!+$L6yR5HtXA;S};^G*n>HCeeFvR<+#Mc3P&QlC##N! 
z%}V2=kAE@r|Cvoe>8c)xIwXh%dbmZ}mELZCGOvmzTe-KRN^N<5SQJt@D^{4@J)XTw zl_r>`V!fis`@TbiespzA)vF4wq$ez2Hi7+mMCk^cz*krVKSALlSl%jS6aBYHK!B9K z9M!24btP_#nefIA$eusW@H!vKqGu)p+t6M(3ZPMz9jsW(kSzOf{ZBG)`^YfTh{MLN zT55`CVw}097@?J!{I7?N@6dtsM8Y zaE@+p5sHdk&USgWwYm6N;_t>ZSK%Bbt($_TDOqO|YRDrPnZVWI^E(Y z-XCw%q;(|*fnldaD(%?5Kl#xj`kkV};I_7ys<`L(3&oDn|(myt>LCy4R%gd)%fSe|L z70POgkE2SCGWALCSa&9}d|EBZ(y-!H=k2Y}*z&jDOmvu^uM+L3JY1tjP@6t%6y$A~ zTqQkyBp)HO4g-dI@$)WI?E$W{l#cA*v`Y#-_wN5FxELWDlsXbhd9FoO$}aG)l8}o(?P;EQ{NcAd z8k%~-8YZars?5gF-*#sc9Y!7czUE88zzi=hTVq7I=)-%eeM{LCgigxl-NYo$jxcukDi{@PG}gM z>Tl%EQ7ZCcJF$MKEleG&P0l(}ZU5=z@@^t}KJmMrhGc*Ub7gDD4G|lmRXdz5#eY0F zn%+}XuPl*bDMNN254K>u7B8;d|0#Kn)t0;~j`5~G3Q=W7=JKD39?P1}8TyK>rj8Jh zWB&-5-BNppJDzWY8xeQ#^Pycp7nDsBVxDo`=z#QGdDS*cnfxw43GG5(tfUdiQ3~vo z;ANQgiHKaP*vx}9&T8$cf>U{d_i#gwMMvFKJtrW_;ECbMCebkuy^DTC-)@>;jhTV~ zw&Z50M1wxh%gOHcseETPh=9A_#wGF0Zrxr8kK%uv>)mW^B6T?J*m_+xlAj+-4?`

*PoYLnAqsBxQjKaZEn0~lHd7O;CdBhC3IcizosgC}b(A2}8r=VkcG1C)WhpLo4{>047h{GY(fPRjQF zNuP~+B5@IgVrO;CvQEhwU*2@|rvq#sxW!mlRXjIc%cv6R+f*_w$D()xG3L_Ea|OTB z74~~A;C8D>FO2%vW1>HJzCflgK_`1J1cCgvAT4{E3n5MEWf@o|)CE=Q-@pw|5p za>Yjl1KLzv;pEHuU9M%RD)(&3bC77-=ii8g!t)a!xF3^L-U_< z?G|T2Zz?FHJTp+z8WqLRVLVGbz1uTh-g50m#+#cM@7{n8It~eiw;gw_56(y`5`&rQ z+BgcPRvUJObfXGU!SIVk({K@(9210MOu>g4Q^wVcrmjfHrFtd@CvMw_73MKum_1iWwwbJ;QhUHv@Y5{~O zagHoIo#fuXi0Zl3@1RLDSpV<@TOe7SF1En4;Nb<0tuU&GjGAn7+8c;*Gc{ik{=Q=~tAiVnX&1WpL7O31?uGmhO;&jxLV=|7GJ_dMWj&T80 zH5bHxy)3Ik)Ordp^)tP8SoEkG-)>ht7TWN&5Wl-H($xpCz#x<<6sB$T!kLk>fV z@kP*zLRq46!%2=ov%$Mc)4qSbR>9%J^N4v*&K0?|F33j1Sc}%xy+Vn3_rc95(+86y zg@Sf1U$%@C)LK%190ckDIsIb5^ZmmEZvsS%GkF#nKSBD0g z&xG*6Cb22+x76RvOBEBZaoy1=vp34;Y<3#WCf%2+>|}EdYE>Ax**$`}5hLCSuHT+C}tG z4ro1Se5PaG^6RlR$>*eyQKfJ!QyXjnhY>vYhxjR_E`Qm-b3CpnzNa{_nSZOx5WiK& zPv*&-RTFKi;0pVY*;3utafmyME#soE9@^?3j}dsnwp>jh?M-xi&O%+-ikDc}^RQol zI`=MksFEog_xZvcPj&sd#tTjxd-Rwy2j}h3WyK;l=QmPj2uiYY z9h+|aAdH}%td*V{$v5cvn{Ax_Q<57`CGr*XpVs|K#sd9lyl?Ld#k^z!lrec`klH-c zM^Nz-24}DqltC+ABGwY++zUF{rP2Z(uQp>MWpb$xx@EJmD76 zr7wGAaifz0cEpoc&Fw_K$uOuKj?$dh(0kHC%Q4-HuF%dARn=nGK};3?szy^*JKnWp z@le)2dj3L7l5gwx$Avti;Hifzai`{9OuDs$BsUwBopF=Kqz2`+3_;^JbdKbbGhWBn zX2nlzKJ&uD9F40Tl)Xzu}57e2E~ zFt)8U8-r)p3of){C!P-ZPg-F1#soSJ>$#TY^gIvFYMaEIseZF}v{DLIP`FwBrjVPdJK5J-i< z6~Wi0kXB=Q!kqY0UY61&=)e8K_(o5zDT9&vtc7^p>>YbejTqXl&-K7<&hU$$Hi$fY zEPf9B$YVCQxzV<4D6AEw=lfLZ(HJ~u(Y1VY%J1(PVbz*np!_px5K+c?`iG{`1qaU^ zji;%1JUOX$VQl06%`RxOTaC`EA%fN~VGKinG%C#0cTe{bG;o4QNAt{}?>P7I)tRC+ z#~rs=i!AWTnp4@9>yIB9fBIhYQs@ZcK`a+3tMH&_*1pyTn66$#0bxj=RRDhQv=qVn ztXBAx#Ri)P5y+#Vws~Pt>*;VJ!H?S_z91Io#CDF(&}~<`pNGeCHF7;=ePlmImU!m% z|8{GBGXDbOHA6*cexDASLRXw;1|sInnCyE&;IMfOgQ7tg#shW%(BBdJbWS^Ewpo4p znT=7lQ|7@KV%@}LzJN+lJ$j`MsF%YxAUks0ddqvfimQj=>eg+{a&M6cESlpqcD#7z z5);-_cRah{w8GEqEkF_y>!8xs?prYK;E?IV*$(NdM#?~qqWR;sD(2{vV1=*{=0k^ zeNG8ZK^-8J??3*=B<}x;t{B@+(Hl$>1IggME}Ax5k#Vo=bu>51&@y~9n*zg^`Key} zg*EpPTJiO#StoM8Aqjx!EZ~-wHR_LhXo2 
zDc6Lr01$amRM>hg58x{tDLu2!Jh$_9X(R=pn{7sp!KKglMluC&ykKwTy0xBy_Oiw0 z9d2~ecDptKv;4kTFG~3J_gnf%gEsMeDaJ+A{@2G3(05EocVXUtPTRJ{h~6U2x@XRf zR{~RNXZFf@Rnhi8GVzFdjcxWrhpv3aLj}_x+gm!-F*~G zK=_Obd=ITewUDfOa~i`aruha}cgc$VCWA6hLxyPGC}_FrxRR0|F&`R68Ek^-r#=sQ zK6%oL*P|$Z3Z3z|K}im3lPi(ie8mr+mRS1AXpiy=c1XgK`02K4z>M5mw{C=-b6n*PY}-gQY?BMw?qgz~&CjI6g-+M=B^Iv<@~+SNg|O^zx?nHj@N=sk9=oRPZ>n+(g8484^--0 zE^&?s^8>4Q+W#*c!5aAg<&=+1IF~mkMjV1&g3@s%;&8drM?{d^pwT1Q!N?KUMWNkw z7a;`cvI_7l)t%IL%O~_s=@|M7TTuv$Y67x4feS8FmS_cv>Iki41K0Cd)6~4ZQeXnL z?3JuNc48x2D9LMkDa1wT(Rm+qw$k+9_m>rpeWt`3s$P>wRU%fXNvd~GBjSWwRqXG> ztyM&tcLrtYQ->NQ$jkEG#Lq=h=joBQdI2=3r<7apNFGxzdoiCiZk#?BgC1Foa2{I5 z&4kt_tp7N8^cSQnCEW5;Hu~*`d}cQIR$R@436cL)w*ZZC%D9WFG^E!>i1ZIHTD& zdyhS_aQ2gXC?dwpm>hudN#crACibc`Vr9(?aHTar9 zm^405`oJ0>OaY-5G7VO954nPNDuD+co2O=AZE}j@8VxYo_cRi=^-*??``5$WkchZ!y$Cy3%kd_=|z(zrZS+(lUB+J$>iF{I-a#979w&jDs~!PJP22wXLxK28_t+0?PRUP(SzHeCxAB*{ zuaz3FAwySn<%PY@5^KHk%$eq1zQ#h{jFmqM_v7@Ll4ahbajjk7Ze2=39v8l2^JDd2! zo*+`Yd}d1!RJFdSsvjv@oc2ke*SCJW%1qqRlyoo}KrrrcR05UDquG;eL~;Tw&awY;ZNN zbXGd-%z)OGyRZrnWwwdIJ!av=?36KV)AH#+w9zx2LEBVLzxJ+JRO09;PIj|ySz0L?qjV<@k1z3OK^FM( zl+K{n4sNr>TGi_bA4We&F9;}M-C~1_s@t-2*6OzFRdmF9BW|T%$&`G>CV^;>!QMOh zjl>q|={M`>R;`v(g?Cz!c3}xd@^kuT@6eW*aS6R-lsj+LVrcVFSk}5jBkP7i9`cse ziVu-e&k+ObOyWYH(a>#T>^eeyy~nF*=^jxnp8U54Qr$!l&xM7l5)czW&iCZx^2_cb z!S=G5U8ENar~^P?ap|%}BtA5-Eq}?O82J@Q&swimxo_ndKRT^bfMS;3i-CDwKVylO zGP_uUF;PV>iq@4-jd*@$-UUo3oYO|H7<`RM&c|q_&N1UEWIc`6Ns85)AVLrAbVlz| zEo<~U#QL*XjK4JFgR{Gx%>m_}f=Rkn!iUrRgyWHYV+lW$BPJh4noAahb2uGH8I0o= z+*SR(+K0Mp?oYLwGV+qVkwn-1gLGBRgvjgrAp0RmVQV`m)ewRSNCX5hPj9|(i4aYR zvR{LW*_y|EIaAAU4?8;X@npsx99_G=L8NINLYy67jeyfS-Y=_$oqHtu(CzRxxB^L4 zA&6;GwK{?dKGVRX^rpR}$L=@o($MWeFwAdIeTBYINHiiY&$JNDd~QvFU%ITb_(g+A zS2=fU&n+wyn7WO_w~*CfxO5NPkN>A_jNi&QbO3fiK^9YF=62(@(JBkqgE>?Sp9nZ5 z+o_NL?_`!OEL$&A_;mDdnoT3)oi1u2u?xlbnQ7e}$e>!i=XC5*mM>+QHd`Xf(#BB@ z-rh>cn!QWyG5s>`u7wa@Z`WNf_o`4g%8lBq5wz1h7{8tQ#A*L$3Tg8y&RkmK2&N~a zbWqzzHJ+F#K8-2Q@+qz7=sc4$3Zhxi1*mD>AoaTWMUF#>z@wDH(hkKDdC=)C7~OsJ 
z8SQ+VZO8O`QSJ6s7)1~A6YG9{2336iekqhUP-tA!wW*a;kYW+<)@?1nIGFiyZ}5+7 zpV>LmJoObD>iww?Ul7`wk@z4dHtoZN^Dggib$K+$*0Ku4*R-bT58`;3;QQcfutoGY zfhMA&DbH+YX_n!l+^x$!$P_lH;#ued%umfN5D5rn$ii(_ClMyn@0XaT8*fMZaj0(W zuU2RLY=aB{YS4MCN*(ttC~FV~4L?&q7%rG_S*FE&FDA2(>L-42H>3@_3_F=*HJ;^e zTR_0%p5V7y%1;0DEYpf-u8Pn8$}lZ%C%{_~@R~fD1SA&IjGQ9K1YRsbhp&}+m7X7n z96ul0^4ojw(TA4LC--{aS?qCJh_jq6nfuAFbwqcgj%FWfetAf34G5@iMPUQAT=ms; z<9lQSMp^Xj4a_VW3z zbG5}Npo(G0XLu@3dOjg`)&&C-bvUrkD+q8Fx6}nmpogvlA$PVypzTJpULQmid_u}Y z_{1^Wj!Kc4#u>uOQigz&sSvQUYHE?vu^=KEzqfmsh$B9VkZY_B;@Eo14v8X;NZ}Ha z-F*$_zHjnsJ-&XGS#&LWq^Vmw=Gg7{TmU`PLx3_bgjG{PzYZ?1^;<*{MuYcmrX}IR3wg#WJ znT5``oc?2AuyC{P!5pP+N3H2DjnUDqJ3Rd@nKpxtNbDkIQXMxooBQx)bG9wu`JF_s zpp=jFl=00YeSR!_u~OC@CNxjmvX{^uaGO4pO!EmaCCVFIKsvuHm!GxqU4!(Wk1BPf zex=x=)2>=;kF8`n1gEj4NTIOS8oij3YU;V^*a7txIB-3EhWbMGf^Js6ilKz`Oszke z6geG8kT#wGBK&s4V-n5h6r% zWy78PQZ}-t_#F2y)D&8q&zc@PV3aK8P0e#QFV^3osNDW2h335PW`U?q6}RL?BIPVy zwwtkh;~NhWn+YqQ@PXxsI3leLOMYX>l~LN0yn^pHPCQL>-jk!0oRXUR-8<+Pe^V*r zdt!JI>nExIPQu-XFF*b=r2zM^9(X|j#hg&&-3jS46@=PN2{+x@on!CHHLHJ3*&w0W zP9xKoG#I9tQ5Uy)3)}OqShbWyBh>!WQ}&?ur0rQS z2VAStquBc{`*U>MqE6CkHu^7&rx9;Y5!gA&XA^k2g-UxYGt+u4OI9h6-KZ)hf zSudOclc<}d(z(lD@)6C|pY3JWuQ6($eM~>p9o2|g6x$4C^m3-)1OgFsPWPR28 zvqGf;#_+FBxSHf04_DZ#*LIYi=cSB!!MR(*C@=^#a;?@CLpT9dnsUn>O?kr{yzfV6 zhMuc_%X&$fBl9{Dk!^nicqo8Ifn_yw$RGs272N}yb%-E2h^-pU<|CRLsDirtD!9|G zHAyaJrMd8Kx2_a;nl^v5kltj_=@xB}RDXPQUHHN{Fz!ileYukRQ;I3f!vz<=esl_` zL8kn*B{fE*&9Y2(kW{f%Dk%(C{U^$aaHb(_=2Tlr7F~3l;TP0db?>)Y=;qFsK%qJg zO?Uo(68qmJw8#wc5{xU?%ekt_G(575BoqQk2=m`Jm2jR3VSVQcA&5z5ZEWT{7M)8p zA0471ED_aBuV3hdFzFGg9rIY^0!-e}Y`XTlhjH8*8SUAIl^WT!5_j3XIhsd&!k|D) zetnRx%M}9oSjYqO8fl{8PGHyywPE`AK^PDejSL^}t?=ef{gOEU!S3G0F^;7m;v)9> zh=N?a0(DIr?toV-x~B>gq#29a;wZ~QGgsy;!bpo+u#!=vc?15i1Iz7=H+O67-|(4< zV23;p;^pI81LPu9opWeiP0g;`SXTmzXvrFA6I+zNdGBT7p{aeLHm7Z+vyR=rp!RLY z8y&f7^-4`SS#|L!+SdIW&a+JE++T%mI9z8N+XUdD!vh_lm_C|qJreBj#?N5mn{`An zY_{ox>-#NNE*X|%>50066ylo+k~`lv?N|7AV$xhEl6b%F$f_^5Hi_1OS|q*a4hPoQ 
z4Dv+qm7elR{RC9JisZ*DrhlIMQ~D*k?+N?@rIGx@{<-zt{x;cd&5#&euF{(@uU8UP z!e7?USU$^7rgrl)bY=i@V7e(}`ag9Lx?oV;#uV*Ou7s{?j_ z7Jn{zXaT>G*ywdrjmNQB&(7!wv%#MVO*$HtIkbnLyw!nR=om|r&ZHjtv@eN3>Z4suCqxlTB0muTXZnCFFa*mJ~Kzyc&&mh6X~ z#F}L@lUz0l-?oa^=hPJcKsOx&Sd^e5f>|e$lG9HuMI-iv?%Vcqw6dOA@AlFCApPX8 z*%OidFqqCb9|9=*63u{G4ci-e#?MS8n|j7kcgPb@9Y0^z$j4#%^BxG?={i-}z ztw<`S$dL$6KVOetTnItjetnbakxs8NagOu7z*9m^Z!QK*3{pht_nRt>+Uq{vtiZ7? zL4H~$q?uL_WsV*l^Vbhk;$UAhzIP-tY!Tc|bQXOOSbj3w8o8A;3oNaSJ2@SW(1l~K zC(C;lLsj*ztAU0!L8hsJH2Xu*n|O8KBr^d3Cd?~JPkxu+OE2fhE7>%qU%7*{b+-H> z`=|s;)1N!Jw~)@uSIwbIhgx$xsAW<$DU+yCe|)RildamDnrG3HC=#J`P{^(l&Y{8F zRYAMnM@+LZUenku-vxxFYLfBh_0e|BjA>4h+BP)hOp-i)N4y7$8txfr$HjWn4KQLjZxi#QIL++q-sN9Cc-@=Rz} zLXJ_Ny6PCGEcK`}Iw>V{epgYtKSUdM<{4%P#JQJ$X7cj4=61brZG%QEZjwv3GT(ME z$ZI$A>O+B#A*8uHf8emjU8nDu{zzz4jj+_Ywt?qq1@lTI(3>u=M)*KIEoV4Qbr<6jxCZXWyn%;dOn}$&GWu_QJTy>GxzT;R;RflH9!5<0AKDnL=;VHOe3G zEVEErxlRSt~C3WfbtQ#Fq~91P%1I zdfigAERdQL4olrYlx%$cXha3;%V|oCpJMp9%Yt~;mk?;?7$3q2f6_*uNvJL*{{cRX)#toFpg1OUCWST#H~ycms-31%Gp<*s{cU;4Y^MAo+L0Q!_#L< ziWVURf7iyMu0nikfMEqdn(f&n=__k{ua~PT-E=m)Z?oHA%P9%Oswc@?N?4^vn-kt0 zl~ff_paU357T*<7^7s#uM=T+XHp)&4zAM%wfjkt+cYU&jv_Mf`n?H#AGNESI2~H77 zv57Y@G28Ou(H}9i&9bdi!VdPUyVdm1H&`v|v7YD4>t3MyptH^kTFv%J$vVk7ZNgDR z4vEe3H;PzO>+fb+cm@P;>9SP0&TPXguIEVps5+Engc8>G|PIYMW*Qr9G_A~<>H*PhE?70swLy zSkQNMp4%4?d0!$ID`%y+Zm!KRfQN(4R+dG} z?^`$iuJ3k}aP|)5N88(HT7Ugz_bNdt0Ey2b89}$XtKAXM;m!PzXS;Nd@mb4=zYT26 zBUTYVL0A(LyacE5SXA{@Td~O$zz@kc_C4h)*T_N9rt6$?iB_E{7p}~-_+3h8%MiR{ zl5MAXwl9M^!DXjSf*qwxfC6?5l z6 zlvdapv$%mE;&DrQbsvcR8k6N8iZz7^yRCR_RM-hi-9RQrUPp;Y*%>j5%Y&&fSE)5s7u#Vs9iZA}Xa{czZ z*Y-s39**wxvCm7DSyTrWrXoeXgOK;v@!Doki`34d$*O>F@cIoC*j$fspf%UPw;m!; z0w({2gGT|VHUregJ=av!$f$4IIDD56kscPmNX}pmJT-u*5k^|5>S{ zv`u>J>%B}An)buBF#}3}yZ5q~FNhiqLGeUdR2=o>3YvnfSB2k_R&+fvUrdhbXe{Ap zU@jPzxbc#apKHu(>psw`AVt)yTsC-~{R{@XH52#lI|r<7OIDqw4+;?!BeLm!3Mv^N z0fxA>?W|r%9N~D>RNJ13vHp0pEIIpc9NH@b*GjoYUm3bt(^f&0E=)j^oityM{x#c10yPU^p&`S<<74}R8YAtV!O8r;C 
zaU0SJdBr18y^xkYu&5)BHbU9ZO8@i%hQK96+Rh^6Kf<+ekCYsSJnFJv)tQZepX1cW z%ZjC$G2u@Lt~qHoI)eH#EKlHgOss8?*+K!{9&ZYYCP3Sj^CyfEUJsafs3|J=I;>9| ziCB%kT6bvQ=owL5sC4RUj^~Q;h5#i~9&(^{CjdpHCFxr|Cuh5*oTT9dNJ_19^&?vB zfpKsLaS=U3#VB>>GKzcWW`y#ZfjqeY_iW|v+l17p_pjK}1F;?e`7}X&Qzp zmvu^oED=&iFP5URyy?^X9jsq+iX3Jyj+V3+2fFoxM+k#2yQoNZ}d2SJyoP zz;=eF2K{2gl&(?!ed`S`rEzD~r76I9wvR7v;&{6QSZIM&v=sy_ z)Ap$i4;AAhqc2qics7CddRy(rJzy{1m4APuUAa$`z%W5m$!w5wBR#euJmO(cG8xu> zGhmvR(Ve_I6WQm)Y`Y$a0Xgj>PXZ4p(l}LVZ(ebbIp?#G72iEYEf*WrI~469I=I8+ z43fgf8Fvawoai?S_CyYtM^h!>&LNu^ikh~1q*(R^!})eTp$(1xHfvPhiLES#fO+Ib z_*cRX5H&!Hqrd~Y&jRDp8t$3nOYRZTDt7nZOLpkp+qG?)#~>U!{ecP13JLY4)50w~ z2j+6*CV2mUR^f4H(|c%)djc$mlGn}SX4)6jpn4A+->bMRlUt~jddpm7o+)w=(y5X&itaNV zwZ(Np0*l3Bjs*fLqM)6jHhd4;Kp}nF5u-;u=bIZ7dORJuw(3SmP?Ch~Vn}rs;CCD3 zZMTc`Db618fME5gm4Z6N#^PoETgSQryLQ%U9y++SMs?8;V8Nsgj=qr6g&kW)SYE$x zfYje2CVf^pxnJv9*}B{%?`b0ld>W=sgB4>5R_;!k`cHKqD6P*h{)Ux}r*6Y%Z!pg{rlYZ4{ADhb^MXHZ>4F z9hhOsmuLsrrLvX}i2L`Us{EAvx|o8K1nsCM1n97`LJLzSROr&66v~n1M$K)JdmH;C zjo`aX!`I0}1yi;fOiLR}m*UAOj8_6lf=06h)JpAnH6T1Fvg!zc)%!didIn?VKEH=4 z#9j+Sj0S^A;aq9owY2n(lZ5T1Z_k@Cbe2}0BSx_{w4KI zc^wBj_XmOxWau7{Xp}k~<}G8DuJC^VrIuaL6@6oH0gfDl-ItiYk$vefn8xdAG4j-v z5U#R;Y9t96=RJc`f|14rLqNCJjHz!5Z=(+3!fDOXgowX;4L*L zUU>IMC_FP^#mJ%A3I&1vu)+G{-Znre)e^O2G-UPl-W)x!15>wuT~@gOJ+Lz8BBIcV zT{3M7txQR0?JkEIIi2q5%DC$3h5V^O3s{7#SluchJW_|2I?t4!z;RYFjt)*Sy`v-E8Eq2RpBULx))` zJV$fn;M(Fs_)?k0iwAlM^a&tdQxJerY#V?A%KdWyBJ+dSIyO~~oKat766;zTm=(9a zlrZvkb35i;XA^fX_s8 zv0>wL?_O?!YYrL!(Ly8Pj-MMga^!9vI>TwO)+C&6LY!$QMg59 zl_}!XpQ+O0D+N~tBV%B5AQ}Y;e;Zl}zhgYn7s~uB7yA_^Sv;o2^l;HFMXuf}`rg=N zhv>svR}gX5hi`eKg*8sS;Dr_D`yTdX3lqesJb+vp1WkI_I}+8Eg#L<}J`)-#wd#Oh zzUu-R17Z6R=_2U~pUH9u?1ITDv0<5*|G2|Q>LMod`C57$iz?pb4hY0!LGN@G1F5L9 zH*){-pwR}zc$zxG)k`CLKzTQPE8uBWdf%q->uFJUa^5$y{R+g=78yMajM)mK`K>-z zbb=RmFetK}vU9Ai)KMlzNq;H(uh+r;Fl=7AECA_ivHuV`Q&=QVzlv8iGVF>;4Fej` znWI}HvKO>Dbf4N|sX-u_eAN53FB^j9;Iw_2xTi~;JkVx7t!rgWh+Iy_)+PM)wS2;1Y*P62}c!jwNcGKv@0U}V9`?}^Wtcy1Y~;{ 
zl(}w)^M9|NU;t8*h5WUUE$wV`pat=Lp0wMy;cl86rfYHk3V6850Xk-il&Ow@bsyNc zVD}Wg&vOKB+=keoQql;%yI$5WMHB>M;pv&j)Gm$+Z{zIG_~#I7#s;Mbh3-WO{*DxS zN=I_~TmR>mmRG}&lxt{M!T14s@+5?ik!Ng(>%o~F-b{cpzyZd8$?fkwI6TvBHVkPC z=4@q5k23*%b=_(s9CaHixGZogXS9r}!*tF;jgJdKnAp<8KZ4vzzNihJ2~z=)c~j_o zC*Yn}aglNA2UuD~TVMe2HB+2X5brGpl5%W>t>V6kbOcftUc0u3e>{P(=^~bhnc6W^ z)F2^aYh}>^S^HXt3*Z|{AoKqAXqerMbM{}|W)N`MZ#ToB_aKmLCYoybU|kV3qdi1V zN~R?L-Sn=0cWA(CL+bH93ErVx+8sVa52uvjBnp`R|8MaMxJp>AH-7o1Z5P-%pKp!B zw{@IpKi~^i)sKEeuenUDhBP`CNLryV0W^dSw`Ma8?gL&O`yJ@$NgcC-dBD~${_jr# z={FHIDKAG40Z)<*WTFZf`75nCzDSKC){AzcAV~TC9gDz6f~{^od+qGE7tpg43FFT= z6%bgz0IiDeK-DvR2NA4hP$GPq;qg(cO6DAbiGKa?dNZiYD1Zs3KSO4Vs|9-9ZBSf$ zC4h-!AOC}<9z5bcz(05Jc5ZRFypE6mBb5al#m15$9O;4PYMj}7eh=CCB5W^R&;s^k zHOLX!kTO&}mz>y(LL0FQpP-`2LPrF5v{xJ$$K_ChDOAqtM&X=vEcx+UEXBzps1s=h zZi01yHE`v|5M{f4^3Q21I{7&psu&?3~2JOQ4|#KBoZ4E)|gek8%e970d@ zHB<+qu@7y!&KytR-c3qg9x}sOJ_;S*FA?#_3ocp&)l(L5S{o%n{ecdS%%l<=MFH{# zIeQT6^#d}n4dOw;_j9k7r0h-Qe|Gr$QwC^=`%_Q;VT$MJE>>fx(;NoWT@BR7dmhrx3!_kG#%7F3@Z=wHw-H|$#(xI?zW8q6 z(D08>%>LZh_v0Dj|Nl<;A8$GSv%5|I&jbH>7w~^oFZ1VN1%93cSPnl=!q1a18x?-8 ogr6(n=SrA81OLyJkg<*zO6+8JKfhA>3J9#JBf$6aPX^aW5w~=eQVJ>GIpNQc@@J%vZ+KkCuq}c|`|CmQkNiJhBJXZx5dD9C1B(3r z`NJrbi1>g1`hQuT|1Qh_vMB%EE&uDH{C8RYyDa}>1^!z+{>O6s^TmIc<)4-KA0Pj_ zEdOIU{`um+%ks}k{Ev_SKVFuP*G5|Z0jiz`4yzNw`HpK(@^!!9<(u|0=0B;!>sOidDQlq=g5Winrsx` z5tcRHb?4Wn_Wil8M3MuWf3I@h3;n|#T(rmeN|Jy6m?qatmEEX~qk8|BUl@nUyRdr> zS%djTvLj=;YsB5*T+L&| z4OA`}M2_@IDZxaE%&J+RQ`Pg6QWX;4G)3{>dr6E!C2+L5`4UrGt{An1-u`MnUd6w> zb!Vu+L?1)Wd@*DT{m;9Bk2ObEQK$ZV?a4e}Jnqgu?!K3~v(`XuA$R}cvmcFmeiHX= zXLZ{I_RJ|7-_qaGC}8tA+(|Ynr;Bo|wOtzOO(+Y|aUZPs^_@xnVDWYXrG9LS;(+wGw(+u8?dL1V) zujVuDp=Y-oDM_ifx$^RUz~#Y}M7UWqCDVcI>Remcg!^31EdBgfL9}l}wpt`>DZ;j~V6H`&< z2K4#GO9Y*N8 z$NT#R2Jdc%T>R(W8suK>%&yw~?)s~V&ag91{N%~SZP<)U=^jUiMTGnBq*-#0j4w4w z*L}No-=9rY)5b@NY}{>2E6#y7wQ{YVVx|-q#ojcV-!1_6sz>U#B~8FCYXHAkqrrl$jLc#7@3vwqG7Gc`h$-xUCQ8a5r>Jx zfd0$~ww<$I%Rx)QZOqnswu5$WsM@2Ne@0trz^+S*ZZMdr(v~G{0=DDDe3vcD^||Ee 
zaECFc2J?}U+XbR=HhlVH2_g#=mL*?4W7~L6=p94hC2}U<38KE)TUob9@pB2$G3~(* zwTKe)f4H-*iawMeargOMVpw?ied3v*GV#TZl@D#a3(R|~@vzzi6(N5X%e$uf-HAV+ z)12z)emCy2!jEjn_0_M$bl3OiB_9`=-)61ajw8BwiA~pYRN{K}t@0RJJ>q7^qvGa@#VQ86qT7?RUOU?+K#bAU!3O>Lt4u07S5NbqEOez8sA=fw9uE? z`d(JxlIeEuTa`?Wr^wOiF0y!h&*%h-W{h+rc3dm-EjSlElL9 z<+{Ra%O=aWCMM!17Ia^BMft1>t}U_^4kk(EJ?=N^`G#SfW_=0Y)?&G9hxd2ZMYnHy z_9SYqL z8zG^7tW}{dBY$765Qg?Yg^*(d7BJ$mRb}sp<8*)_RHK`Nw;mU@Xxh|~ah;Z~6Od%t zmAUVKQBUDZj^5Yvm8YaE;MX&B(xIA=vz8Ug!Cuo+ZQygg=fWe zb~8rH9q#Ja`=7Kc=Pr9rKs@Bxu(>oGVHD-$8$*N=`ji8`>mJ?COoY|Lyxqn8_J{ph zT8cN>E~A1!lZs@d?+=6h zDS5hle;?1?T`C@5)L2SQj=HLJ{_o3_Ja$6&pY9o)bd*x8m@LCUqGs#Z*yVZb_`_`N z%0Jy&ellU`@GZaOTxmFs*2d&joE`*nDKOdi&Esf{X9xpO3Y5 zE)EqokJxrUni;mLe$=@6w$yEJGpu;bxtZK5RgeHhqh+F%yqaF;cOgQF@VAO_Q&*xi zNhK4?lQiFAn5#QmN3gs;Lcq;A&VfKuJ>RIki1rt(fAtRRk@J=Jq@)^tLWjBC#6spy z?9=0%Qd8q~y!E51wn>rs;Jew;KbLd%*fF(#c5Xu0m*)f#&8*cU89LQt35@)cJK<(o zlML-r7g-9S)x2-1Iy&4FTM_AsxaVLQ=WtBY>C^=lju0*Tu$}dG!Q)OImDz8e6Mn-Dj1Jc<_O+)c#*|$IGpoA&oih)hK`x4z2*EZ$=`dY#JF6yR}*vy9)9M&-C6F% z!2unyyncUgr5ah8zh8k@8#)jF6nE(AQP7=-f3J)SvaHzcuQfy{%JTR2q{x5$z@S)< zpqUSVYW`}NZIQ!=r5;D_`DXnr`B3MP1K^_hqT}M5E&&bMfaeLeA_npLo`??-0geUaAc%$w251_<9%F-<5Zv1=A zh2H(U4y?g%4L6E*<4W2}0gwfo;kij~+Q(G6SZ|}f#ligHI2p9Z2PV}v4)cN4YL8@* zu3Y_waI5NrKJs<3^`Jhou1?t9>y6ifwhz?}DpAbn>`a|%fzI-`zm$iie((aCzAjbA8tA=@_PaQ6-D0d_sx!0bg3q`<&fZJ~9+iMcx z7j#@ZnKcR;O(-sRlRe-uXbeuc72N!t)IBYR9c_t11<#K;;jXZ|?>p)U#_vc)a9JSL zR5yxz`tI$_gitWNzo-4C!~)wi1egC_ zP{}+0syO&nyYuOAdjt75)w)v^>98BUsu6(YzF9<*Hri|s8V&c?2T-(@uT+@{9Q;;9 zHm}YKvhfB*ddP2Se6tuoTUm<5g)6744A$futRHNR(>Y&w7ZtT!Wzh6aYz03sPBmNm z#K1*;07g&0f4W6cy#52~tUSYyPP?Gkw2GaW$>*fN83f)^MRuIISDDA)Vy z8wS-|AoWa`w}t!mWbbVB>sW-wYwsR)zR*?9)*i+G zd<+%*hNIYqsI`w$56<&v)mciJ(sTbcG}Z_B~5zyi^P zQ3=Gz(YD&XU)qxCd~u2Pz@Yh4>tSl4((*{@BQHGk!`J6;JOs3Glsvk-v0xGUk!p2f zOM4y`+4AyT%g?s`e|lB$$Ajv7&;KkrispvJP(c&E+8jfPyAsqp*WaBINpKRqUg}(t z`9cHW>xI)w<(8|Dx&>5IUvC;mr}Y^-XA#G|Fm9VTqHJLUfS>DpVvILavd7 zPf5?N8p5yHAv&v|+XtUt?%uorgci;guBQmv_iZ1YE6V`K3%$;;z%9@h%{l9VX*BIh 
zcpXU>aEZNM^I6jkF7DGXYLGAz~{xQIvL38A6+q_KO30^+D9p zwi0q)tDe-Dy0U}KVGBo%(K0)?gT?P3nJgUtSWTWkg#TkJF)RY2Dxy_$FR&UPa|JM6 zg}FPODIb4-24EZ;6a>4&Fof_h>s0Y|9C+UbqPog=dL7p;Teec0&MhMQ+V?T4Pl_(` zhx+y73G8T&s{LQhGryWyw>Ges`vIP2rxf3FT-8ms-}&{E{W$*VSJKw2Sc_JT) zDY%4_JIl5lmmruj>zq$~z7X~0Ww^~)| zBf;lc2Xf#ouEu-Md*b(o+WFS~vtJ5An;Ziid#B^zPdS}(HbJ*(Mg`mKTw`()h^KY&meE*(j)NVOcWD%ioOijNbkn^7X z;;ZeRb%gw6ECaYRoUHYc@nS+9M#GyJLC)WfHGKyC<#d(IXP3tjjerHz8lkGFJ5Rrd zo?g-&o8(jV7hZ$^mt zV9@S@)PG;N(Ed5kf5aOh%w=Gr2t@uo?6A<+iomh!@gQb|L-QQwslfq`C3cO@D+2~U zoFCx*;S;m^6NOlA5if1b=J$D6KSTSKv86#+)rq4YfEa(zbvNt;Lx1n_Ro~$>YNJfH zZw=>UFMakCv&i)9q|k@C?qq@@;_bTudb0c*9}pO{K8dJnbN$)WFG*0r=Yhe_^<)7s z;F{p>L=dB5m8(;75ikrQR}Z)X&#%;ciq-CJ?Z@w!59B0p8~c&p%FB+I4-IKv$v`nE zB)lr!=zAPVTn~Wuv%3}IyTgmd@n87i$>ILGEFui>77H#FbhiNOGk!NHenHD%zAtlO zpJVJe!I=|Pvg+!U&KpQE<{KuW78toc6nN!c%RsLFwW7J6R7K;x_boGRkqd+O>wdt# zCwl7Psnl8FgvH%{0JmYqg#!AMwP}BWi6Y5$w>CU9h8sOktI4?OG81O>aAUqV(!;R! z#YsaD)V!82pipb80NU>rngPO9pj@Toc^1l|RmNSYeESHP>+x|qrk5_4)(V074joT{pj$Z zHMfDv?A>Uj!`NJp<#c_3;ncU+UMsRH_AR#?LV-`Z6j9uM@zep`fTg&v>tik}HuT3(QnCxK7XY2#ZW1}Brg6`u^ME3x5K*7bvO+h@-P*Z2 zSZEf4`zj82Ivvhk?9!<&oG>0P%EdBfYyD3j{6_t+$`%YeCyZ9qm%F$czUyJwou>~O zQ&h8~Oh!s_6^EhFqpJkfM*#~i4bC7S)$al;SCb}XZ2P3bhtFg!5qI13iH@n&bo$k6 z?!5!7vMu1x$c?o@m{Lrsr6{c$p$?qTRxO9opCj7XBoKA+%VYsg&977+*`L{wA&17z z4l=8~X*r%wDgd=M0-DX0jg<{+>d_GNz9}$p?6C1N z4#ZmI^{Tot$nNh?;r~)Uq4{&|Z7yL=(?wd~OpYgj^j7)}XkTZ3WYmL$cEO{w=AK6ipd`?^Aq2Bc^kCsng{y_HW zy$o*?`I5^ZR7WXtOl}mY(FclOG%=jLOb*K{byxhYzr?^=z$pd4DM+GCj-2aMX&CVQ+pO)2$ByIBZ3-tHA-bF*!0B~po%hX)K?Zr8Bc&#rGgNG4HRimK`N{(HmbW>m z3~Kl7FO=NY6Db}ZK!^ss4KBJoIskkZ^-=`ghPtC#>1>y5%|zC_I9@$Y>9Uwxg;%8c z{K&fqFjL7g`;_c@IULJj8s`2dwFr>{s7?1{L9V2}z@&R|!~*sJmnIgJwoMJV(?Lyd zue}7Q8ALSD!Iz_1`fTPKi7W?{fc$=~xXpVS;mK`b>5ayO}&SrNmGJI3ZT!6_(?m@h)8~pRi6^=F( z32E*2px6Gz>`6i8O~Cln&gL`rAU(?dOk&3sx1e2j+pPI@rA&=prF4~64zs?NsPt8< z-ZbS+jZU6Ia6eoE=wC_D`tgP+WoV=+F(5VACBrUY`;rccxEX1rM42{+L&r`$>UyN; z{BcsQQ~{v{vPcQ=9txxHGGU1(l&B*fYGxmZ5$LA2QvnCrfoi*GhkM{wUZ2tyc`pKr 
z+kg$zU@EaZHLRa)zU$7jduOsA7qxSY3N?APwj2Q6#V3VqvH$R&(Pnz|;I_ z)q}tpjc}J#LFJs!eK*1#X4g#@yS6jhGuCzrx@FiAa*-;6TW$$sU)s*d6*ar2^>|+2 zF3Z$aWn2S96_+tCkQm(CfXCc$?)NH4IK%;1DqerOQ9N(fOOU?O6vm!>Wq6}&SBCa_ z8B6bYlEQd+ZdSKip8b+WIZMZo1Xg(?)0vE3t|m*rx7xHSEmtg>v$ro-QWpG{yng+J zvo2cs&J!(AU@U5rIyOPzl`sd6n%uETFeiSH(>JU^;b^lndg<@&>^X4*R~`Q6%%f08 zade#NjUzs921qXln<{-s@-U88pM7^2TP4sB0>2)>|A#s9lCp$bm>P{dgV(og6})5z z5Ebf`$2e#P9h)}>Q;nPb-h@eipg;Hh5S0p8UZl-J**a@#!Lhpd&EQwUQXBak-7hs; zXo=*wA>g-RQtxj%QQhzKg~+;_rb_cQ?+JSw+eT69RAS~ZA2+yJX5H(GVvD4{(CPjBNR zZ#x;=7O0m)aa(>YWsRDSlRhUGs5jgS%?G>kM@eM)^vdj)E;sItdmJ6ibV;)Y4BXdq z)bThd)Eb|^yefK(Wn-bgczqvnR6H{bV%MMiXnecLrx2ZPeL_`ZVICU^`u`?x=aFDI zl+O`%wp~DdcT?|Ao?~%8+%ei4r90*GjOm96mZy^AsCiKqeMVe*`biKK|J5R=+1QtN zi0YMu(%j|~!M4NBKyf~Z@9tVodtjD2@uxr^?ygTQm z=O(0Wd!WM(RrM}Z`OdsI&8A^%ZPs?*d8$}oERO|)Far9)b&C~;5&L0_s7czm1yS0w z!g5DlF~Y}PqQv83@?MkB4XxZr+4;Q7z=}EU-^02PdmiM5z-_AVJ(IviabhCwq07Th zm-s>HNuHPn8Is{cu!xosD3*EsKY=Rq??tx+4u=^L~&)iwY;vcNvnoZ`qBtEWlaUs`nIcWC?jGQYQhY^0Dj`ZN)xF1Jg zV$AF5 zD<#&R^hoCb2udcA+2_+w&NuF@wXMS)t?&zm|AD};g*2A zW5)gZf61v^oV@{8?)6oG=4A*h`?k+R20{^5vK*$4A8(kl*u>wE-hLg|aomk%Ls9z+ z_1ju=+se5OIFk_zS>G534Y2N+>)%3svDlcH5#v?TcMZD8+l zVm{sWPDix0n{QLHjycxoH5TZU(U7Z5{WW#3kw{iMM4D2kzi^xBZwKXo`cS zR(Is6|B`bN%E&`A?l*IYdE*SX#k}Slw9;tO>HN0gW5@ABWd#pV<>yUleWA8U|Upo$c=1iQacZ zav}`q-@MFS=kK35>7)DY{POhErgt~El!R!{tzoKpy8x55x^0ab@!HKBxbOVD@P1ae z3GmnOYNVr&&eKgq+0mx(Lh6;uD zJ3>ZFfHl>(N~n2ljYqPAkU#=W1{A}F)J9$=wbEya{Mn>w=RvqI)I(!95spSAPnZ{U zZ-^&hiK*j}*V#^Aohv2)Z)QM}$Xj6ZSv4TbGCSwvN$+^}Ed%!~$lZaDw|gf+($UQ> zg?8USdH-A{B5JORXn6qi($lQX4oukUaWVTW$Ngcs`Nq5Dpt)w6vSq7rhlZ3<%v(SZ% zZR->jZ_Sm?Dij!Zf_!{sj2pMaY-j@2|6J0)$N4!esvbo4bSyKl>U@LsvaULsD zRCl6uNR4;)od|~%$ug_)sv=4DhnQ-)Ne4&-h@3j;pp@2ojCoF&x4j2OaAE8tcf8Y2 zIG=nY?5=Wi`Rly0Wvb=YBJFerTFM#b4g=tjn|y2G`AOx}9{ZiXC1@vYlgNGW8$!*p zO6wu|G5SIe0f&^%%cVHiR^ok2(J$WMWIy>t_&^TzN!rsdMNo;J{!a1&t^fLcVOm}Z zGGDSJaCHYa4ng}OFSfp(ol}kkJn-FbX_j=nc+{|i)Gm;BijG<9bqpk~@hs-716&y7 z8DnqfBT*YK18W8g`dkF0Gs11fSObUj)YRX 
z{R!RDZo2`fE7*oB4kw0=X1Ay`ULqFei>HV~la-~L=IHC}w1IQ(cv&x#+3uAyOZ5|) z+1&4%w`?YhuWS#<7?m17+tgo*T_))YTKL?>bp6cVAWew-Uurgbo8jV2YU8V9kQgIC zymBf!mCvRSi>?L`CVwOV7Jrc%$x%BnuFsfsCEGI-vwa!r)3%PkaP3ylaBWL_a*Qam89Y9KYN&X;_bG`0@j``u?-ilB$!pg8< z`}jNqqGsE88Y~8F>enR#U(Z-W0^l_^dH|w0A~|C9 z#k|5WeRyDo*GeG>22)M(2CU7nT|)Cg5Dk6Gqhnoq_{7Iqw$^NA@X3V2E@kH!^UvZ2 z^n-{jJc6S%ag>N?II;Jq#@A5%OG@yB^Z|DvlUhy;52!?VAt-x9;iW?*6{)9iocgfhsj?vSpV!%l;z|!?O~&f zcFi*x$caT9_tKegvxn902Q2^~k24sbIB$Z8h)WPqBI(^@3a(4cSpZ;K(67#yO1RT0 zL2XAx!Z9xNtRpmeLgGTcVe1<+<9B%U_5PRi&zry=PXeHhNE(C{D9vG;B?52ezA6Bt z=JYZZwS1&M;;$}>DXuLjt7?%MJHGYyvn>7jNzTvtXa`5V-XTit!>cNTk1LpJNV@$-@_j zq2jBEP6f+hb5G`nT?~pg^K~KDnQ##!Qn^x_>HF^`K_aOt1)2Uh|1w6l4DgKTQAU|I zVlTcwqfuZy*~9Xx<_cPNCo%&6oGlk9{3R!RzJ!9|KU_El#ZQ!bJREd5s-b`+?=dJDH!0iui!n<8g zNiXM0=;4q~?EN8KHIL7{gFKy9IC2)O^|xu0=Bx@DV?iiJL%l_SvQn1gkfxY-iZBnP!45c za2*G6BiawA87!cX8iGq9Vf3i(pvVpaA8>-znKeUtlE4s@$mOM zI$0|0LS}J&#iMUQwT*}>BC1nC-@-=Km#t$3s`4|6DAG*OhsLU)zF2^6XM|XjeOX!* z%Eke;=UFO$E3Ctf3>scpi*n{JXa(mNf6XG_TrF)*+;RC0e9G)1MX~N#CIJRa~l?UuW4KsERfZBp1v{&DbuCL z@mjK}LH(D7_22ag2?YZtcq)pjh8)kUB}if}GkxIFz$o^YdT^9^C_u=jX6yLeqpeWQ zm1E*A|5x32{g=A`PBA6eY*Gch`r}Z&_!;q)gt|WSVEIm6_dZjj0Zs~$Irn8UhqTZO zM@C~6PEjx8!PVQ8C0la`M|}ov2bP5|NUSczxe{EiBrL9N#+Sm94dg-Gg7EiCAP!@P zq5FzeMp9T+bo%Zez-f{3*~0vs2WuB%q%KOsipC0HShPa3j2s$=Y)=s(QImMAEXy&q z`5%*uDctlL?*Kf9STZgqyejr%K=kybH=Uy(aN zMDBMa!#CwvMF_nWxdl;YSNwW&Ya~$B+PjYi+=O3OVpM>zcbD0jA$qW4p{a7Y;3cn< z^Qr||ot$@`{g9*>m=Z@X%uzaKu1UpSC?0v{mmS@`wm%ymv?#sMiP$g&kaoHyVxh@E z*s@UdcGtp(F6!BLhOo^Q%xeXmk`kk))y>=Pez2kH;yx=Ycr2>SkW8k(+QWl{6#<}1ldAK$jTj&B zq=!*WEq!$J%>`MC<7osZx7GFMO89nX<2Sc$1Sy2OMHwsba!5^6+=cqFSE;SLAiL-h zup-1ko256>j;QqTIxF}umk587^Ai0JVL2tSMIfo(Lt^9-DX2V9v%Qu@)8rjJrBRGqVL#ks@_ z=x-^n4j>Hio0C9JzQp^4w#yPHPQ-((7(OX%8y0dz!pWkiHQSz&U8TNtqBfCxE316@ z7w27N)K`n*QA4CzUK)yk)l^iS#7wdiMSh(EPEU z3oWhX^z&m8{Rk6EQjg5pq}q;uVb_6ZzzGVtMd<7CS|uMXM$LLzy&`XUmWovoEbNkq zsX9Li^Yp8|7&69z5cm7B=P|WCJo6gcS*!UVwKr6()}FelGL$sdEL#&gBks2Qdu_4Z 
zZjlT>DWvwvZyv$DMRpx8ojF6uRh~Q2?ytKMqWL6yEne%|?uv6Ns~jo}lLJJ}?JuA8 zqK{z{;)nd9_-A1|2WeLxoj{cht`vU?r_aY<0@*U zu1amKOMg+u*vMNyDxjrzX5bLjmjjL1;#DLbZ_i3ny5vE+z!uU{_6$@nOOWYitmo@g zyU!rT@@^@F;3JSsQpN8s>6=uf4}KakcWCaWrDxqi@^=iHB{@be0VZWK8wwt=RfZs1 zoIC#hOmePBzs>y^+YC~g)SCPKrJnU8{xK$KVJG?O__G%(ZCCE==htU}uft;@tN(e| zaASIH6>#o7tH-t(oMWcib9Pj-W&P3%*U>NMU6ehd+H5sBLVm|tH-V!g$TF-%Z^^M0 zTyPU{W4fIdE_cj`$2xZ6OEic~yCkf%ENn-@(sS+es;3aksb$uiW}XX7I7;Hl6QSn} zQE80y79*3{1vi_=N@9!H?nv0nA-Bs}VM}1YgL10xu~p%8j4ELeo+kUQ$_7t3aWx_* zfqo$~+%+PDMYv^y1)tPi4R-2V6S7{;8jl_2QQ>2BOp4n%ed=$$={Qy9!{x_GNWqO4 zxcu51=czVJ6|6e(7=JXo$NAjf$46)?apl_N-=CvUk|f&boJ&gn=YrzS(y%iUZZK|x zp<0N9iA9d*@Y*k4oZVCmz&MccO^Vmq0u?+18R^EWm&qc5`a89`Q3M-fs-IPIDdngE z*bOa0JQ<4J>C5MnV=j{IEF{V?k)5Yr+Kqq#2q*AV&ClVWIxbMU+s&sY57C^=wn4p) zoqD30C0-JSWLqP@rYLP^OzDtC%(<>(s=xcs6f#owl_1XEy@M7o3@m0J@ugS~-cVx} z1=j%q<*h)EmoK@0RF`{>X@cQrEY9-|S}iqrgjH*gSH}I#2w3{k6sM=i+&^AI>!ps% z*M#g!s$x`>9St)`Uu{Bil)G5<)hj%Kecwv}s(XB^==l?#iUiqMdr{)xLR z|4IaUxj8B*dLwJ7JP@nh?~k{mX~IXvWgbej@|}5~k$wJJuWDMm67@Uv7%`P!;2w!x z>2u`Hyi58yNmm5z13K`}qVXQrM$U(~dKpeUSQizXRFw#n-4!7H{ezeIFZ~B%X%91y ztQxcj0o%FrL<#TgVt}2P(=atrtimF5r#lY9nQNh zFhUTIIj3+|E`&vEQB$J-6_bT*tVC`XPuz|V)2L|2NDC7r(N~lY(u$P9WO?64`g!^e z!49B->7rK9o$nvy&-+c+CAN2R`_0a2I3v(UGZAE`su{+zhS)V#?9=|3CmRPKM|0(B zl-p+#wPCMF_uMt5>?|U~Ti&92B@jJRG5?sgZ1yTHTCpQ-yhgc(RjZgcTdQwMSmB~} zLh1V%PP#~2g_u3h-#A<>F{YYPAwhEDucRC7$A8nh>!sk?@)jz-eQ&0QKcNaLn+58@ z)l=6=`_3}FxL;T5?nm1_3VF%TZz_=Ko!SCO=XYm5zWbPx=sG)@Lp4;swk2i4%fz5o z6vp0~chUOGIDyiV1QD{LY88(Z5gW3ig#)FQ_^C}KzKo<-cifQim*ZMsgOmJB7Vg-8 z@j{HkHYHkd!s`@SfE(hgLS!``{VlhvG((rZeoN_%mMvWk40Jav}FPdHNJb$yOf6_wn>@4iTMM|NB z;0rzmCFCXc{b%!qjlGn!8o2C*os=j2i2n0Uq~E_@sNegU z&v!;+-G@FomFsQy5G(g)dY6&vmuDQb_uuYr0Bs)@5MHPC8DWT-f?=QHSR@~M zXYRf*7DP*6_IF1EKI3uwqDEv!3D*XYqZByrjQg(`GnlTTtL>hh# zIAxu9-NzTghnarJPDbP^cqDqV`y7y<(`JO^F}t z0#QbK;JAz}_4!wIVD!3m`>0G zF08Ep_mmQ9J_iv*qj$ZCMuYToP)?@BRxrT~Xj)`cpsns5MpjAHgCz-nfM_SMhqW$1Qo<~&a#JIDbAG))TZejM)3&M-I=zg= 
z<3iJ?h$C}PHHa=?Z+>0VTZ6QRthtgJtwV1-;De_OdaKD)tdJKT*GL8=t8QwQp3pV1 zOfch63CGGt@^U7jm_u&&C04IGM??6_x|+>*@4N=Z&PlCJ%s{W>)Dy_jaFH@LX}NUX(2`6EYw-BneE4w6W+d zr*UBFMr5B0i>=ipNCqcetN-kbEm_ONnbXW_lY;)~TZC z)I}JF$gMkU`t^*6UP=NPI6=S=Rf))c)f)*z=x#Zt9Bbt+jd9a7eVNU(U0pMt|!SG2NX#;vmvhAoMnrmE$}Degfi zo-1v~zczmirp%IyU?9e;bT-cKAs-p5_K16%BmS$zIFRJ(-1fWFTtHRfeQ73U2CkXm z@7Ycn3kaB4f?6VrC~=U@w8q3ssfGuEi-1c4EEH>Q1))uQmw6AeSBX2pcr?lhD;h$P z%(gP7t>jimJMA8ytBC^!JLQ4{yhAnV#eUAGXLAX6;*=*oNYng8^06EC9NHx(0vJoA zz656pGj_h98@21#lCjvhV#2=?;yzQ@uMG*FjyNP3{O!HS84jI*4ek4RrXUC^O8$6p zlH|$Qr{x2aDzaN>o4Vupm7&@DPs~sAxV7ez9M(~t>TdJ<$)p-hPauUu4{ER#1y~hUC5c$bP)w}?|{p+ zrxD+NAI>z_BK_cTYukVm!e(NbqncHIU`T~Q2^J?aWOrEBO!nkJyP+Q~Og^c5<5bkm zJIGmcXgwcq8(7}5#j#=?xh^h!tOOJF?Br>xd-fSCTdFai-TR(KZ2;~&@oaQqtPe?o zcfD;3f;l)|bbS5y1%AyVIJ>MN195eDe!RKY{5?<*c`){ravyuQXGbv7zr}5Oo4BHZIOr+b3$tU7kC%p84tZ! zFoHe2*W?>Su-DM2ElA$_9yjqoFz35~`anoUohTy;_>$$7fy~jYY&o2K{*Hjkk>-O7 zYTil{GtC)71vzop9oC)BbtBW2(?zH8eqFeX<=aT~kqk=W!itF*7#5 zz7SAz7b;Fzt4-2fj@D|g1Al>g^+34KZpb9rX{;F%be=@|e#xMN-#@+5$5#Z0%4YOt zSBU#j3PC=7hRS*wGH!;X==xn!j#vSd>+by-60H&ZSEA#Iy9macc?ADXN=I*cqf*^z zcEgre!@`QRsoum_C+8-Pii|t2jvvh)A!8Kc-;uL>1I?TIUZ;rvjNq{`A=LHWEkHDzm)}Gqz=e6;AmpTdX=&6-53g zy*Kv&^>H9JFj1J|#h7tx7D3V<9WZ&-4Iy5{up)W!Q3Ph$Cwna6URQ};(%gOObEjlk zl&|E3sLAQy6k@RgQ!pyNVICsr;f@`A8WI+RP^VMKpe{PM(?Mkz^vO2#J!R6UX$ZY=f(BZFedXOrTIkPG$2SW zl{67C-EEF4@G$uazgqmG}rgkYLkE0_yXFd+E^)c(@LH% zzSik_>ywKRnBE)aBnGb(dCTY1((&9pGVehWeu9H#JCQS6IyiL%5}91a?58p6rIWrC zpNCAZr0)p}oh5xXcbWM!?e?>JBPBwkHWFR&xwnFTa0< zYD2~{kUZ@s5i(RwG$His7be;Nre-fEw-oN^uf$+ak;UW3LW^YP@p0RT^}gdK5pbC- zmu7vH#-Ao@?;2tiBSVtDn^$hpAmERS@cv%r8Xvp1aNMOV7>kRqk~oXlK!_@cB#qjO z5FuLFEbAco&5cT1i1-_{M~$9%MC<$X`-j`eBN&<(YiCJhU^E*Se5Sf;XNCeYDh{>b z`KAZT`G${hJa)X@u40p(U|$7ly+=k?i?X*U_9<8~>zxC!Y|Q95awe6Q>s>IVO7-2o zpigk}1xX92N!N-VLqKt9uZN(l!x4GDM!P+5uBwyo>zN`Q5FGZ+9wQ29{3GI{tcSaS zUmOLABQv9~cfTfh3TE#K>}K`UbMh%}*+aov9+ZitM4uw((0jodWj|yx16756POL$u 
zKSfn2t%}%ZL}D7c|7-4k1;Qln;R5Qn-mgfP`2ftPtWcl#Bs-mng?UIzZDgTGm1j&q7^8VZgk4h1KG7r49Br|;@}6vP@Ub>ZWPAa5k|Ny zR+X+oil-Q!M1nj=V7Ma;rrB@U>YNReR(mmC`V*YC7EmkrhB+ZNjW7B_O5D`wvyHD& z`6YHkZuBjKZLmiphkPz=C&~$-U#4vD0DWRdl-5h!)-gmR9XLg~cF@gSdP+N>IJ5%U z<13D5yM>l{kqOhtGRIs%dwUi|?T8kjq@q9Hw-es`#))izv;c1Ly+Z6sA?|035$r~R zM&f5H=p4+*kNZ2V^se zEnKo_&^Q-t`3()wBXszGQ4q=J9H{)vBCH1Ppu`jMjzRZxq4N0?B{-9L* z^E?w8&3{<}-;nA*Vh_l}Yxz`gEI_)OMs1&?VlP!6Y(8!wc!XZi_K{YG)Xqv^7$%E#h9Gx2H2sGMP?+u^ZrT>dH9?|jx@7+-Zvbbh#GjmIEc>E z=1UX$e1=nU^BgJRPjPevWM;6=f@p`b7q+BuSjqi-6T!=~XF=vM2h6r1F~SKdNm1QC z%dwdGQ<|SaPOaMWb@X|f);M-U*AM~;GAna;2$XZo+AEoH%p^FJu;y^8SWHEXBxkm@w(^c`mTEpEz2nIp3w6c@=2h1Bmm4rLaPnJz;O zz}fIbX%t&tF=Z=g=4Q^C&dE{^u_=LolzqUZBIk{H9<+N|pmG-WmRqtrUgESk7B6IJ zcc+2{=N)Ksk@n3?0`Wxs@p*7(R#PAog@4}j<2j|r<@OfsG@Kj5oz^D_9*C)F^LA{J z`y%rMGasSC_%|29==bLzkXJ7f z&i-?v-zn}M%>47@N>7d{vOBiicsXau$s)MS`j|i9y}qqqFUD64hW-gfM5``-9~k90 zb#fh<`v%NbqiNTxj6Ai2%YR}OFCU(L5)LC^G^=ii&wyyx4}qaZ4l$)jOd1L0kvGro zmQB53n%b?xJSYL!(&~SS-POlm6WhvJw5=Mees{$B`7JJ%$3+=UA*|7rGd=f@oy9PK z#TNA}+xw}{_>R-i0eJe4z~*m(GabMGg7BT0d`@&}Vr{o3lKH=0%aHtesz*lHngxU> z;&s3*z(Wd24QnJb;8_>bJu{!)7(X0J+OrS zk~+U6>0$Oh+ny|1BN2;R8+s~0V(2*emNt&1KW3p~8XjdLqhBzymPLDe4I&p;k|&A* zV>%agyG6oHax6+$PXCm>W6D=cN@6F2mRhJl(hd2eke6B9tK{rga8yZHE%0|L!)t|M z#mUl{RSFdbZM~gKUp!Y?GxjpFu=;|>)F3xrte*SC$EX}q5#dOnOKc;vOlSI8=Gq?B ziBw=8Q5K{!Wiazmb3Dh}m79QO-eDv7lb8?n9~x~f`kySI)I)L6Ec#bMd1R9NP^f2s zOy0J@K`hkINidJmhZrKGBgjDPUyt;FVVpc9ICD$0Q~K!fq?(@mF$2Jqkwc80V#pql zUt5N8@uzL@5TKH(6|f%k57(oQLZN)}9{Fckp^3&oS15uUfzJ~y+fH#V(kaNUJ78Fm z>8C-Vm(U`1hN!%gQJfdt@z9#bDG}0qUU^{Ud>-=LhU6_|h+456dGtfV`F>OenBt*g zJRr$UlY4T)%uWe@N}ZobckP&AoD$fuAC^isjWGX*z5kBKy8Yva@w1_FmJnwmE2XSR zM)rzGDw||eL{>&dW}OikZF^^Dl%17b+3S=&$_$AU8M$9apLF&6UZ3B8_x*U>kNffQ z$5pwE^L)RL_i?;l&-DtMuVVF6^=1d1+$p1Uz`G}Ul^n4Jg@CJBVJ1KV^R;h0&X@5t zoRa*Y*&)CAw&%Ie*{Xf-RHaP1y_LD&NrOe(p_pe#ME``vz!7djX{B)i9PQLMM-Hq$ z`5HrTY)bjY`9i7^%GXc#PIittWg?#Q6SAe14HBl!I%pUq&bcudyfV?f`+YyTeuRMK z^YI_k2e#k$(Cdt#`^nUk!#EG_)S$R)A>ZtZM 
z&hyVbSQJ%$!MoRwm2CyKCjq!-UzmU2(evE$@wd0maSXj4Kq67(&4AMiKi$)awC_9A zS)y(qK4Z-fn`(fFvh+4i2>OlGJHml4HN-_geFzXPZp@0UrB=C;@x2;F*4h zCjf%p9dAaPe4kw%1JY~;-RSjW3arW8#sh$ry+!i!8bAV2GR!Ags?>bKc6}2u)5`#j zXKK`iZu!$!%I*}I9mYjMO2NP}|4LFwsT!jVc@1{k=H>4qOsAc3tGlG_vk>%__!cmU zmkyZcA6=fOc*Md^O!~>Xn8bMU8D&HI+|_M~*bIQz{h?U9(3k&GN`mCs{zIpj$Alwj z@Nug}p481P*RqDvz-0*crYjVFQ7tQmQ)OX4!Ap{)m*ew0&c#!D5;nEE=^*f97nevr z>V5+HFxP6qsZDCBFDH%>I5E@E;jP4MW??d*N|79CR*Dw;VIWEKzR+$|Y&=kZ{8&(v zO@&h~|7P$q@Z&R2D6OL>hQOsVK)di1jto46gM~ZHZ2*+{p!?kL8pzV@49SI5R-Sn@ z@2rmKa!0Dl4F=eVu=pzA(;m{#FUt_l2$PKO>si*F6zSatSu(PWGEDk7~T22NQ%AVPW49wZgP6 zIUS~&RdLM}<#+c^QuZqCBzPRx%(R!qx2{ZHzCF|PEO$c+Dp{)6{SHZUsyvOa95T7I zvnzIWvOhJ3u(M0c#w@hruDLEC1+r?f3OPVYcPw-GLAZe{iDA*MV>g;at~VTdlCiM= zN`ZBh8Rq%)coZmbJa_a~j)~AXgjp5^H9sh=_4w%EG*CDTcMR|ptGu8mX%+<0)OTX^il@WiQGkru-GrzMC+PeiD`eUi!vV0X zjHkW_l-`S8RIh3hNJYBVb++j|a+vCPzr42nQRR0agNPOD$&%Kfgmwo&7^~VM<`kw7i@=pd# z>iUzkVu#5QYOj|vO1M};#+8K>D|P2j5V1m)4?%nP%P!tKTce9DRkbw$OY+x4^tq71 z^==*N?@LxTAhs;@HMqv6SDcFqZ+FiZBR@e4+J;Vr&mUj#glu?=;#i7rozz~W$`G+1 zQ+m>X+_R`K3<9(`^e)@!v9U=_%q&jvz%HV+{&X0ePM;`t=LHho)R|?w+XM_p6;j2x z3o}!x?0zZ$s|oitL&p7VAf3S1=^o20f!+d}f^C`{Jz6VfxklPqrAy&RZUsA$KdEB; zsvV(1ltiY+ayAEojn)*ebH~&ORH#qy#OJ(R9d{&KSF9G})T=Wi-oR#Rnd=_IZrGDm zg!7u5tB$RQ5Hy44J^- zYw4X*{^iy?zwfD(#b1$w#~}MdcpSqeu5yLslGGu<*RBV&t0y|~0~`5zn@VPlgYRY^ zXuX10mtAkYiB=Bk_$4M^I9>(LWtLrr+qgRnK0~5k*&_7{s&62ls5@S}RQ+3q4-fvp zQw|M|j=O1w__(vaB6*$d(>8ztRE*QZGn-Kaz{XAQ2(7rc{;8b@AAF&U9XgR)a~Y$_ zp_~$-CA|RY0V^@7wc)`mczD%AF8>*f>FWA3> zM6YV+=*0z^M$3{~^J?F*huFE!3)?O+xi3EY0dWmccQ25WTdLSjz+)8Uv=K1vggZyu z{6p!rJ#9eSjm$6KUG#cP7wwHM+;>q{*2%v)N$mw?!foNwLC_`wl#6tar9AGnv}UsV zDny*W#g^Vqp|IdVlKSZh1!|AD`2#;;``~;@9x<)U-MVm#P?Y$Q4(IpGqwWAZ<^uJe z!xu4K@Qg6c&jCAUIoG*ezHNu&>Dlq=mw92U)}IO>oJ#1A#Y6A{u`L*4;ss%Ft-q;RTN)4<5h9C%nIYs8Vz?D`>F@nn}@C`O^ zCk6Z#PSI=<+eb1_6`!&VPm4dIrxsFyLZ9zO+p%OrNmFseLcp0KaK*ba>sKNHH2@J4CLD!(Aaj=BK1*-| zHc3hvRqk8>Bsm;B8_61*NWYuxr0>3fQW~N3S1NAVD9Yd`+=4!4^o`H{lI<7n78krM 
z>vY_wz^c>x1(Zo6#FuVwv1GSMKdWeK9>W%k^myC>wKb)u=CU7bevphg{V54>s%Wba z59y@LX$gD+GA|@n_SC)fBpIRfA_6BYcasl{wTA~L$Cr6m`-7nNIS=V_QG?{~Po@mw zeygABb)2gq0`ET`=QCkzZnC1`(Wjp%!-U}!oKb-1(fe1fjiEAjWf|cTAM3Yo^uXi3 zV~~dVQOWk5e7aD7#*yfL03Lgc+B_Soio;`G)ks{bCQ6c`0dTuDzUm5Z=#frrrQY5o z-0K(SnRQ*$e5?!0p^~0I@7<)C)vdaG;1f-vj^tBol#~I7j|uV;4BEc|i62Y$hN8OU ztgD18{_4x#lBzEYiS1@VS}o=*YXuGr3679EXhoOPh-&2Hmm(*qdjxF;O8l&X4tjWk zZpCevX|dS4G>GJ>=I3zN$IjeiI(~sDkMby3b;TGr^h5%23i`0<`CvD`RV@e!3*Yy> z`H*2S2$*lR*QeM@2#Ea8N9u#WC=TRZjq1G=cAB-teOr(NJW`?1@5+gtg1kUSZqp~I z`~hnNAfmx2qzV+xWe=&xbhsE(6?Q2&a%^AbU_OYuzBJKVbuU%n5la%nTC5P(xAK$u zHJm0Q4DnDsiI7CRhh@_?mHl?7vUj;w9Mdbkc6JIY?j z$LT9$Q5m8!xz_cijfu7D$8a$@R84#C7+?s#k^3MogYXV9Na6)y6Q@LeUqY!{!v-2# z*E8fZFof2~6Vu`JD|v#P;^LuaG22OT^03!Q%5SUz!R~zD`bZJtcA=C#L>iKY5;xQv zoIwd9;#*sz_KUre!*a=fqdq>Qu`@aj^z2P@5!TzZANdpc3Uz}Yjh%R3@96nt*xm)7 zP|X+1@<=ZKy|Qg^w+l%Q$phwlpH;wN&u2R%g# zJ>;UHt8rtntaMx`W6cx%a`t7(Ko3gF-E7Y>k>4-z`>G6iQv4Ih@>uQB;*I2}u?{EL zh|sn6o}(ICCLQK*wmmsJ?<(R9eB8=*)kw>fLD;&s!&9gNq|Qu7Q@XMcTXN(rf3@x* zz=qn=yd*nOI+w6e3XlhIO>k#jbtUZ6qNX!PtxoTZE}FfoMOmm-?;m4D@{Z^#XPj6m z@Wf(6xmazb6Y@|FE&nh`i=@dU$N*%1^r!ej&|=+6q>xU#OTX5@>uUjI=1bW3{)qs+ z^JZC|xExKdhI#Iz8ga&Ewur%Wrbt)hmAjWS`<2&;slbAnIrsTd9*rEZK}-Ui9B^xq zKNk*w)d9%P`1K*>I(p{l@4sUbF*wSLzp^cUQmz6m-XJ_4N%g^F8*(!v$SDF^HU^>> zs;#cKC7w?LewC8E2WpQ7IK7V^Yyz^#XGhG4`qK#(fPlT{_J***)emqpR}o{dZ{QW& ziDFhLduRIJ5;A~GP>1Rv*S%^JDb}LM<>8o&>Ky~5Jyj3Z*G1UW+mp0=IGuTE{a(!T&ly=eST%8j0;~#H#uTn6X zg9x6Z)}ye_&iq&y5ANPI*~qG_Vv;oWMZ|{c8h{LTw31I~-<78(Lj{R&^nh+8;ur%z zp5KpcCt#acEJZmx=)-IA#mQFD|NI^PTY{YTKTf#=g-@?~Pq8K$0DB;|AciNjH^rO> zf~?Dy$luoqIQz`eO*U}GvYv%~Rw-T`I*`4-5dbuKLa_N2oy9-u_2I;BdCHNcKvSZDeR;Py7M|pGYMTa(lIxJ&g~|$1M`8;@z>!CcwjiR9TBFthw(Hbc?cC`?!ByDJ|^PfOzh7b zwx8r*w%ur$gqo>#9IgcG&CD-!)nQG`&yK$0EMfeh3~JM6UUlgSY)i20yASevd)js| zmDE;%K|*fbEm86XF?cmxHZa12AdyNx@!Vp#_0-a`sQ!@zXo4Od@(bln0$I{=WI5lu zd|`j-cc^Vq3?ss4l|a*3u}mtY3%Vt7^7<5&1hIflXA$SFAst;;==n*?c%->y$w!DuoooTZ$P-|ag9R#Duh%# 
z7}MFQ4?z&2<#lj(j~!F4Ci1b^DL&_d*dJKy$0;8V>jWkPi2#=gIGgiM&GOydLW`|*g)-+GMUaDz_d{i~=&mBB^ccB{%?7j}itW(g_ zFmQ~sjx-}Z#0^Q^9u)uAz9|9O+J2I^AzEm~)oW+llj+qHl(e0y&!E2 zgJ^-LX9HRg_?-ma@9VHY<*gEYWN?iFHUsM^(u^OUpA67mY)?E-^+X92l~s`5st5?Q zp03Oub@MVPLxpB=6qyP&hQ13_I@C5OBF{n|qiKx5oF*%DIW&))l4ethzZV8EH)c86 z+s*mM&p&JaYz#X8pCn7tQK1I`UZYNByNm#~ zIUZHGi;?~5OAt>T4@>nkuouj#7>|{4Lawd@Mo^Jln&~+6wn1EX>}j|gg6$KFGXx2@ zgm;@l>5&9Nl;cWJYY3o7%E?R=d5oy}sloJ30BAmx%Oq0&1VLFx;ZDOS@U%u-!C7hw zaT%+Un->RzwJ==>0jBr3xA?_uV-$y_IR>TvwDz#P_6DkmWj^g6Nr#6bpJ@qOfsA?y z9u?=|;E5lQ{CyXsTt7aaVAxvP7;E+QCr5pI-<&K1>cUT2R;mQ;vq%sdJp+k5l#X_? zuzCemtYCkHJdEo-jTN$rTV&3v3euXK&o8~ighMo1H%5gA%iO7ZM1^18j@0)SJ4S0yU;ujj zvtPCqJ<(uSPlf{PyvxsHx86$IO~%N|tvvXeF!IC2C!CKp&GP}ZVU)BtKX+VO<`JgJ z)37I$U~ekz1_tStMcoR0JO&g=x|0p%KG|4s?dR6RHmJwa=D#^w8G zrMnaY1`mk~M>dKwO&mS>AL$m%op8RgJf_Fi!MzXaQ|vWSzbt44-I}z<8^Y{6e!K^& zrV$Rdpw%}hLKSSKP7Q0Mzb~i4<-aH=7Io&&It>XYZc0M-In)BSLJ&*=$1a0_vfO3z z;6&iq>1}sQZY(}2f;z1Mr=@EYlQID7TMekr_H3iGR}F7L2zWROf7ki<(3VF!dY8W69u*Pk6%DBj73 z4$B*NX{pbBgH(*}>=tkYg+b3Ex_J}8q$r=I1GG&5YLpb>%_3owyEK<)J1j$$@35Cg zQ43^u-)_D!s@nAi%-vLixQJvblDNVqe8)2DTUaVC`M`(RGWY5OByVqfqvR_j|FUD! 
zp;UlKeyUc_-AB^h^$8fR+VP=CI6$I@)xt;&A?qIY8h~l)2(H<1U7}6CJ{p#SSit!^ zgRoZ%&+TY?@v*0_+5Nq`_9iSJtYulOC2gwwvUiiv=)Ptr*TULX4<5=%dNsp%%N3#D zw}5&2@6Gxa#C-@AbV>>`f#NiFZ3q1oB>)_X{C3M3F@OtwmEfr*ZJ&rC!<|c!+CBh& zZ<8IT9U|ePnZhMBhZKZwSxyj0eF?Zj1mneBfhQP*cFZdNG(58%vnYn_!3f>W#O!1D zmg(#fgjYwaqAz8YRr^($%+pN0QI1VYCCgyA*OK zFCrr`+Wsb+H6fbx#$gb73IK^?^@0LND5`=VdcE>_N)gR^Yu>Yrab=c#+GE7a;I477 zSOed8s400lN;fnUePR3FykL$liIPXB$}GQ0OcgTbA_sWV8%1^NIA2APGm4qF9r=cW zy`pCh#oSG>jeOH2!f6}KV271-7pUc1&$*l`BLc-X8KK$m4W~(|HEM|;kOcN5@=Gj8NaRh-7>+Z>Q?*gzZ2{9z0-33m%dmaiDvA9p|Z2wEpXm;z2?1*gek+wf|x3iFjc8lVBHBv8ogB8v&();Dc}Q(1UBD zwtJz2)uH+VB;MUT<@Eyc3nQ|xb&;poUfJe?-FXOQTLJgNxA6&q!B!^F$sAYFEqIj| z@&Y)rFgUtg^CNgo?vs7l7a(H{G%jt=5=AN4)kO2uHmRJtTYwi-5&?L6T04PgEVjDJ zyknN1yXI&;j}C}}(n-~>zv~JE$GDO1jyUStgo-TLP@vnRcB4pgdI*(CT2`$uxZsyD zJ2|;}`5vgvdgrcqE95ZR>&+!cIfa|O1-GUzV>nby{7*D;QF0R6cYd1v91&P16Tw)* zLmM5}f>75yas>l~f&h#{yjcKG=~|1IY>QAnM1VIt!I+MCH2h>?ZLIzgRH}!TeQK-b7KFi+|G^RVz5kDIIP7rk8~v3;{(* zQwief*p#blPz-BF(-*$Eo;=PDmyqpJc^8kBqbMJ0NCmrbXFs3p#f>W>x5(t%h@)of zdwYkyc^|?4bRygRknpWU(kBEuedN5hAY{%gdcb3ozW{>McLH}fqaDZ-AlT8#`ek04 z3@wfV5*x@NxQFRigeMB6osF=hD4jm^%8l*8H;`S~Z>DSGyhcl?x=9;1iXX9xHtpNfKBz^|D1{J9Bo zSnB!Zkc;8`yx{fr|0uC_r`xFh(HJKhN?Svv64{$>I|-m1Z>s5FnfrqQ64_KZOQZ2O zUFiy=onnlDeEv_#W&pT!!hI3u1>;JnW`r2RVc+w0-2)M^#!yZScjUv{0o{9u0tdAO zoQvYUCjpN@?8Uu{cuVfMb5M-!z2^CCb4(ryB0s@^koI9VAlu1q_-UdI&EFiFvkCfj z%F9?ip+1!B3Bs3~%mkR$f&yW!?iv!4IdP`Pub#S_DTl|Ob319Ful!0!pgBg%`?V#n zlNg%A(PWRltCr}t4m`h-cAG#68^afPI&v)uO>d&MTf=}Cx(^cUcRm9cC^(q{pm>v2 zXMr!R)P&g`r8wlj!h%H!ROt2R*Ddk16DMX(b%RYfvg>BjJ>hdXRt339O#hY2*rN%JU@eK|zu$f*~uuk@POXgVWKH6+b4}fyev}^%j%%g z4a;Wh7zg6n&cdzhirEhiS00#)q!#)f(C~{bkYO4gV#_O=1%`)9GrRWFA5ygS$)qAV zo_G|0g7b4Pe)|Qq8IaCRHeDOCo+K)rNv6E(~L5MvbTn_d3-Z@$W%co^9*BdXl zH`!^bTq{G{sB#hTG10vCAG(OWnE${^p=HEsL^8|ZVbjFwps`RRXw@TBY6c8W1FAJY zK|X`aCRI2iwYx+=PoqW27#{pMx!$fLMDfA_H~+D5RO7F?Wl?auM?*sV@w5#o*S2Cd zXv?E#P#3}n+Ec3Ijgz^%ZWS4P#!=rs22Ctm39mV#GfDRnJWu8RT*Ar{v>#FKPZO9! 
zd`XYyTul`UE&uhQ|91`!RLCFt;ATRGG{7)tz_OXFwxYy>65@Xa(StzXlAM#m4D|aM z9EQ?z#lF5qHFUb(o5_G&U-Fe1WsBt8jy{J9lJW4jM90Mbm zgv+xkg_?wc-*FQ0MY1KTbIJ{Hr0f{nKO#?fdO_hM$kC$43tCmRXTbDf(FO!)e`GJW zen~gle$U$*m;n8Dy~1nTC97tLx2RZ_HGOu0xP@6iQP z9lmCMlrP#n2G=2f`8o{hqRfdJ^orhxgmB~HOeCpnx&70Oc924?l&0$U>cuG^!S;)Q zGATjqVq{MlNH^jd+4&{j?vkG_S;8`jv>{_shczJZ#}#0E_gx63?Q%S3M53L%7hms! zSDs!LLE_oXtDfhkvZ!xO9Z>qs^I+PpfRnVe2#D0jX%!dn<49!> zc&PitWj*;*DEeFtcpE<}Sm&mDtzi$u&EEk+E(s!ORWWX11M2s*5t@;YT9T8Ou#f#8 z!V4Q!&rUZE1-u9H*|$B$*OtMy=8_NWW{eB?N@YGJgRfNBCLJKbB;1}B9+s2TesNZMshy6_+rSX91~yCaO`m1L)uX>##Lc?+<}+bYy`vm zv1bF(;IZV`EQtD(U0*}y4@6$u*n=ASxOuo+;X$t$gfzhaC?hCxQ=1B2`#=qP=di(P zn)ky%x;oD8rT#$OL0^1(Zs0ofj>n_evom31YnrJGJZuJfT?;h2=7|FKbs?1SQMKFg z_ltKf)K~xx%ZQn?M!XP5{24g0(DuH5uKQtwd6D%3e09I?hw&KwBci=y8pro#iJk44I_ak@X_Uu6@Y03`Wm& zyYZrc7~7Vrv$&&p7PC-*qc$K(F(ve)*b#kQrdSG1KU2-UGn**+H3s-QzwS(ER_%Iu ze(xqytA8=L)JZHQ05h$v(vs{?WuAjLfL#0?P-x5H%83vtkC0x{ewup*%yB2~y5Zia z&{HztqYDqVO-+7k?ut{duKoxX*ZkB$4BvAy%R(yZPykQ;KgdKy)$g7CJ z-qcr>CnzG3BM0Ts0;RjUS_G%l;4aCUyFT=pKtGNf-UrqGi56ome4zQj1Tegr-%H5? zV*m_-!WhhxhqoV~4vgR)&gNLZP1;)sF;0>B>B2vN0XsNRA|-tTf<8{KEzbp8QnU_J zhkl_uxE`~)UWaVJ)e#ag0};m49f`^`FO-kt?=FH+XxI8K%BQD*$$TbpU1bS5o;Kq#D<{g5gTO)=9az@H7~vTP3$3~5M;VJ<`U z&S6M(-kUgh2?u7x#eMbjSU%KqD6I|U>9?-m0DPFT7fBU@j--BI-8Tzo?*3~AZGqbz zN__@SPuxC+CG2a1J;wszpR39F&??>9&m*ASK!#bO_`NnltpD0g@To%^D9O%qHIAhpppveGEJ)5S=Hgk$!8+;`P z4v%7Jyua=a$)Hvo)yZ8$DR z4TN!^n4-gkjbFAy&?SIvL6_jNikd}dHRdNiXoBB(M;Ry=RB-!Yl1VkA8x4xkL{U%= z3LqoBd*~Pq!mi2a?1UMMt@Wx1)aUF$AWvljgTC^|3#N;}2jV`?oPKoj5Kue)lg^eg zNDDGDiPUw-7c=d~A|`U1dTLx|{7vO{y|oiOIx(CptbRXPVGu;;hufk4{xdkPmzGv& zpV~(^h$%n{ccWN`p@kA(jlPW#-Je)S)0WeIWH!=I+4c)|PPP!E{UW1epV) zjx*N;n>dvl@M8-ogl5i)9ss>Dq%xG)wbt=db*!;Yx3T`zl>MeAv-c!#9v@nr}FPio?a!?;L5$Ij9P> zdN%+ud?v2tNOy>oT6VpX)mi|{H6o1kN_K`;Vtf?vHs3mBQ$dY22Ch#hYf0yCh6|t~ zuRTd|c`hNg6k#$y%wP!9^bb=PW`?y31WOH)YaH7*A#P!$n7mlual=w#m_nWLoD#@t1$ z_HlY<@dwD&=i#1PhHX6*jBP?Bx>KU+7)>tz{TzEfLd-Yk%|nS}s`H7lZ;)ELsI-Z! 
zaXO(D82GhYIUuC#m9G}9I6oRYMx8n80j=Fd5Hck6Jg;p1j~TDn%mKQwx|tqLsb$da zo4ziFpNQnV@;Bc=UQFnY43<{$D($Y5z((CFvwr;S;|5RE8U+9EzvkH%Sm{Z0+e3q` z?9?%Yc?=-OgW&<)E?zP|x?qW9O|linGrnyhx`V5)&g#_}P-eA5TmHtRZa)2eh)4`$rgjV+f@bfZ$4}1Ao?a`x5Feq3OGBT{3@e0j^vGSRv|i zFLOKu@G}ktx!ybyE0d2hJwgYyymxU#Zi0;ZBD{8ESDs^KQY??n1K8(19bTnmw|0Uq z!*XvKwKC;iy>Sk6^dh4?QBD_c44+gQsFWfivo!FA><=fP(qiXD2TZYM}&&aWqY<) z$<+0C7m*WhT&e_Au{{`Cl4DTbH!VBIwQ%D6Q1x#r2cF*7!$p8DZk`CllC+iXCebr* zSb?Xt0jw#;1IY*#L^HV1B5|3XuK!?fKYvs=*g15v_j_Tyw^OH8Y(L29fb4YZMYqYH50#ScV}nPdBws9K_C zb?hXwKPij+N{`WePK#GcY4xI_l!bdBq1y$y1C zv`Qan;!%5fW*l_lR10%{AoGT34NI~9-UqB0?RVwH%jrMPg^+8`i&%5p%ae$6IMDx0 z$#N{ce>%Z8qPmT+6+Hj?IsJoyttK8r!Q?8k0t|r7-kOAyy2`B+6Rim|7m)cC#Zql{ zqytuZ+@S1Eey9o$?qHgb?7RIOPhp1%BV%uQGMNk`ZPCyVLe?^@+k#;VgbW!0Y#Qv& z8i6WVGzJbBF9q$jLl?KDVTWP6I9%^>jTMNLoM9)9VyBY>oocAE%r7wY2a4Ec9`>WhW8g^YkKiyfEP&gR7tr6!? zS_Y{lW!P~b|1HnnZj17v=HIPIf(F(NQ-|-}r6(o^_KopV?NHf;0prxe%!TLvaq_X& zwi#s0+G~M3H*BV)a!4bCI-ZEq-zeB=6Y8CXsQRQi5e2zR4a>KXnHVAiQ2XKUfainFaCP zY|7`ic|yc$AQ{((DRom%SZ}?Uc=L&s%V1n9J1wJwy4|WpH18PPM>bW@sKev)9`{VU zDJ$c*zd=j|z??=qIB`#7HJo@@)Yj;|Xu1RheOR?mHVx2;a z8bR>qG9B=+9)-@YL>(B}>4oS3s6L(A2f^)7@$IzT&dB|{i)>14fodr^Mk{3q9T9WvD_7brx$MZ~MCoSIYcCPA zGPhOeyy0B`beS!B>@u2Jr8h2jc!Ytc_#pqTw;AvFYb}G4r_UdGh<&0d6*g;>1w-(f zcHb?_dTilfZ6S_gmF0*F*yX_EeOo@E5P`pv80myz)i7ZGjoms zWtBddCUXriiO`9D?_e2joFV(ZMjm`*&36a8^%#UKqnTQZ(}d<9*(qML@O=d}Asp%| z^V!&CjbqVQ{DR6%aC_0bhJ#nuAb!E{c_q;+BLk4tB@vLg6xb^~%;%IjX78vm5FL_$hhkTb988=sPW?yv31HZaZ;Q_Qf_tXj5!s# zED-__^H$>|K;{0_x=J*FalV2O^_?%*x_XEEotpgp&`Gz4S_TO$*YUR}*Bv)X!gsiX zHb_ueHf9;(b~m)xQm}nEilhZ1pfumPZV_^psmAVqjm|LmA z)Rf-KlXLvOT>=y1tMwe?A(>hXG z#dp=ll3q<=1guI{6EB&tv~XaojT91U#UOfbX|3IPQgLAq3F{56Rvk?fDYS4I0tA*g z_MDx|8sd^PVx@&dUD|>WClVD-c0a!rj=s zc)}}K@RikfeU|d(GRFf~NgCy-Po*qqd(($&F}g(E;C}5esfuQFib}Zk6FTmkweK>o z>{Scv3dE|mg|}sC?`Sc0XCfJ3tKjVXLCV@8{3@h8!3B1U8Dg-fnn4_f3$I$FS{_$q z0Cn$7@g>VEfvjQT$D}Mx0>kJeI}13J+j)q0=jkq>G}*%$C4gEaFVU@0C#Ezf50+09 ztD8?eawyj-xA{=GIW2H~N*`ZYNWDnU-)~?`cBHFp+A-IyW0i4!&Ns69z#pxf7w13X 
z7zI9#Xq9@eO3U~S-^ur~mq*dK$BWR09Vq4W^YK5BKw2@?opZEKG)R3t(Rgfmz^$3k zoJygJS9E1JJuBCSl~VCe^~V#JL4p?KTD~a3u}@2y7Gts(%7b}0kAH~JbI7hk6*O}J zC276WQ3OZhKY?1U$&AR@8e+nr{FP#Z7D{=lZj6n5HuIyy>Ty_P2CU# zV@s^qIAtL}pEUlm6d|qgGW}N7BPC;#5qP!Cn_(#T%mv72-e_4)X^t;ba1R`` zEv=f|pq9{p)N`uK*y)xsRM|C2qN;NNf}lUWD3izUopS5g94K>tyAi#iyBoVe>@7d) z!e**7Smfg!Pie`NEVZ3@7|ZK_EZnff)ByrNx+TtSxTfLT*zQT&Cnb{HJ>3OUa2~2{ok)rQb_J#qNA*v=fTAI~y65#{8E7DvY2D$cygO&S*}b zR*1W{GCSC6jmBA~(l;99e)-%0N)So1KZ2Wd6GP4$=$41q z#$iV3V&PXLyD(onLKq6)hdthTJ9-NIeUt^uFb#)vn>J-imgeBp@@>|sm|go085;No zv+6uajcF~upxHa7kp1dIgW!=1#eT%;BQS{1ZRsX_9sSjPhcB!=8-r9%wRyV7 zcWci#KC0YT{-Fit&mC4yoEOpIxYsG1w*6{UGI&dZ*$_F8MWUW``E)i~vx zdj>Zi{)FCqJ1GMi3JT&4U8?rmz?JCoLAmEvUfHZGyG9zZyh>>|X{KBvg9S!==-h6w!Y>TK$8iNfEYXw`i3hi6EH&^X~5;(fECxwl;)PzloQ%k8G;SOhPFV8A)3Fa(Hx*{LXk%n z!YDog=~9=tUHr;AiYfu9onK4W4lfK8CspYMKk3$Zxf!*}je|oq2hzOjNffY^iVjBW zq$eT`(VXL}LZm~3>i0H)&p07x0HK)|4=boaR|Ls&KQ}{2qYk4URAz^?x?!4EFtd;U zItK(_BR3LE_|P!yZ5dbB26u3tQam-72}1*Ye4m~O+>VIK=U*b?7vhLX&jpFr#h`hCNGFzF@% z5aMlBWh3=SralPBs)~F&;1% z6CY^2>V*XNOIt=b$a3u|t|;eKb0=N}(4eb(xULAUE{8-&GHiiQtcJ zb|K1>Yj<2sdZB@l zBVEEHZLFay(zb#<9rVstk4L7Jg~FRu%17^%EK84@BaCuT6XjQGtC#q9YS`OgdQWbQ@iHjq$^}f%Fd4IJv-t?6ah_>MNa)rS!xpPWDPv=8M>> zBLYih5WVLV7(P&AO9QDN7fr}cGQpQ5-L&LYXwRI(Sj-!>Z3UX`{eRpDK#oC!b zn(z}-_PZH8o=Gpi?zk-pfy<>gMqXjJ5s3c9lq?bBMA{e#fcT4R&zmZ={AcJ-9#+;}IS*iua&Anm8Q0tBT<}XyI(oWQDuM>N|rM z<74;Z;tN}rp~+~kSY}VRguu2Rfb8AmwG5&cmiaV>4yC;WIQoXU3{0_wX^>TSvY}&z zVQfVk%4F+sx!z*-U=#*YSO)*JT!RVSBB+17hqkc-jPwTGkW5 zMIzX7(fg#)G^BWweO^4gI;40+2wUX>u`QGG?N9xS;X$+jQGprQs3-{@=iY4#PlqhG zubw}fz^EHlrQ4zPt6ida#;$g}^{olv48-JROFw*o)MBN)EAk&HuHXA+ZtV(jQZ# zED$`VW3==N+t=13W$)|U#8BGAgFm!1*?yu$IKDOwgbAnPjQXL|Nh+16NQb_m_7}ic z1BdT;pTt)ey59#iv-n?zi-T&33Y7Nk7te%&dhpYBKLkOaj7k7ea z{Db*;T;<(wIM752>sDOWK4{G2qsp7^?~9Y7R8N72TcZIi_192{PGzGPoB5p*h|O<= z`p(KOY5U2$@y26t=zO|Alj21=+_sXOjm1-OELs4#JBFvxKLsMn0wa<4O2f~f?s(+_ z<|QjOC1q-LTQuDsV2Aezv?7h*gC~6OJ|99+ymuN++Or#;eP;#*rHOfr8t@_SR4F}R 
z+3)ytADiVEWXbRiQscVw&c~yX#PJhwp&GZVJgVJbpmm`*45kjs}Oq&xIOf|PR98FxcZLD$_oC4cBA3Otm z4`OW(2C|FHf&F_`>aHC$ch-bCk$gm4pWjLJ#cFp(9*Z`*fTiimOINm@72U8g*U!Ic zz4=tvvYL68;m4#J=HEom}{UU)d^jvH%3ieRpc*aj%K!cBjaVD(`yxBt5S1Mt2A!WYnF z>_8pDLDrq+MV4;K)zSM>)_NzNM4F3#PGdxp8+Lr<|~01xPxIRa04Y8Y(NtHT#I%r zuhbDah;Du?ejCQ5e4&*Zxut)+mx!a*v&UZPx*)w9jD2syAQTPNV4%6T)}1qBdnhk0 zHC5MfhT|cxM$<=BLk6)UV2(`d3Ph-7Ew_{;f|k>%L`^d6rgtQE{|s2}CjkjzxZCMD zy&7=sWV)SloFCmAP{^CknX>3ivgzv93Xz90vI{Er&WD>rZH3$ygJd-xulM2|SQ- zb1#TxyXUXiA1Ghoa*qLNJ+HXyVz#@(>!_;tQbvizX6cr77hnjN>mX3(!X)#P#h2xE zUQCO5!uHqb%j97WTHuK_@#oxn768i8)c<_Rhv%@MpZxSDS+CrJLDX)fIur`}zV=Az zji5@qC*GB5j4q!k*MuE2C-Y!D>fl^LQAX1nr1A@T_P#62qTFFqYB)EXW+jRW=Kc2E zZ`wtS^;1vs?J?I+BwjowA_)!;+8&Rs;-#&Q7fkQXk`D&9mNq6tcWG|W295hM;JBv^ z#Qw>ie!#&`3Zg(CE5u2z$y$+6vPDYaOE{90V}mN`7gdjB`KMwMK{Dsm9q*1GPGy2Z zGjL`Hl!4a+ywt@KEe#0kKn9v+x!`W68fHQDzjS%XR^;{o9RB)rc7&$=K7BbTuL{*IQqu_1L;fI7GPA(Pyo`7^VIQ) z3+g2OSj!fdv>%BwA`U^rx6q66{40J$-LYP#|EeB_|J6EtMK4Q%u8P=E|yN& zr)_E;dRVpwD(Y~so^9Ud>RQ*f-|K$8QFJk^GiG{C)BG$)e^r~9;`Fw_t2%F)iWpnQ zJ8x$(lShqWyVbh`#+b0Hs><%C@poMGdT9q=>4$4g*Lp=G8_?YQD|@{S$J(CBZ^8w&oJ0d1(Hob-Qz#&&q;)XGrqel<=(VcZ&(paa zI~ZV^LAux`!fi4Gp-2}+eHBQ;Ie||6gkTuKLJbqY+6%AdPs>~t(6oqDE>A1+u9PQ+ zsdbjI>a@l1g7jA1fXa6%_8}?_Z z)qiH4|7W}QI+kR66kjTP3kswN)N16q%S`=atUzCMEdAzRw8wwGEu;vgmy~ZxF3s*x z`tZGv>D!yY*IwIYq)7JeVL3;AToN^)w9n_P5!~?s|i)Z$S{Y_EnT`@DATz$?@jqLUG7$ z%sjc(7aHdMBZ1}b+EM+em5xONK%A#T30s|$lT)_Tfaam zpR{62k&N;3>%RG>F%74LvUES+DF+eL{2{*UO0{>ZE%M3rrSNcDK5I$HrkWpZ))7GBAFA@$vE95XQha zTLVYwXnX*-|LfPUGv<1MLme$B->?Ehwb*EPWo4xhAdjyHwwO80=evLXn3c`_5-UR% zGc_=8i4x4diZBJ7)6#7jLU5}o<>^&sgrUD{0lH4Ut844JdENO3afJ!fAj{l{hYQ=; z6AZHnIF}$Ppb$nEBFSRjKov+4s)1i%f*SgI-uMkDBvJJD?3t~rsd=@A*uV7;42J(0 z1aM2!L)()Bu-HQrFen}(=(}01)-K|pDJzLZXCL{B&C$++}+amqd$WygT51{4G>sfmZFfFoxKbDud zapT76*feOQ-%Il774skD(U08M5DxLHQA=x(bQl1dtSuO~v5S#zX5$;Zb>2>v~qUA6z?RLRzRRc8A81(8!9JXlM1rjMegriRzJ+Rt?` z1{C$~dnMk2Ge{(-@y7lD7MW(T<# z#pWuZd<1k7{s4RMm|5o83`zuvOr@+e+IUbwa)nb<*PmTwRb3gLTaB!LY<(3LD13_- 
z;8|x-Sd0S0S+6+Jc}aAJ8t6g$@8;mW)?D3su)Xdp{&jUb-a*+vnh-4E4?qmM%Oe19 zJpipq9jt#7U>Q{O^%()y2yAO>1JF+ntSP6V9HWM4-t&x5S@{7*z>mdZ`^dN66+hiS zbivl=Fqr!E%Jr?Vk$*mmkuo`f`SU3L`Mg)j-_OFpuMXY>3u> zeG#00?G=B%_Vcs99^F5m{r>k=`tw;A*8ja+4Ca4V_t%2>-(oA)rMWc{hbx4r=* z(2R|<#!frC{rSwFzobz9?-+&9mH!nfXhqcjf1;++u=9BFe>}8a0%xyXyY}?ivl_4z zJn`Ix(Qd~bJa`T=!k)-WOG~>KR9Rg5tC0TB(_wlF8YgE67)3?6AE+toK+1|k!_}X@ zc+o40iETZ<)6j9Y?}ukF1cseMOz`KS_hXix4du2o@UJ5%A{8*@T)#KFN~3`P+^qQ9ZPHt5^8_S}=hh(c6G5R8w6T$Xp2>jKw4j ziVuL!nL=PWu61x#86@T>yXQG5+G|j@Sgd@`~NC$f+Fu;=-xW7uBlne z`rhEk-Xu^u;^!z{f)4&T?u3R}CI_x{F z1uec^pHr{Z{EPB+*8D*aH`F4sqeQ0qNZg)DhOjH%f^+G1*Dz$f9@qn#1Wn2d<$&0G3>*AKd>-kpKI| z1hxTrPY-Gu-AOl?%md@4cEESjo-(XZG-!^Ilmx(f!*$yCzkVlq6%Ay;9sqnKy}$#5 z>EZ_%&w70^SyFG(jR}TRl@iZZ{g)MV+sF&HUfAlHA;yx|LxZ#r=@YgttxI?76i`F zb$U0(@M0%qhd(Qe7PQj&uCwo7oo&kmdiVj~%5PhS$*kHoaBgR$dN>C@E8KNKH{h?? zhQBW!U6cl%n;-N;`mWnB+D~Bk#G?Elm6x2)zi(6&+$bjZtir;=|9mYDhtrtWd06}J zWdSum6$uH6({qjgwdru3|8F<}f7NimZ`sx#hQvHDhSW}3Kxj%1RHEy11bGH=v>AVXgif_k2yqIo100&XR&{NErVeipFs=!4n#6bC2g^jlmkC1>W`76SR_M`5}gfN}NQ zN5{{`X%C=V4Gi2i0T?U?OkANtcE%4GLN06mQC6mar(1~;c)N$tPt?Jr@n;!e z&0vQCeR0kpwoQ2b`t*$0nZY~u>mxMfesC@%F^t=hfByFqjWNH+=se`TYVA$0aPNwq zT@5Y=)=nK*nV7)0b6Ge)KAz?0TKT3Z+<*`pyoiy=&TcBW0}8%LcMYbBI5Mdwk!Oj)+0AI%ZR}lQxkE@`&8E`V+4-`Vl3_wbR zmJoM@F#l^s!b;MF3yhBJ9$v&c8F<1nio6 zb)KPTFE1K2y=qct-re$;mm2o|pfNy)Tce zd0pRLnTH*=xk83$FsC9km=zjmo$6rs{!+=d1kL^CN3tC3Zr!KN8SYBh*xkSs~1 zLF0SfYj1w%RKNe;&->TwoX@H4v{&EZdG6=Fulu^L8LlbR^bcReHi)N*vTL?=}`@jJ{ik&oqMZMq-4*sK02Mr&%tO#nws8UlLFkE9`py z;M?E#^XnT4A#iFv!Jd%CA?$BsiRs=4bLXCPg9yAK@!KQLc1lh$sgb+aH`?2}@k@%) z4(I}ntGLj6TqXayqlV~>$mG9`7wzvz$;?KR=#58(eM3-KLc*#eGCV>$+LV_+`&jf3r za=#+^H#v*<-dZF5IrLm>@#H5iQy>58BmKHNhQ#d28^6n^p}ZI6pw_nm;dM!F_=HRy zUxRdq>@AaJrjwl=!uQ=dogi6f0r%&Np$6G9CE1^KB1@O<+7JgqCmlU?u`6AhXACrl z{QmG?-+Xxu4xE6?&*%G#FYf0>qIm-2^mJZ^jLTRLYG|#n+)o8krQmI|3T#kA*tsF$ zOnv{w7i+3RJ3N8SYv(+#~MQtL3RDJukI}x4#<`6Z6k;%6a@>y%X2c zg^Z=Xh}1dgu==-4j>R>V=^fj18Yw{x6C3_{V)jhkEdM)q?+Wn=oj_}EV&;f1L3BpB z?wvE)T8agYbtFU{ 
zSYN9#u;uY#utJn^pH!=%#LdPL z&ri@JF}78;wNneooO`2mC+#~bYDvbtRAyw|-UCXeLpF1sA@Doi6P{BSsQpw^Q=3j5 zIZ%1%Me`%dAEhv~8V0`@TH;dq7FHshL=fMy5g7eaoO#T54JqvyDo1WC)1C@E)FPqt z3B7>@pW)w0&Nwz;;%`+@P$;k=Q35Ka6)V!dKcz5l;5aH0@{M0N|Ld!Kx>?>wP2R*z zP-E7CJnAJiYUxL~n_b5AEctOWUl|U!9`;~^o_pVivhVQW?#^aG>b6oan%fBdO;4_~5PM?LTy8<;bmqC$5ivaYQ*?%M}}IShueDv+Bu>`vId zbUokSf3QgNBh+W^%CbL&`Y|#(;#~*|YM8vr987t4;=ZT+IaRId-|~@Pcgk?rL}&Nk z)BX9E2;@0&AVzATHZ1mtiH^PkDbgxBO;X-7#D~;~3`R)+LIQO%KdAD@cwABlo1xYJ%`s&eV9C?JsLa)GY+1SMj?Bil% zrmPU{U>#;qVt`uQrA*{M)SvG{-Et5%vx$%B$^wxg^X0nFlUWa(%ik}Uu~$?{dq}R9 zcWx^~9Z!@Fe*|Sg*^Njj(hj3@%`S8PeEYA1_^t&LE0%Z)J_3ATgaqVLX*Q^WQ^}Ks z?z`zBwxIM0key*TxbMBg$S&mzOPxmle&_#w!$r&!5lX_DD-F`t%>#77jK?Q7=U`|1 zC%wmZEC&u`WJlZqI;y=K{R`oBUW4hUFLd`a~ zFDc1jp^61JilZX5K@E}n=1SG6#5GSvEKTZN<}&M*J52{l0sNadcKycEhuD zIN__a zg51~!r;zZu0COX_io)abNhFiGTRH*<6l`DJwB9v=-jN7k?9iOGKq?7XrR(e*?sF+- z@PS>Yyx|kr>=@@zzUUjc&q!oq*l9@iFVW+mwIQ-{rSgUju&FOmx#ItUNItN(QnRS` zTc+)WbvA4=P1#^ATla`N{IA02u7%4NJVjLqd0heDmQ!Hv8^M=iH4Im=e=p&ZSLApK z3er>wa=A`K7|j90d`(;?P_rC}1Ls;4J1^-F_9l$jWuVigA^e{#HP!c&4e7&;@WNIT z*%aw{>t-{o;{2=OS=k!UrQJb`hiZ><2r|NuqUDD@eApj+2V&Gc5kb!I8(9q#Z}>bJ zv_3roH_g}J5Fdtg$Pyec%7?RxT>9I}F%J6<5Lu`Rw_UA4;NICmO6OcKro5b1!gD4y z1a+ueNN=6+Ygjl?so*s$(L%9C?Dxv}=NqOd1k5vFu;76hBy;QaHKm2QAK~z(IM1#W z2rIcR9o%yt&m#v35bERiJ2!kgGj`r3yqXQE1B_JQ2DbD6eE0)Nbq+pZ?tVDARVMDb zu}op!<4>z%ClsM`^0F!?; zjtUW|xI9I~#0ngn2i@uuPe2e#Hpr%gw0fw`fb9>DG z@E0asvW_(ncqxB6G=0|92p-#*Rk>|j!jhmkGIsb3o)13++fF&(V~%4m{S=564vTMU zW8N>k0&S3(t>|kG271pKId;-+0cT%D!kWiRzHX*N@vEHR)H52F>q@pdd`4vu*s*Y_ zFH%s6LZGBOij03PK%O9NW)!h+MVj<=r!2*UVm7v!F@Rb=N*4N{3Z8gK9{-qbBs+B*n)kFaP0A zz!u-eG1s8C4)uV4>h(YFDvlDae@3n$_4V~OD6J&RzsEE$2ZIjet_KeR=NQGrCKptu z!9nUOw}=L~mfDJAbUWfi4fuvHDt@VHeys`#YkE5be3_(!WhkvbfbE!9u#w(qYo4a- z8-koRetH=&3j!f$7KJfHB{D2&Y#dz!FwQ(mE>&83)4#>uy{EVZ+5@iL|QgzqI z{auiZ{U_Jdck5xMz}cl?low7v`#6El2tvyFcdp3;f1Y1<|l3-f8tAtDD7;W&VG2umrNTgczrA73cMK{pC6jM5Ome ze2>Z6cA%!_)Z?b)tN1nR>^D84CX~2|BEhXK7}6o^a&h+oBP0s;SQ)Z*-voX=&VRpZ 
z%otUoviZ8k4g^aN^j2i-B}QFdc4_O9r(X;KG{&W{WWJqg9{=R54raXD1cH7DVqNfw zDvHHiG=N7d#%tU&#W|OczoHj;OT+DWg zaeUol&YC`5XrFX5usolSnlZ(A1Q1Fd6+ab)bG-f`QpCEO*g^cAFw85<#c8yc4ck3I zih8W4EVQO(pnSI%9~A?zMKm|PwP0xrUg>uIUe5l3E13g*&1T-((>Cel!rIL9X}pFB z^s!f9fh30E>DQ$`FH77%=v^S$Mi3>Tbs# z)M6VA(3#?Y*^@6ELOsBcq~LL!NA)*!aWFpE-bN!c`Of;TIA#7SbQ5dBS4Ap;Y~}VU zoImmoyP9h0lEd$zu*e~&IZCjlm;Cvt9ll-c!@srWzb|d?G}qCr?*z1fX1$k91mHSV z&C4oLh|Fu6Zf-rbZ%=QdvWuejuJZ8pTsr0&tAO{ zTjr`yJ1V}lIC|A%QjwVm(7OPFc6ZJ(p(h2#6K_9|EvKja*29PgYqm!2T}*N z{m@zsA;#TgLO>-v^^-DUKzQF(>3Z+@N0sWOD_43#$Q|DF7;FDJFKwKEo&hBS?w=fM zE(`BvL*B@ZA`JMuvj%vEI#@2387(^HU}Y*3DUOq{xyHL zmZUp&`n2bJ3!4VkVwqgUte<<#$yixQ;V2%7*6q`db189qTRO@pQ40!?=ynJt(JEKe z#>{plnklEP`(H>8J=oAhWb2X=h;TkELUc`gDsa&IurOToP+A|@`N2|#F{r$*@40Lk ztp+9AylG2zM|LX$R)XR7wvISgX!pqZEyi%!LPEaMF!&s#3%R zEd{egkUSowD4#z;{(a3(M-u)X!XMOQq1-uKjO_3u*1F%jQ#E+~Y#}r0I;f;>e1TIDHZ`>%O1%Rlkz@~i4GOc$A0S`Z z76*)JD<022HYQ7`a56B7b}F*>t9lA4E71-ps(3W$6B!J5wABxXa$7V;o-#L1yg-cN z16ufUT^VMo?qe5^Rx{bM@NP`kYryYTO>x%FIIsH|dPk|tqOCFIj_IE+Vl-B>Pd--*lhI>#! 
z@!!N7$iXo-2lt@7;TtsxK-hO{qg2r<9gp(*E`)Uj?U=~OyxH&7_dAg#hLy~|!6goZ zahE>s8f2N-5X;ZTyl#aJ-y*|2uJrfyXj&9P4D+BJiECbXO-+pt3ssa(XD}yY&C8Ie zlfSO$_t(9%BOY4s@1RG8I1?kWCfRX)foHx3*KOP zq$f}PTj$p1=isRTr#ALYiABEGp>tWL_oM8Rc`GIRDyBNBYiV_b^{jZ;*SW=cNvC&i zaD(@;tbwB|_RAQAgsu?R=Y%_cmEOUe*D$WryC>hMs4ZW5p-h8qyt2VD{T=IeJiKV9 z|1JGur*fQo^fSkkAv?Otc_e}0c}$a4HmwcJnopFY8|?Q!mU0;S(7Mc!mS8_k%XGVn>fH1jdj)8(93*JF!1 z%v(7u|4|b}CdCcbM16wW%vMaKN$SYiP=;0WspyC2^Ed6moDXq=8K>%(GAv-A&9{Hb zw9WyLNWOuVw27m0tTg0X^IviyYR$4pfqqD7?{a1Tg=^ee3%MuJi(mp-%IBMk+ zY@t5q#Roz7{MX%5j7upg-F^X^oBFSoeMvcmMatnLY`x?^E4dS%O+ zBhy`4`}3ur_6RqJq|vu5)g}G?!FXW59;tE9nj0AKQ+jwKgHiFaTnjSjb3Y%K1D&>V z|FmR7v2Yqg+Hc*0c}_oG^F52FQ@s3xN9?%&Jx*EbePXR+j>yRAp?htXhDC6In8nxI-ohPxrT%J#WVERDiN*X~F` z_qJL3)6vG#t1I9cJ7)@3y0(cGaxhP%_`>y8%Ba&wl%uK5LB0;thK08EsO{b0b++E5 z4=VesD0WNCzd>KW-noynKR$`lnH*eqKt~5iHZgyUrF6wVuz1=61Gzt16-V7Dm6*Q# zPh$<*1B^uf^FBXsRVmMN3|21ubKqHO+Q$sp<*7#ZE?qH^{KC`U%u}&UQQ3L<&s^+W zewmJb?_*n^$?SL-qVeh9rLr?u`suI#NHRIL&5{`nPH`4t$J0)B+H+U`vGYaCoF)YX z7isCyPBC3p_@}HY!rnzbKEDdtxH)s8Kc zHPE-A3YD5s_}DC%`c22V7sc*b2?k;Vp6Atnh-mKU3)b6dI`#HlLdSc_B$%_Q7p?*3 zN7%ag1F6z|dM|C{XQrbVB(Ne2{qsvBq731Y-T$49w%6%Y>L0wB@%&Nu|l!%a;0-2^Zb-Z|$!8sU~0$vJw9u$Gc(Q9($+O z)6u-~%E8K#_3ilK2<{JG{GT7*$Nk}F|MSD5^uw%=|MSD#Pp;kCb3FP%dmLL0P+HdQee`q9 zXhZOHqBYtMZHf4~rhU?{mHq3hy!nWP`N~-1H7Gxi(PUt5G5QsW%51V}CyLNg3-B}_ zZ5pEx&RKdJAJ6h%D^UEh8d*bhD1p*cN-UGq7YA_g&I zLslF%2e^NIDbb2>+Utxp43Aj}%NV0;TKae2`t2SjLkwmt%S!|oKL@hOKMAnUNrdi~ zZhJoFe*`f*u32Y5@V_;VoQvU+c|GYp$y8HdXJA#?jYNErW^vOlZlmz`c#Pjq!3Ycy z+r#igw0X5@?Baodd>D6gi)Ei1Hx8u}KQ)~BGFWDVtqGD)GY%*@OuOpRTbV2Hk;sy4 zlmC~GtOHPYd6CDug#=wZsQtyb1PsziL|1PEXm|sYq!20snc6|AKmz-8I`xBoau&X; zy45`xzhlgo!Gj%%MwKyvHZUgeKcNSB-{|{?zim*eeEQ+MbCwW`2uZMAP+|uGwJ6Xz zaNq#py=F%SyhD10Qg^U$*~G9wex{&ZjXuJ$q+zZeb77Lcg135U*NRjjdYr4P8Yx;4 zpwZCzH?@_MWs$yME3F7MKgTO|346{sFboq8?VC5eNei${Qcz+KX`LKihP-FyJ9rDX z+>`4Qt$j$sieQi&x=``@`}NWFgdycX6^i;!i9Huu-%6QCkK>L$F&QwC4oE_@n+>&L 
zUj8uY8C-($yr_i(Z;L=1y@CF|TD-_Bh^uv0&C_o0fh2ghHUWA`8Hdnmt8|SkI1fExL7=Ms#7BeC~RFbabi!UGn7y0lsPJ zsUDC;5%R^P=p=Y{J}g3nm+ZcIt+bs?ntWIg`U=C6Qaje3q~HQp4^#^)Ov(U)bjVI-&~Ib7=Ilh}^EY70sc?}oQ&U$DclMewGY1{)xtTrhn!nh^!$bc0 zVMNt}Zo*bsEcDfDLG<3GU5zm7-LqF8yH8o$5eSqQ$K*L08vEEf4-WrHRRo(oq-;6N z#Cu;(ObUVrsP7^j5g94c-<2zuv*52edz*U##|2gb$k@jY!D>LO=6Qh*xt~$8V$ug- zH2U>v;zJZvVw23Jne)b5Y8qUC{l4jd@Aw-gbI4kIb(280M<6?3fAR6<_I}aKG+*UN zGZpV~cQZAG*2hz#%`G>QGKgiVp=+fcChdq$2C!LW?M(a~vfPDS@?}d?N_7O2Ko2{E zA+M0ZqouMG>~u@=5Hic9S&Bp#kc^y5c3IvB_C|5Z;h5c5V#e*zqAL`VM6Q`-=I@U? zN&{M*EpEKCgrv8{h1bV5DSjQ2;xp_tZ(fiNRn^8Ulz)1ISX|kFpl0#D2AUb%VJZM7 z62{Ox);N|46v5{C@|3&GBPfY&KAaiLHve{R(s~iJuF@ZT)L)tQeRJ%mU_^@EZznUa z0uTjc1gM<-zd=T?@VEpp2trZ&2YU>TPgmfZiT|tv|72jsx0(Af8WcJpt3|lc(h8LY;Lb=r0&!0c{UZ6!JByqPG49Ml% z@)#puz7PspDSsc4@UGI(lPLkGoytfWmx2yKY=rY&t)`p?yha+|&FZn!Jyl|e2 zJUf3%4EaJf0a3}x#=h3%*eUMXRZC9_Y{nf@LQfwcl+Jo0XAJqE4`B^Ke8BA!obU-FRBkG&w!Y=VvsXf4_x3u}@ zP>W7a;rxDm;IPsGLl%DMYLxS(KY;Ln&Q-zB+7BJS4f+5%=r8hLxKJT8mx<>_?dEJO z@42bpLF^Ex{e2kB2zF;3u5^a;kO*;Y3;HYmAS0%3|E3PegG=$j@WLHn5+#W@P!d4z zb$N36Lfa@{Jo$FigMwI%D(^8yck%V})?t|+{uUr!GBV>8NCoP+JanyKO2F3!; z^FQGmc?CtshRT#9f-ok$g0{(ezECVEi&EWYIw;2+SdQi|dJVCs@JP5SCM>kYnHcGp zWV}-oLzFQ~BIxqwSjb(&Nbp0~iPt^6So)b(Oi zLdK8}0KRD>It(G(RkE7uU2wgqD^=Ih>@u){dQ~`yuV9Y2uqS*WL_j0M-|Zmi2tyX~ zjnG4MhskReI4k?B<;F0yhoQ){5*7I>dJ-b==zMc?y`4E7LPQ|s_M!n&=1+-4?>`$* z;oPA0c8c?0fpD4o{Kf%~DF4u$vh`Id9mE9F)eu7#D2H2$D2S9}jGR!wuRig$+}E5G zC>XT0g1FeiBNL~d(N6sV>TE8We}PHwAvvKQm~G|AC;r125Y%m2Soej_<9UE8l`rar zQyE&f8Xzgi=rxKgC_{384Dq3hlZ-u;YFhhW%~5dDhd^cYLi_m6vIL>lJU`As{-1?V!B2RX2+wK)D5 z`+Dj4)VE9v*YoAHRfz*EklE0=hY(?Ob18W~0-(^Wr8+v^Mxg z(cl5@Q8?}k+HRf*REs^o+=LrUpmTR)3-FKaKxh}~f?pKyPgh>Il=VhwLpB}s+y)8! 
zft$*+ifEqdXCUgdpGfW@{4Aab+9Z>x#lkLpXkc$|PhPVmbJTL_Yma#+%=kGB$1Ujt zK4I4wNt4!?{e5E8)gmjKVh>%%CYJY(g-^`xzm&R4r*eKX6kw8k=8|yubK?+bk4a<& zT_|&>SEy)c%mihFd7`TnaXffVHRAjF*nn}P(O~NH%Ehwy6Lz@z;zySSZ9P>^jah_% zBfz6e%$yiW@vFuj8=OGPhqdAe%LenV*h68IHd=;T==eMIN2a1F6<;`8H(>N#1^M@^ z@lbYqUZQKa`00b~p7VlissJGg5R`_De!1PG%}W1%cjizF|S`HON`nFZ% zI+F_Y&714agJ9{7i2T54ED{QE7^FF)T`PwnM%t|c_k=8-xR8A>XcX>b5a$#!apFYp z1u6*5j+Hi)qQS~Bip^PuwmoSN+-ec+QtFB-(ag{Li`1wq9(}&^zzFGEjPa>i4Bo3l z*J$K9iWg=aVrx@gCjGW{>o~ScUztlbag0t>;dazHlGd4h9XDkhPgJf(BgGxEMNw!+ zr~W*S;e|QKBhjWH`VF*!k@JScNruW;Fdu9RbQb>c1w~B3&;DdIL--!}7Gf#r$E0HR z%1rdmTq;-776k9KL8kYMBc)3cfaOh`i28-*c3Q_Md557vBy+&;fNw zr#n(@pLT*PDR-!~$Nv4s1=1lyZ1JjN5Lhfh0E0^uWY&A{g3`oW>< zS`IYS``7?nTOXZ7N7RF49wts;Xh{06{L`6rs-?!K9e1z}gNW`d0HJFOZD~=Leq#7z zEm4!wzB%?b%F(M-tYd}GrDh$G=U~+^An4f1vqas#!*k$}wuA3N5uk&Y?p!K=eQESI zl~Io`cl&QOIyg0K3H$KD68v{-+jjTG<`Sx53-_u?XeHj(`ZFPP;e_J5ei!Opg4+7a z1WioKVFmZ3;h27dPDhn#aL+B%!dn+l?WR53P`8Dy(O0g=}Zn}`S zGMGYYBIBn~^IZJmc&pu@{2s&sN3Wl@DQ!SbOL%yAqZJ5j*De|@=GkXr5jbi&#(pg& z@9%YW*R~W_$JA1(0~$eiRVC3gkvxCZ;V~d#3aQG>KC3S=S+FmEH6o=46=r6yN^qP` ztL}N@zvyP-RTToL@2*8J=uHu27D(}Hi7^KO$cb2swv(AX`l+?O zu*>s=v1fg+vwYl8+PSOw&Jm&FqYY`;q1iY#4n*y1k{EknN(w8p59&!cnHIwKUN<}u zK*}7nXbK!&fUP!#r2vQBFw{$lsMhYC(adM#k`>3gSY7g(5(5fP2qN&wEY!5N;-9e8 zYe6_xN{LOb6eF$~NqVL~B|v42yc39`>W`eJZfyrGNg67nI9l_?Q{shIt?=_|S77%N z8I<}t>JBszYbH9ZZb&Aj3t_Mdt=J}|hH;Yr>9F_jR#SUMffpVAJgv-@9D!OuxgvS| zYt2hUh}azH)($S1ajfu}9U#CSD_N4?=Yk6%b(5rwuTDLz1TXLJZ}lV5(WlrQ;vK+N zXAn1}7cg#|?f1$@Os&L)pe;Tki;-jESe9)>Z}cQ1I;wdO<0Mha*|RwZwSVC34JRy> zzO@l;jUby|q+<~0l`j-NcHUn@?ahB{OmO|K?G=bt0W4Wiz`2|@r#2|mO=CMjt~Fl- zMv(vzlECA|3iiC2I;BJwC*fq9_=%~b5QqB2Oz|TQIR4)}iAFkO?ujJhF6qIJO?75{ zdPs>NQ7EM6&N3fH=@WU8&=I)2+-Sc^{7u}9TozDn_EnYv@Rl@m*h?q$-}$uoqs1)! 
zAQK%MO$^ER$S18Wwm6k(9mhkP8&rgRHB zQ5h9_?2awfD)cbIK;`AwP&wUvx?uy=7CSJsT-;UW2|{j^y_a=r=^jJB+vPFp1VpV;YXP7We~J zt?v;kYf;@X*Y|x!5;0i`<6b)5xfG_}pUdL+e?_fH($R8b?>z^R;Wp}I;$-F;)uFn4 zp58>T9wjN^4P#mE!<rId~U?YEeZ@Jt5zTcro=;$ z`GQ}X@KeQ}@J)V0Uxlgs?au`JS2=$_CwTXaX1y5#;v*|)HMx9jgp7B@y05{9a?>Pe zEWO(G1R+`_V*+XWu7Zh~IJ2%`ves%sFm2;+*U4fIg5*1|52>>ZPA$;)Lvd{K)w?QF zi204w(BTcIO4T*90g5Hf^B(1<4>??;U1o&P^pzD}oTc`bUh|p6@C#jJc>#UG9LspzXP5L_+K0MyH0LlMx5@ z&fvX9>H(y_i%4GDaQc0|zp#<%F|@Yc_bOQE3 z?h%)sw~=5Ph9B90jYz0DH%_N_R4(-Z|4yQu__{5qd1v*u{6zY2pN%1MY6+<-_Ru0| zvb%|>h=1t_>qauVNknHfk@-aAPk>0<;ufFLq~qf&P#)h5D^!)W@h*K964q7gYF86m zAwK_#PO7a50qWYGjaUviR$LUG<%)#?G_|3Z1Mo;E>h0g%@A2YA?DpYKwJ`m!qwna1+!K#Vmka`*?>}DDIW(*XNN=Z zr_In3)j*|~r=|QfJ_9izFI>p*o3cyyKGY7k@!1W{U|8I2ze(dgpVrD?Clg)OtsM=l z9Ez#vvw1{6!sf31aBgy3YtwD1VdNRom@yYh&Y&?fUfqwNF7n06g7YXwzL^qe{220@ z8aSz6LT;7E!eDdbSO7WV4Kn%eLT*ur8Ag3OK^_!-sPHk9I@YGRd%vmyQJqAIX%Gpz;`OfU1bR0M{}hc@oq ze1`-ev!!i~13_#ibA&Wz8}OuB#B(J_0`FS@D5yEUu)8pG{FCT(_^*Uyqs+NBX7OBb z7Ae2WLBN$|9XiCFsS`0hy2Y)LlX-~&Nwgsx@cHa)}*8yF(zq86D!U?k32lH?USVyxy0C7Je; za->hSq@5zFnqn=}!n$IFOSIVdIRz#URhIzkRpy_-y8Q)@<&kga%{ZLi4Sf*k1hayl zw{4V`^O_&8R{{6eHEb0Adbd}5Sx9^1X)i{3@E6wU>2Dyax&d95yH~q;Pu{!IKL%f? 
zF>f43(in1^8+c?{*q^gW_Q?NqpE{`={{Y32BvN8Uk^yLwsoNYX@c~}zNhz4v5$yqXhzS!!rQ<;<`;#rTNwLhwp#V^~1(g3*@fFi`I!q@IzJU1+*m3-D zH9qxdu`Gz=&a@y7edANpiKCV-U#{QKBnJYP2mD3W6Zd9-Lvks1QnRhzi%^x^Us6JmxRhd#Hq7+j8y?30l>|WY;>@c#?`K)0EaNpn8GI zSfq5MSVO%Kx!H)aN#g7;LcOkg+Zp}W=UU2}uWMy4USN*Yb?3QQYF~~qSgboW`T=Uf zNOyjT!^pjp=Rc%9!#YVW1?y#}596w!<%x*BvI9Dx<2?xYd{kc3;-oeTGiBEolG{*m zupf^3Det5#^cn2#MjXX%1h+xX%m;d=8?r~OkP-|&GxxSE)5{o{KxtuF%k3#cU;Pt@ zXcWpVVS;@kq|JMXTCWy?HqW6Kl^O*hlII?hL1CEuT!%);EE4QoJl-Taj*tXW&IP_O z+d>fbL(~8@Xb-v}NU(n#mzs^{gu|6+d&j{GKJ+3Bg(KGw9EVgu`$KYNAhK&=3qf+s zb|SPxd#clDzgLT#4ZyXL@R05PEHP(!*+*{g0z(Nzq%`u|eH1dzC^nw^>*u>;(| zIYr(;Wm~S)Tgz!d8%}!A0ZY{q+|i4E>Xn%yrWSTrT~j9hC#Oqg+%S-f7bV&xu15av zqk9Cvw)-eUvis^O$8w@{dRa;aP|cgj0&u)9;H<0uS$HyOj3LQaNse8PU>hT2YHNxn zY75}P2Twn0lE(`UjBV(r6dw~FpJom;=R%LkqcKurh+e7{0QiZimn#0HB+A?GeSTL$ z2HIhRmxXT$PBG@+a;R0KUrWA2-5@^3a53QCltR?-)1$E2S3|65%R$KpaJUv7skn~Q z`9TO6i>MdfeiLCiTR}KzIu%MF@k*6`Nr4u99fpD41d_rw-a+%)4gyObEa$X5Z2uL!V z)##*L0UI(4J3y}qG8_b~gGMa$2Tfv8V&vL4K89qj3m4rgE;Ze2mVA=PDNTF z;Jv(mdPM9XN+@j(-fjG=b>#987AbOUYd+3|8l2p5KYH^#;Zbsk_bv!h>Lhnh#A*GB z*?^=^J@%N5Q!;TUVl%N`>>D>AAJak+XpnPp^Dw!W`YU9E%=W+LIYj2L+i~YFbd=9t z6XOb)P&+Ym5e8&P!>RbvvzM&=lGvzQsz_z5!}N$g)&`CM%k8w zWEnq(N|TP|hF$&WmRddF(Yg#uqN#oWP_%}$hD(1gvXxy@GHPf>r!tx#&^vctw9HkQ zUxHJ1|94gp0bIZW)F*w@+1L5)T`f2OFCAVGw}j|0WHDi0iP`Q(MiN+k#7xR@po$cE zp>)%&^C6V(ayA~?!p?LVV1C?B;kxiy6WJQRS$#WVnB+~22<8V-yB0dUAoCIkUPM24 zL#xKp=HLw+4$s|)BlRFiJoyeUXcfx!H7Xl@e0o83pVz7gf{~<169;|?`m%+NI8-Ij zGGVEXHEeoV>+u=|t6MuVQ;I;h?jKF@A6I(r^e{lR`j2$b^n-Hs92&!EVCf8FcMufW z-01`k=vt6LU(}C>`Aq~w+s#`G7?&mm)PV1KWwSpvwBFIbN4F3s>(e-a`{x*FIVo)2 zq=r$$0%17g4#g^?>v*pZ3)jx(gfTia7qR8-;FEs}K6*d>E!@z$V(*^N0-if5o$OK_h}f#^ME+e7k4apf6V zoDe)?sm4}WY5~(3wGIsjVmu|TwgA>iHI@Z~jc{XIywqOLrRC~=LcG}aUvs9o+F`Ik z7M4{|V*iUZMOo^a`ZEA$GHpHoT(mul`1oSg^UQ}jQ!-R|300x9o3mQs`%8$8xHgl2 zfSmzO!m;E}#0cvJCgVmhg~b!4$0wRVJ!{;K;3 z8F}HN`y#UN6avOvdxzL-mU%@C^%QR@4gqgge_3Wft5U{GK8Emwmf~G2)jxL;=l$Xe z>3d!cXDq>p38u!{8~L;XBrW|jP$U}^#xIIA8|HA}Cyzl&_Ld(13cO?{4kT-G_Tjb= 
zATRNCSh7izoSkTz09SvHu2*K^TRPQVqq!65ffO@hlP*DX7d*&(glp|Z9cvscb4a#h z+YIP4i`G50Ex1BjTu~0~@6$bX8T&Di{5ARkh`^LEgSF;MnG<7 zWW40Y)}v2&#uBc7e;x=Jy~4Z~9Dyb*?LFOcPOx`AwO{KbBqp0*o~NT>yLk?k@5;71BiEkLVnM)5M(F(A*Wjv z$O2_G!W^-xFEsd_|2zlP^!17PRm9yOJhTSxAyKcayh7tCU3!OicE22(^G#1WPKIq@ z1co8QZq&f^WvvH?Rr&pWs;Y$;BIETP(9(XhDpsf~M$F5AgvrOhRY}{5j&D~?-fRxc z8$6lhDjgulZ+;r<-lNljaEN<<8zZ@8sub1PAQWt<0b$`X@)gCGYo$-Fok`k6vWbv6 zx#lO3XYPCpz;B{XG^RVA&h;>mhUkp_@yMA_`-CJRk(eY-(9XC;T7ttwvN>I(A--5j zCUrA1_CTUSRkS8rSbrMrynv6@N6Y!AXzA&%^@ZPV+$Tjw_tL_}FA( zsiJj2Rm4Ubu3@>1{2Rdk+ePkEW|wrO5Niz{sb|BkAz+M|$wJ1}_Wzj2_-ke4oe?r@ z1lW9ux}z<~bIYzs&V@089#*6(5}QCzeQ3kT-v&b=7jb!2cYSK;#;lM^9VCjJbG$NBSaC==A?J0KMk&n?Jr*tRd8cddary$@O413iG1%&$31tCT2wb)ErV(mp>tjFWF8}uKD+t z+r&INRd;r^bPErTDLF74?MRA={Z~s>i z2R}rEfHy))kOr0wT3sE!qB} zfp&fPJ413-u3 zfSxWT`gF{5JKY360Q}CR#w`$smC-UCAeGJ_CJu#blx&V$k>4K^Z{St6RoVz`WDBBc zIM0E6drfVH!*6JHP{)N9O|4D7^eN&Oo0&E0u!IsC$uMJ0;krdq+o=fah1Z-rISo`911Cr<&s@jOHH=fL-iy!;Yj2uC6kNfYQT2OeXrX4WFs=KnYT_U2-RlXi;t%m z%x6RqBV)f*7V_eV@fi&HrlilweT#0ajA}GX$Plxh?Ot>$Bf|7Q9~X1M_<)@wj5K`D z7gHONJwfD>P^%GKb6e09&c&|fC#87`c`HM|R_q!eB!OlkMu4kQ62Z$-ID6cS5lMsu zUKjW!zH?}YSEvngj1~ZK5j^q=^V!7mf?A;QMpY*Up}IJRfPf%K)dGZy5x3<*2LG<7 z&zmp37AWG6v;d@|{7-K%U6N+Znsv|OJ&Nz+FOe$nqyEU!qZ zPW@*Rp0l#Mzh~^4xAUb)-0h5C(c#zEgxAry4JwHmlUFxV2X-!qOK6z67gMHVjaO>9 zNBM&JO`s?_IzYB8)VpRF@AF|z`1J|Dzn0Jgd5I+MlcAHU-q02pdSI{5kDtc%`lhxa zHH7H}(wN|QnQCkbHx{9a@#zcvhJq0c`5C}dXAyP0|7|r;Pyw=d;J?H|TOeE6xz#%` zj)1CK6j?8{nsGoY^=y)zQEzwr_x1hyrd*z7{my|^4MpDdiw74kP^SZdSX=k{2=RuB zDqpL)G)9v?U?>2@a22CiC(icz9)oD!V5x74dLB|}iyQ|CGK`?O&DWO8TRSvsWvGm% zFiOX@Q6Cxz_P_4W_;ta*zeayX=AEzA1FXCT z+<~h|WYz4QP;v;!0d=Md{l%%>2sX8_aEY;<965eyBF6~5LZ01`1jKdY2$>3O=rmX+ z0>1of$G_x96Ozg&V&OA$`w@&h@gz0PPe+<%OUnG`?)c}LwD#K}K4f&JH)o^0daLCo zTFh%nx>F9S9BGFWL224XH4zd|f}+Xrf@ojV6C*T=*ty6K4XgTrbS9 z9mdDr@(GU?W$`)PXjQ3Nj7%UE^1n?*0OvYjOOHCK4|(Ja*?w{!NAk7Uyb^TXHK(%D-dpzu$Dpd1lBiy%Dcv zCLL*$ZO6%aovu^%L&7jj1ulZ*EepX_S5K1PwH-b;Sp;hu^__9wr#kd^GYpM7K;p^8BiLuu_t#`W1Ecga?tVi- 
zR#$#CTn*&w(bwOTCn)mNJKG&b3JiCUOIp%zFYW{b$^*ozvYkf^Ylnd{LocSIfh#$6 zt0Y`Y&vuU5IkZq_h+MmKr*+YO5E|zZ!*rA2UxbH&5u*Ppm|-$Z+?&3$C|G6<8|@tJ zSGgZiI=Q3e&17W)WFH+gHo<0?u&Oej>}a(d0UVI|$Y{r9=${*W>qoUW%W~3_oMD_( z<LY-1ui=`UOOTavn=C=`_nFguXvr+!2T$5%D`*cPh2To?pgxWc2{;^Y_2^qnl z&!UM&mcZof-uz<>DeEDx7tuHppe3ILVMx0}WM!I^%|Cs1Rgj0_Gqu z#!sxZ^0yA|8gfQ+IslEXq?*Lmg8XTzXN^j!HOU9KtKzxKULp0;wQg%_Q0qV6b)>Zx zV1Xww@PYG{MN?%(qC7!a`x0EjT6Ax3!K%+DI59eJ=4t%khZC**iSu+;rMR(ZbS+5&17jNyHH}c@H@*PrN~UOxn-68ppGv1YBzM3iiu5iW$|EhD`O!QGUjxN06UrP9@m9)1dy3KQY<-FQn@C8q}8D zdKx2UP7QbLAy&t%S+jCcNgMocpSySJoZ~>14)HD%gD*zwBaC359}Mv^cSUO2Tbkda zr)Jayf>I>A9F^_TlLOe;GXUPivCA3i!(^?|AE4&U*CN9zB0j4dl;K6>0D^oT%K`{F z386~Nb5GN8f}e3i&tm}&i{g+C6LY5)9}P-+9t$ipRMBX}aKkJj&yrg!a3KFgSpY9$ z@f@@|ZzNLxh=QYUZo~=2d2cNLKBOR`qn-xn-faF`e@LrEcJfB54h^ZHS4b_MCrR zQS|(YsF95h*z;IBpfIRBG#XcH%aO{ASl6}|q!8F0xGr&T?c#owS7ik`XMaDQFb-O&_~Nc>qvcWODoW#ywzPXGX2nqK1j zX&iY<4i2E9{M<*-0lHu=AT_a#05#HQ{IhAOCkz-|J6t-cpNO0}mEb_CMkW4pICA$d z{PB)lAY6VKo(1Sdy-mr;UVdBFByBtSbK?e8V`hREs7V{lbjWaUc)@aM4@2=HJtj~1GM zSF!YQc+A`QP^ZnfPXz!t9c5Mc74_^LZnITyGniN{;SD9Y&{%YY*9adQ9XW|dOW-D; zIyn+IK+13H&wglLUWygeR(uS6aNrO}?!Jw_twmxO?lMBS`wUHofy3auxMU30@D*yE zaL-oSX{70*c)dOcJ?UB03`k|s=w>Qf17G@8HeFiekvH3w2CioTNz5!<{m-i=8-oS9 z49PtV5VjPwPm{J0fWZ+;b%;&OJfo>_@USeiV(Iv7#ZNw#m&uBbR%662BFe5jHh9%V zijr>$`SOxf%a%h%&Fe=d5XV1JiVUDZ>|izIci$$~F%U^8F~8`rsF{cM*fih;MW=ee zX~;&4SC4y&O9w5LSHE1NtDFw#yCtBycgdbl8!?ZpmOxXqiNZIjndnYE!TNMVtNo^) z<>gj~Scex>@Pl2V>#yN|-f*!Ckn(toQ!@syJzG^pCpweKx0U$BBoF3dV-V@x5ODC#J6W#2Eq3XY zr4=a@fzfNIbwQ_vyfUe^9o{qeLfAWq%>gGcQ*LHtv^-xw&ls$N(qI8j2u)fr`d$G7 z0*%>-KEC|z&+#gUM)$pum|aSy*b zW%6^pcg(mC`5bb88MOWC4|$@)omN!~AJmXHsXS_$Hmn`(5TyI0v>^)~i)GP^D+I|$1q-q1trI1f)XcP)s|(aoa`(j0CF7n% zFGhGAAcp8e9o*1|7`qF*ko>K5Y9XbK{~zSIN(LM=5EzPnR_#($ES{Bxtdn%HneRZb z&R_?wd3R5}`B=s3d~*Atw!Q#y#Ywr}7?4yqdN3Run#{k-P@ddzi8LOo?g%a|jqRe& z;g3#kKL za^1>dH$=2Q+jDUm=__xYP4HS3OYaD75(pt@5Mt<9H-YAU1vN+4+cu%`Sq0Mz#;Cl-yI6qr>qdUU zSOhNUFPZVTs$KsOlhkL&VS}sYaQDcG4z!DCl0jIq!sb+X5CmD9oY7L4!Umtv$C`FS 
z&MM~aO`c@gp_7O29Wc-;&;wG{_cn_koE>WRzGT6%KBKhYVM!du+xiiQc4q zS<+>t_gCf3@znK{P;DWo23|*5-MH~D=%YSb3{8sw(u_UTJ|x7iA7c>9ciy)*IL+P> zLi8Glo;+w&$y1^K-UbX6&R4`060k^w9^Qjzc@S-hyMJ%PGy##$4r=W=IPMfWV^)v# z;`P--h+%=n!&j7zccrs4lr0+FINowM$zrId8VKVaIx!_)n3WpdQ+HAJX?lyNT#LNZ z0}=pP5O_VnU-{rbS_?8~{Bu8Uk#k8`8f3`vKSgV=Yt={{2hujtH zhms%f2A^DA8emQO|9M^X9OblTM^%@=p+DLGGzUeua#2*PPtXoBluB(!N~EO_sxj47 zeLi`>@_&~w&bx-i$?i=@S}D_y&2g0|vP#nRJ`{N@FM0JiAcxOFrRaC@fIfyhQ48N0 z$BC9?m-)-;5Ww^6AR-|NT--;;!GSWt;gY&`aF%~RdQ}m+_GBIecb0_kI$fApUk#RV zyzzJ;^$yz=4JI~fd$TFYEACf)P9wi zBKBrt(Zy)QkQRa@C?9`_^8F4JLVRV>UpwoHXdVPkq8_dvmHyx)#nqn;;VGu+YNoQ# zWnag;LQENO2`NbIgXeMr@YRH=k3LZSH#wYXcaY542hNZ$9si`@K52!Bqoe*-WoXw3dT8retvlN@VQoyuh^#y?YJCt~Dg6>F((8FGw=NJrlo4St*E z=KvJ=J$3<`%{PNrAO*p9*~@B?B7hsu-^qa&yRm3O=$8VQ?TbvwZ~n+ge+^zsv7Tc% zHENL#cu?yYPx?Upb1vNiZ)P5G1-ZgJ0!u;G)|fb5x4j7HPMqD~;lJh=umN%~|LwZu zcXRa(UlX>Ffxv`%MB!bIGb^tx*h2E$QvO;d*?n{rpcs{~pREQ~BLBT6R#lWsqoes{;N(as*ppBo~ z?A-ZSVq6Rmpho|}>$5~-1}}@K_|)oaZadIt2c5^=28C@xsdE^tTMXt1h0XrRO&)WZ zLX^yOn?~P5Y_Xv^Dl`uPNx+QrgK4@Yid09Sf{kx`h~-SR&IEB`wlZj{o`4!PMf}2G zYCQVZGv_1 zHZ-i3%~Xk@RS6x5Bu$)L2R2w(WS(uquGX_qXXFwp$VZg2_d|Rz)v}+Q8A-LJQ97zB zU@J$iK40J7_(m8c5uf&($O3g0LfOxIqjxHigP=x|E7DBBVBvxX2;ym=R{(1Zv@7lY%va#_Eyb1jxh{l8BJN_}j zW*GG9|3TMblSs9j*q$W&QO1}Aami(ALq31rMcA`)UOSC7$Myh-zxarEMpJ)ayvAA6Wr{!0KyjpVc^tBGq8`^eeVgys^Lg*qd)2Rt1FlZ6XE-5$j zm*1n1In*&tr*1xuPI$z}-6pkuqVX=|` zNB&J6__LiEm|apFt3p%csE9~|29eM6CzN`#2y+V04TcmVr%g_JfXHw(elI+Ca8zW3 zCgAc$C>XvX`54-G`=QG^&H514IwA_rOF`ZlWSBxE{le%VFquQ&YGR(>cH!xPE?ap;oa1jjwr=0n5xcQ#JBcm8*-vK18s5x-uFMm zwdZ5a249r1TJ}+s+KDvd(x?x7*sDZ*R-Y%Q%V48C)T*}`vNA-L@n}hYhFVMUS-jWL zyXxoSHF&+mR$hyK#G^$0;Q=s zfNcax_v8LIR~yV&>P<|dxJO^2&D+V3A_qK)chQnkL-yT;MW}AbLXLPGrv*G79Qv*< zg=9ttz^EQb*3}(anwd$iG3p~z*;N`c)_Y zkhLMD9$>sgBYHSFdEFmq`jJ@r_b&A`cO{W5z^2$|sXOodw42>v+n z8X-byR`5RI;|zsQ8EnB32@@diGohy+en6HpAyX{IPjo|EG>;ZR62#ApwOP}?r@I}T zJ{ojG-00NYV8{i{X0wT`NvnmNYmFXiRP5B zYYP$4q$nzBA6leb`xa?IHAqpmX(vnBN-5HYd#|k!2`$JHEp|#-vy_l*$?x&JGtFn} 
zZNBIKJLiACzwhsyU+2uphQMb!=Nv?l$WhVJiHC^rwY!Ehn-#Pm_u)sGa&tF4UScCjo2O=s zXj-+J4)C!qIg?%DEqRa$rANGQE-CQseyYW$CMIPt@kb(xTZ|?fcfyZp@z?{Ap17x3 z1TRek&5|Tas=RfKeUMqSlC|7Aj2_L*Z798{``8r%Hx6V=oO^_$ae=7cF%yXy zx=Dk1ML*PpRz>M~z8Dcw^U=pm$o{MC>8f7{&^3*up0vB!q)R!Fu=B<%i#Ix0GA7p_ zM(0e#_=OOpsFw`hsqo4U0?61#yCl;TBdZNZ?~`cM*x8?@&mK#nuEMltl2tugVOs|y zw#T|MOR5;RNO$?jbm=qH6#*nnd(1IuzZRJ>q3R?JxZz>6?WCq#Scx8A{ z{eG{4xXL#%=eT}_on(tiysOX`|Hjmv7b|+FIw_zLO*fNJ?6IT|dxBy#DLC%rwqm>R zb(e29AWb+2!1c z7Ipp74-pBy0MLuj36?&Fy9yNNCXb@ za{Rh^9XdcYbas;R2MJ|&^e~dSm}IY6@i$L+fj}9b&XHGRX~hjt(}yOs*SoOd&cr4( z1ygSUnaa-mzE+f~n2XNdn39EkG=qCpIYzb|>o25OQN>XYV%**{VZzAjnjX4XN@4`ZX%+8dbATsyeFZ(KcEvv_ z*!@sM%EcQakaw@kyR>8xFi05cVKx}{dWde1zu^Ktxd`Nv@UvgO|BNv+bQkpfkTcP> zPqH1GI`y%B71G3xgm&s;9lt!4C|Qu=&Z}5T(Z;q*^1eZ~Fmd}5UultoXbp%N`MM+% zPsf_0K~v5TrhA}%`7Ci?)mz|}hUUmxkRKDeT)r>)>n)zNG6jnz1BMG z+v_^6qosAbI2B2@gA|Kp%T4Lht-CkHyk-_lrT~6GzVWHR{MPZhkc_A1oBNK7MD$Vg5XHBloZ(xU3U` zO=vLqO`|a(ThQFU-VhKRGVNTb^{r`?+t^i@3l^6rq9_#%8pBFIVT}+v>Cn8)%ylg7 z%Ns*f?p6MJ+2h12b`hcPf!5GFd@lPSOB<8zI2aZe83X*KF)T)&ym>X*z|){rujCCSS>%Edz0HI;R=Nq+Wy-B~j$H~# z?=b0^CjaNeKV+HS1eLjes zF3`9<%PFgR$>ge7@y{Y|U2Ds_?+R8uJmmdB_>Iy7SeNukR>k_a!SF<=r%y5@#3`|* zHRN-@Uz%%sm&Slk*VH47J`Ej=s5 zSEc#TiVdm1VZiZ1zmG(hMvsv8-wn!hGEds^iiytK(hI&x;&%E)IEBqom`)OX=-j?o zsml;I=ZjB6dwFNC|FyLT(LCPbyv+z}H!lC_mR7G~pN5e>ep%T`4Tsu=+gwvlcK&&% zTfO2sM2q`uLlm4`+J)cTy4#9dyd|?ivegUU=*e!teH2JF`1CTaHSpe|_IuBgd-@Dr z*S+iT<3EkBKa#RYf674r<+Q7xzCI>jpH^L>5+b2CewWryGW0SEKf_9fgw(9+oViKS zHwFw^NPT>Zxt|ax%TMzIi;IU)C(Dw#SSw~Ixk48fEH9LYMMhc~4cu9xmUHz$G8me1 zCp0c$gE_tnaas|xL$D^2XtHpkdDG)Q%? 
zXuyGI>sWi%(RDtlsE~_r|8gaX2ro>kJ>!XF6mO`O|Bxp5_e_HF%McGT&}StUSiYXg zx0hnM37rw`K4)-%6cneRlT?I?;C6ZI&={$$I{`70N$d9tu7HvWR1+pOhU5k&6CEnV zFGTI}W*kL!<#s4xVlg{kr>2%|>72_gAE4PQ4vcjw2nowbV_{!k;FBxXd#Oc_TPLTe zNKuvo=d|XpJuS_xau7P}#_10VFw|pKBNH@RWe3S1B7X(MzQS3?*C{Hh!RUU&(S)21 z%p$TNED_lGDZ7&N6 z13~n?XpPf*E0&w}fUh~^OVy~dUj;+jJ|~8hNZY;V<=Cx$=Zw(6%ddeL_%cWD$C-A= z8W0yQ&sw_9;g@FTqyj+;r~J-WQbHjwoyD;I>7T3gdPNnNRK+@oy`dn1*g~r@)x1sQ zbl2KR{O@&pXEY$x@W0opEAVB>dhCF7TWF7&p}B1I$e%}lEK3FLG-w$wXsn0d%J2(wYv zTqT%N=Arf}qmzD4S6>#B9n^0ih_-0O(d&lipA$WA2jC#!paD>;S?3-AkG4&Y{< zuEvpQ#YezIV#Xy(4VFyPBQ*X5zg!5*^zr(E+Vy8KG1uuyC<;i~r4v7p{OUKY<9%8x ze+Ao_LIvY{T)o|z!t)QywtD3>IL=$pnroyHP5zm?DQ_kHD^#Th23sJo zW^FWND<0*>{KMWTmoH_Bt2v# zayZu1?%ND~dKwuB7rBI>pR*JLvOp15=mkCv5FKOfii7B%>+7USAl>l&{U$hmJGi7* ze6yeAoe7yJI5%F-YWDbs_3|N5BPxgs2c5f9g$8i+=NeGgmG&*%P zUf^Jb;|?os9X7me`Q4>dWs>Nr)pQ+zXxmkOk$#W5sQ;6&bO96z6uaxJanKd_BIzED z==RN@b`GW!Bt^?h*oWlR297|Z?hj>!9{~;`3q8>eG@RQl`&>U>60gr%Vn$({mrJ){0s1Sb~6~R5oUimk> z>qP7CjB{$MkV}f91^m{s4@K7mKHrQ4|0xZe%3?{IqS@PcIGl72Dn=6z>FEqDNNZp*1l=-*nhrS*-!zZTMJ_0I`~%*d*Kr;_hotFt+g_qQNfWg^ zT^c<;AKpCnWwP%~KdW#3Pu<4mQi~X$+CY}OjcyKfscsDH?L9VO)2U%UFP@HBjrP@! 
z;jHjIzQUK#R=D08B#K24|6o-G3TkS*LI)`^v;Mbtx8F#X6~-WjD~$a~T`ni?DQCOP zn2269++UnNNKLW>(Zr_QI9{&V59fvR%qnULBlsA~oUtmZT7tuyO4~le@beJox*_&p zL1RT?>)KdlB}kW%u4 zBIx!+9Qaw(-vdKBLCc>L{`qC?a}3@M-hYWR4W~BZr9YN5%cB`R6)jce34i$YQoFS$ z0rXy)YZohph`&5vUPI_HT{Nrv!!Z5R^z-dNz)Mx15Z6KZx?VGd8solii|Ng`X6RBM zmqfy2Atgs;i*y84{gA84ggwd>%zKY^&bnHiCt+&=d}`y#i)@MeIu>s>Q>&TUg>>vo zcin-nff*8ssuPwLFM!E?oGAWTdQ_dhoeJj%vm!`pQQS%?IPMVyranZx=a@UAG?ae3 zg{&s#W80C|cCy43EtEmMpOzlNat%Y)WwIk8j5l^}6;2KuIq?gl*`Ub1@f>NQi38X2bA1Sd43d zrs0iBSATGa1lZez4`E1V29bi?^{t?EF0&$`&HVM>XKy6nyztSCGJ=T^!h{U!sZ`u+ z;?%YV%vSYsa zm;l$KeU`ulj}$;f(FihW;RXaqx-;Es$hDb0b3@x>)Qn#dcdmuc4Oi;T?=1VjZ2>mTDL<#kN2!+2y5a> zWR_PkwA{UElkdD?0Nip^2?2QrP!Zy@az?C@SngNlCp*eZ85M0ciBJ+grZMC4M&FT; zXCGLs0(7P4wyhi$H5w>B1L5NFIbz37+ZjBbwslg6p1mAFNr4xr-y^5>ih^fs;5C`a z`WQUM6;tW-@lbZ>m^ajDr&y!g3Jm_HzHblIm@n#IcveWxUm$>n2ZyPPYD|)Fx2G7M zGd4o~Q^1IA=noS}(41hA&>EFN?sT`fjTpK#Odv^9Itz=-?pn7{S5?z6I3(|u_{Szv zMn{e?&uY46nW1ke7gcecg`AeL@9ay#IzfDKGm~Nha0)Kk8yu@*-&9BdG28M5Nnc1a zVf(Tj0O2XUV+^fh=I3Y#3e{pG+mM1wV&DdObd|SXf5}Mg2{RJ>qON(plbn{3 z?ItHoZKmHG)$~6? 
z7s9A5dJVyePNTJ0A}^{+etM^;@D!LF9F~u0bcAEh9lWOLDB%L$&l{6~aR7UEBY7Xy zeM}Ya;Ny5O5+JyRvye-`ZWW6z$p{m`kTebF9_A^k<@5Ne7U82vOriLYtr5*5lo~sU z`wv%~%bn-RH9_(o_Y&kXB8he!A8@B-=EP9d~+jyk*ny#H=f|U^w%+Uwu7N?MU zKdHi!C=3N0j7ath=Ch4=m0ZYK8S5KD*lTD?hwIzT=py5`SZx5n383ILw-2M!X=sW5 zR&aL%%>YUfcFmEBMA?QQ3M=H}QE*FMP;f`5IMS9Oddd1ChI!#@&jtCXDBJ&noxS5H zZ~XR{8c#YRI?3XeWvDhGd3{_zo5za<;_T%roV->Lc`ptED)CTg$i(vy zrJ?yJr^ojIxZ1&$@G;t0PB<6;t(zd@h2dT`qOc>W#kPUE#okp(i+$UEJSAg{cx#uY z;%p~-AKu4G#&t=V*qrE}n7$8dn+9)uMVc*J;`B|mEO>&U0hKO)NhzQ*g{7_NPwMG`D;i@;7}z}ePO_Uyod_{oRJ9m%irGn zQhBT~-?JUdhfXyd*jl6@K$3;_pHfLBk|crO++$oV+NWa?A#1{bg@iOcaQ!b{hkN|6 zG2oyf+HZVB6%8(S+HDwmtXSb(dwqcL#bd@VF|gzd*jcO87Q>CY22n6uS7T_Nr9OC# zd}(A_4kSXy&6>T<7Fx2uB{nGo9{=JpV+Yh~YkFs09ZjuMzSasEg3-X>zNUpi*xZR$ zPh>R@7-G#35Wy3rO5MWG(j*LyCP#e=h)S)GNHl=}Vw)OP?S-7QOQwA4M&xn9&d$R=D%S_)r6=&rsb&H_gIyGUq>CjPn@d?e*kEnJFrMi`<1 z9SaMmONkXL?A3eA*89LF1vXer?k0I($Dm)5=m6t(0u5AnK|&*+N9`OEc)xbV#$tv| zQX$R`lB+bs7R>J{W!#X@*lqEfadKAUl6w_{-x4;3>}jcdh6XSY#iw_nJRv@4|FS9s z+^u_Q(lag$X~fMdI@A!o3bFzqZ%HIX+BuuWX<2#~ zC{1JFINaI^y+Z0d=w)s7%D;k23noaS8QhvZg^2hL z7=4rO#|Bq#(o2vv&~CJ$)&l9{*U$+?2xp5Z#z*IFVoh$ojSasVXCsxnyVKbMrvi={ z+hh2C0Ks>m>ZzOxJVnx;QApjNUDHKguf_ZsXfj$0QpUjNI-ounw#N3F8p4Rb2+Ph;6+ozIF4b6^U9Q-yt#budZh4PfW`h!1^D9XyL;ZeFyM1tqv z7CiV#w{jk!9E?xGO3z%wr2hTS&@#}n*;b*chwd&3JU_x+sRmZdgU0IE%jQt~CwxBv z8)zg%DOf3n9Pk}BlK@p*Y}86 znA1U6-Da-RZEWhrorPP;ZEBONi9)yZxtvTHAzI}&jn~gnibH9m(mUo`Zyvv9lCnqe z-PcFPJpaHEKQCS88ye|x%p^c0tzJ85I=8k!ZDcXhIA6cMe^kexU0Ys`3NEzo?{4c7 zo%>+#XwLaix9LZYm^oO9jtW@YRB$>^{09_TjmlZkaedp-8cxpmc}dGS#Fo>fr3cXV_@bgB&O5(($BJxIsUJk?(SbCtf4tCN zU-oD_s&3u7m4u`@Lv{9KM3cDwFUvk| z{!H>eVEH8orscR4kUE)3_k$_?V~$`vq(YN7U;iSsd;%fH#I*T>$8%ePv%8_5Sgg-X z5mm2@G`&tSgJ&EvR!d@hY1Bl*J+lvI@Ps6 z+v_&>imL~BZIyV#L5NE>YT!ji=B$qK&24eAQm=PKyS&`qnECdG!(+CzlXcC?t?n&# z7wB+1lDF(HbOxtm$5CA}1)mF5;prXB+k0EwEx;Gd>qNnjoV5`oOGjwS`}OEo)|4bi zS&WnEIfPmL2&VHWYx;VYj0CRxu~Oi%7;>XOlAney@uS19Ixe$2cvjcG?M2X)hu#}y zdc77oZT=D!;0}-E(_^G=)xt%K8Y&y8!?2=+@ex^P#-*!gV+z;d-32`VYpYU>xH`Pk 
z08}4g!u&{XxTS#TOqn!kl2TS;xh3aF^c7b~8Dur&1#SJHzMi%~ot(~G^IK-PvDXga5{a|58)wj%RK}d2#Tvr(CrPkYhm9LI;w8b| ztrY=o)A9ZwpKdyy0M4Fzk?6=s>2ASt59?8;jNF&C8}~pB8|~RJU24xRk#l1P%xf#~ zM7+Gbf^zHY>!l=QOSW2CG4GG!#-X~*n{Z8?#e80FT7zB68R;Q&nJ=(KVvs9RjU3J( zRkK*X1q_yLLm{^5}}VG8AgUIR)&Ndd-aniPdIWW zKqnM7$bM3e8{|fZri?TVICXpFW~)jjmX;QWl(& zstLdo$ppZ&q$cB*5L7VeY0-rX7cY)rZXCC`nQj4b;tqq{vV7b!5ZYqP(?-NH?wOnI zEc{;JeU5_pAUnSm zMU4CZzTfx}@OoI-gKWl|vvQ8GwB2{@-7EJ=fboG?bl9%OK_2vhZR?1}_C&im%pZyv zkIN^vUhFyMKc4Z(*?)uiu5JVW8<+q8j>iAqbDVv1i3lG0Cgu^gxfpQ{yCJxqDa^`} zP(%7d-7x*F%m?nj5aHDtGqS-?9O=`{P8*ArbNYX1q7wPyQq)qS*J?J=u14Z`tEjMW zHRQamCL}15mP2P}CrNSbWT?fkSc-wjkcV)^e=T}ukG339HHjx;h9NOEO)L_UlatNn zF`7rLx2k{Md|{C?=ysT+m&Z3t1d&0$1m^DLU5tqSDf-Bs{a@F8Cxhv&4FSb9ytzJ64jT@vGEh>{6{pFuPue59O1EreF$VbLeXef7 zWBk18Zh>_5`!F%TS}5^HB&I~$6KJ3Xfa-z@!Y*Q(7zACW-EBYt?}*wbdn#aFdw`cI zsW$RKZJ0o8Nit%-Y~%QJGj?d?jT^IX6%`k+b#`{nU(i8LW*G3+fO&kHz{f!`Y{g?m z`olSO-Ehz(?;LN!*rL}E?96ycsZ{|;Kt&1&rb~=C;$V^@eiq=b+3fioaoeH`f!ny` zkdEel$Xv7vtWs{HG0> zJDTV_&)^}ESAqb^d+#?qj(h=`X}p{KEVnf)v~+Z`Kry)FZpfRHBqx+{0rb+jx-sd4 z#5#-BtkXx7AZl2=JB==snFxe>jx^Q$72pYHLIHHxqr0EqkJ(23AWaE*Y>O4|0OUli z5qwZ=kkUbF-ca3%h2T$D@az7>L28biQ>@7&;z3%k71_6UuXj&J!`*+ms?B6&W#`S- z(%LQuCx@1QyP`elrtf(_Ymh>uOxS>_sj%bY8F1u zry<&9pH^|f=>4~!&|0->)y2O3@5+n*?Gfk$0{vfH15sA&xJgM$c4eS`cQH>$|M6HQ zq=e>}1zkNFmHpebD7Xc{vOfCXZhR+@v>b zr*AfTY1H&M1k*5Dq~6i^3=zX z%V}P)(t-|qJG&13nrwTukztB*P7_BV~US*?)#%Acqs*lQsEW zB<^o&o7cYWGj+>c6AR@%;&GZ~pdJ5j8|`jy{i3vKaM^U2y+3}(XK9%vm1zSZ13e+( zJf8i(T#eCOAO@&g$9HG5;S}FaQR(R=-?M)SEMAK6+FINcezlsAbLjBo*_Y)iNR&vn zamb$;^R$K*#<8b5EmH2&(6{Faa7L*SvEe)kQOo`y?QRf1IZZvPfW}I}XcR|8F>kb# z2qGXK3`0`6WEw?GX);4XU$|hGwY8zFQ@}1+mw*6Ii9YnHcO$^M?))+^*C=@*c?JT( zRBqWIKFSj#>jU{O4w2{`ai`jFP-GIW(y=8U=tydTqPwlBqcg=|-#z;l5Xkuc!vJpx z!$ItTH{&4KGTSZS2>D?sAHi|Hp-_61JP8X<7^{Eh_ieC*v*kG(oQNv-FtNz;%cTZ7 z$+H}JdBXcGX14k=E&(SHPkH~ z%8ca!tIE8cdQ+ zEjqgqrk38yP2mI=QC#d2zhhhrhMfgVH!1byPRanHP#DBUtCAW;1%(XI0|oh_GKkgh4p}n} zxG(EjoJXs+h$yuORKB7WJz*lLUul-GE%}pk43LZq6*j;WPC0D*OR^IO*lDsgfu?3z 
zT3OWqzA~>=nDMf88*uD<8CyKmQTxyY(5Md1gP1sQU6Lkqp;^LYafl zck5W3$8W;DrG-+|DK#LHBV8_umpzNODhZ3?O{BRriCuIf5>325-GtS=O00LYA(a}I zzII(m)*6Ic@mMNB-JmTR^`edTIbZ{aiK;r^vAtHiOAm*bApl2hw?g`@kCre>iS+W} zF&-1WnjZT;eTo!d!IdjC3!RV_$>Yg>ic>Ep`6ileVyM0*;_)OWb@F&ue}WM$QT%e1 zjJIx`KxF1(C6|DBU@{+}8!td;CP+WuD{}UITMV_(38da~C|c0&q_Q+(!ZPt}r3K9Jp6AQq9Fdq?~}$=Pj`6g(W3Z4kw9Z5|EODC#EUCs%PUV z*?JQyUA=87fQJeV-LOO>5v`Y@=8`@QqaaE@4?OZx7`aTs4>O}4&9wAW4W zb6$**6cK{=*bR`*Erk=RAvE**|9lYqy97{>oi|BNQ?j-vij5twV~;qF{X-`Uu#nctJC2+9exU&6xa>S5(0q! zih4#aE+sisH|A_pbMv451PFGJ0(QX1p1djP5W{_JIbaM=Yf_N(gc`LXNwpB%gTnuO zV1Y3gIfYwH$EJL2G`K<;1Y#yo(Oq6V@2Qsj!7JAIn1iIdBgEaJ+YW0e?g+Mp-$2$j~Q z4hQf_3x>ze-gsdm4b|(D0S(Q7VY--DUul;7A6&0CShglii;(umZK)L|k^V7?lwPZ@ zEBrn8%6OT;iHE1{UwtWgb;+||rX2o8+-0IW#0mOqC6~Js=nGetLWKS+H)X6SlO;sq zd;sdThUtpTaG|k#QVUQkfSR=2Y0r^a@P>m!YwQIQvCFUg5$IXxV9>)!>Tu789za|U zDlXmXWVVb{5~p9GK;WFoupA>unPM5%iMG#MDk6DL&5T#?NR! zeggr(48BySSC-+twetoCC(X2jQ7`$AL0b^h=Pb;gyA~4@7xK9EKdk$sJDODUi6KJH zU*s1VWS4hhL<~T;lg5r6`_82|&Zh@Z%jmAI`Y{`kyRMl*2S`IHx*%6$V`D>^!*=T? 
z^v||vYMxC(dTLP#1V$U=Hy4xq%#qb6743oR_u!*>0ARR-4v{%PKSAB>Pr@&f$GUHh zKM7X(l1;dh?h;sY8dsfb-AT>9(Gk2FNuL$p+)w%+MZ?auPV<_tScJ~$IRu;(0gCx> zQocPVP^iz~gWGW1>+NQNl-{Cfmq{es8k}|P1G|EtC}>z1He~6buZK)sCFpS}iP!;8 zNGUoQT%h*b5vENFvvhALonMRT^s5?)u97)7gMA%SB(wCV08;-%iCTfUIp{RDI+&ZA zuV`QdxzfT>rjv9Ldgb3-2|Rlva0ZtoOn-6taP$sl{56!c0TH1*L99BBl&aHup6sh8 zmNZ%!p-(Uw$k%&`NTG8n(aG2D!qOl7qztSYV$rsm@IXED6QXOL3;IPqggOxg)F~xv z(*r$|-sk74vE=iFZ_27!%bZ&|;Cb8rGdAnQ{Vw0%O1F)duu;fwk)(Gs0V>fRVJ}CC zga`@=35`)aC{U6!Wk6lrrw3U`eDA$x0c9&k(UPU-5W1V4`+PgtdU{F4{pVU$4E-u< zUD``KIzSO$zVzHd&nk_W`Q?Xr=UQVMl)}uHuDcUbsUdpU<#NtDrJ(E?3U( z^2@p$0xD+@SmS-!t%lLW3?18SpOmKXxxY64S@MU(yIr$VvZ`D0L$A!+L{E3=3aAYG zP%b4FQsf|3{3oAOSHV9x`H?wE@ zHt?~7J`~Xbn;&e_2B>I@nYu!j@HDUi3^Qvd z?lS4N*}gvrI8r81w6x;)s7zK|xH&$y`13EN&wnzNC>XM*eE~s4`Vm2 zIL__u?afQ5-*XX5_Yilnyp>vU;d9amfG&Bn(@%-n{k5?T4cT^&Tpi@*@v1b`&LjjI zpZ4=H-c?qF+dZNsw3_>rvYWs~_&rE6=2lXi{$u>!k$ zS=CAp1}0ZjY4kXX6*rG;eR#+x!|qtuw*LNF=Th&FT`zF-+%wL);anS?@^ml3#mKz5 z1&JDG#3kQogD*=C8?_$=FhB7TS`mVftF>JUE=qIfs><=hHmXUrz-P`KHk7#(y(+oW zM&%lx>+gcdEwS+dMVl4rpNpx35NNg?Ycrzb06v~b3Tm%Z*#ZZX0mLd_s-0>sKB8=; zWpnqQ5pmydNY>+d2e+L#Py+DohFx#bO!gr}K51MPt=!~ZY^H&?Hjy_#k>vDtm6!(^ zH4yU*b$Zj&uIG>+Aw?n|qN-7jQk1`H=3!BN1dSOOm|{2Vg=uxleA+AMCF^-g$-T$tp<6uExdL@=lfZ9?SVpuo!rgl(OG-xkBadn^)I7 z@6;dmxob$0-u`74pCe0hnmbCZvh7wksvzKKH}E5pmT|%OQPBZXwawRTdmU{r{uzgs zuB&ZGs%(`^v%5#@T7g?q_oetlmtV=+<_A^pU-^6xJ)IVUXylV|chai!Dx`W1LAhu+pg*Sm{4tW0SVkN{zle0$9 z)2#yc4nd%a%!?drw0kmrn+b}>hk-%?`Ps8QIYYSo#1Q3;O@134b~aD~s1+tUa1AUt zS8>d#*G?NtQ;L)m$WL7<3+5AzULN^%V@!LF|H!(oR`|!K(MwqAHy&F_D4s7-`^Q#M zb$>c+j(5|{C%FUDRGb`yVQbB2wCx4kg3Qim_xf9_ec1hEs+QGR?zJ0ANb=9a?qRmF zKdHOG=vOXC*4 z(^ebu62qojv3J-z8FznNBC2{j$F$`prk0SV()?|YNE$MN5tkTxP<(vCP-JY?#M>d5 zwF`P9bkc@JMC|5`)G`2$wnk0!Jwy;sfkzmXEaz_0NnFH_jT30mnp4G25!ixQe`vr5 z#cxlnbx=EFh5OGjLcO@i?lU%-UC~lJviFvWyPt%z4?JnUr6F&OaSat~B&GzAZS3Wp zfoAHW<|KR%l=*FC6&Q5#_fVgL?l-;#V5WdbBk4Ww74 zek{j&p|r_QGCHP_D2fP5OjI5+vAMHlV~!;>7DohUgfslbq36yZSz_dXE=FYlb1Ba38(NGLd4n) 
zlG3u8!Bt4xE8`JhHQ;sD>au;16*4F!3bUO$lM0;W&4|HLRV&Qk^XjRor#<} zi^KXM;o4SK8fNPxgMVM?R6QZ&6dE^iFH6?hkX~|YH#%!}-qa~}Ogjhy=7=$-_h2ij zlr>sG@*fME|D~!@1ZKgzDkx>5EkrVq9_ackz%x8MYMn)mX4!OYgKeCau8C$=xfiVR zT&hslPvu)TgF5Ir4;NXjbMZW8-6=Kr{7PB4@t3*|BM?84@^{L51wiwkmDD%yteO;# ze!A{KjMsx$#WWa3YVA#b>mnH8wUp1L~_7VdWG)Jpe6V480An5R~OfwStVZq zMuGfzF5mJKtwDBdC;(o^J4u=gyvx1?%vkr8yPEHyy(IORU#NZasq0p=I7inHT}eWUI}CthC0*>?M=_-0orMe6 zu2_TzeDC&8#u6P=B(~xE%ZUMr>}**W#L7opyx4gQ&8_oG4(^@38iuIRM=p_1L9Gah z`n+l*nXR2HEA+rMM*`fGr=+c?H;roUf2r>;o)O=%AH1EN38bwn-xk{Pi6GEn>zczX zq&OC((j}>+B0y&xwV%-G?K8u&I*q68m9oZ~Tl?h;+EZte^7axn5#kgx7tl9;C54+K zm)??wu2L47$`zjbB+$Z|t#LWnJypV|?~}jGoq(w@PQVB+-l&0j#;+TydohNv`?%%) zWpVU8`6G}DbS$m{WK7Z~>jURzTW6PG(V+ypnMd*l#I~0=z=WNGY0z7C+#U>@!&D`t z-8U?Q#G2Bq9X&(YLNe}qpyn#u;2^M`!{z23_9Shr23>)+LojdXcz($oZZ*h870&O`M4h?QK_(VbwTX_N+ux~~ zzQfwsZ4aSHv3(^{MVL+m+eJM>hq83qX1I;Gcn4I-r7FG8eCl5|puEs~)%~Kk8?uuk(JAff`E7v0s}Mc9Q@MVW zWbhfg7Sd3p#<5~1n7$pNH8|T$VE?=QYW+7GlK$s)8d`%t5si=f*LF@Ma+@>f<|8of z6dkg`5%Vd^g`-KMYl<&b!!SnYlSZqyVcHaSWzq{U9OZy_@jC^q0%tmD_3#IGHz z%oM=oO-Mf?|0{%7HmsHoA&t$q&#u;xi$-hBJQ};X_twpG4LXm7}aJ>kB3Jb#tfRQmrDN4Mm1f_#G(`)eE0`9g|04Y4e+7+g@Qhquc z0BOEiJtj=cC$uXgtec@h<3Y;?RK!5NW+-z`qOlNZ-8@ku)z)?BlqfnpVO;d>O}Jr? 
z35CVycS0&;LU}5mg0WF~Q#6enf~BSVZpVnJo(XpnbRu#23DnT_MnD zSH#;mm<^sK0)K@t5M<9&@FLlF&8^cNBw)GkR+UQNKA6nX0+9e z=YvK*6=eVVJGYzSl{!W4-d!1BuVg4?91<#&5Zjib7sOv-P%HtmM{!L$l@hBNMfn;)RKKrC!@HwCZCm&`^-B_v~=&nBp+^ksZKTqZB zR}vcm4aQJ%_r{`ar;acX4upVLKl}QH>!?Bgt>?cY(_9CteP)9$qIjeg<7HVywUQ7x zXZcI>gz)k+@wL6Ew-TQwihRASzyE%_7#AO>ViUPiZ^6o<@(U;kuVQ2K^FuOs&_&L@ zaN)w$IAJ(hd7_J|_RKwA!2}lh{{YAd(P~D7P;S?TY07ppEvw8M`X{2}%qb%LB&0Z( zxao8PV92(}&Ir*GHH3mhRTm9WfMS@2Nk|&GR0j2sya3g3doH&Km&J>pBi#;awFNo; zedS-jaBC+(j#&TObt9I2ArL}0kVYx`Z>2dF{?eliMeOqC^QRruGwZxCOKB5A@(!V; z4j&?$ryxltgEY$4Xd0L2BHNJXvJ4ZT#cz&H;M?(0B_rf`nsJBSytyE+nAXq?g4z$6 zD2lb$+!QDq$8K*WiXLHg9FV-$U^u1M+RawMnPpR`xh`YiOY1-;v2se0VdxmF`qW%2 z?{0Ak{zx_uAx^CL{PlzV{kLk?EVrUR?!gr;D7282(f^1Dd=s&{y_Ncll+zNIdOz-4 zXmc4@|0JL5ji!Qgkzm|-uJsujiKZfo^CwM5q8MXK7Quc+3c)C>S#Q$21puo3Y(we0 z7OQk&eFfxK)^cxDFNMVcd9F}#%dj|SG#SGgPL$QO2KAY`+2vKh(iRpoU)KNUA>5yv z$ovB`3l>MVxLcb_CvAx8Bg7i;BXZ@vgx??(iBA7ri{Ub{DAOSA+~TGA_PXLwPwl4zwhGL zFRp2@IKz_RDtVPUJfT{2c9BcKnux`K5|Iy8Q1q?)QGp~RJd?|W(@4$6$Kv}HAd$LO zbY9E$C3KyNs5Rp6v0WB)korhf9c{((PLgm((R(_9dB9J1#x2i-T!I-?K!c+F+_ZAX zX`>LTp$7Jr1*GKv>RzAa8xFTN0dfGTr#Os?kM(@EThaa7lTYWIh#@eT6nP2}&HDHF z5A5}XJ7}_EOdVMsK;j$?qHf!Pq6_G;E>bUoom-%tHGrEDgRKoYeA?~ckGGN5SL)uq zdyTD9TnD~rMxq0DtJq%cu}qQ(325eA_+y(%~ab?WgZK9!54WVMKPa?V4<{@$1{NCa$1Cb5r5h2j-uI z#Kg_M1jTlEbaa8!rDIo%J=LI(o*=H#K&1sLuNpi-ZpB0+g2%k8QY!ehsp&Zq?tCjaO?C3m>4tvSdH4PCKOKy} zU1EIrdaB&t7yk8&PzSVn&ZCRwz-#u6(GobXPM(=IHC~I>&N&a{gHw%Am2P|j*ZL=! zBw^eS;2iN~o#LdrE5rOeCe-H~^GJYj(3&y`8(Xy@+I zL3jrbBu>hTd8X6iAJ&V2cG-O$Z<@dQ%B>*umw*X^(VHt)c}X@MI$G2>nyJ< zi@p_CB0(K*G>nbkDS!PKfBzlog!R4|JCQs_GpTMz^A~Ca0q$`Noz_MEsRm+L! 
zzi`)PI?D-`iM}K`MQ&tu-G^}t?a@E>K8ZJp(_h(P@>IL542#ZOML zoFkN}?u1^R56Gqj1UKh-Y^pnLDA8x#61e`rtL-7w z%|neRB6;Jq4`=6}y|k!I<>QrKGH6ss3G9jnM#0BY z2IqI<22xKVQv(7E*pSMsg+-nI4DP4VW;{o(b+;wZoNeeiC=yV_0j3>MQI%^}kEC&c#J|DC$;BXgJ!(zybq#sFyt^zPwBvo^6DJ)tvr7k}Lcs8i1PqNti zpAYiaSD~mn6z9XS>=Z;lWN7FKDD!Un5VMRZ+rjAPDB|PYDT?yDUmTNuX@Coh=h(7a zy0zvUzr%Qaw~r%D_Y5WM+K4@N@!?vr!*m!w$l{GpZ+cCxPm3W$ABH~p1PIpL zvz+!&TA|vI@#C9QiLR+wNn(fu7=xH`^yC20tgZ{EF>|dP1N+!1=_m96n{k4UpBHdl z+{vl>I%TcQMf%%CmiCYK6?+mh$hmori^KuuruP@XflBUnbQUK}OUqgG3hR|nKh*TH zk3Y$L*)yVbSATwsuFRi$Ja=-Yq~&53mow%!avd(cw$*)WI*N9a_V`A}mF zt^QPXG}XYVkRh$TQWODO44KuWD#$n@c_30uDp)??G<$!*>L~pj<8y|jZ$>ab_KLZT zd+G+Nb3K8YivQDm#IKy-AIvU zzOcO{UKm|e8J=+!%JWg%z|u7F03)cdgn4sgqA2Tuhs%zYI(o8cQPI&pK2Ei>*61@+ zlJsOO8dFSIZWr4_HX~?U@r;wgar)z!nOi7$@%F#s{8Y-K@>8pXxkzGW(EAc=(H#A8 zb$eOH=U@q~M9$s_idcRu3bDUmBz<6Kt4jK{$kyXN1G<6N4_+UoSjZaf;W5^Hean35 z0~=4h)D6oNJX#`dyf5rcr{!F9gv~}}v#(AyU%1xmURe0lYjZ}AJo0g-$S3Fg+92nX z76(l$^77v|bS!NsD^uE;Kd{Jt-IM-*Fxo6c`ZQM;cB^ml6-moEJ) z)%*T|f!cR~YIRf!VdgZUP-hvPhrHFGF%`vO-M_vPDK}p4qY@%GpLy@#7q2^F2NY-B z!3Mk`w+K3fEjyv~i^1S*XhsG!jAHJHIN-Z}C9?!5;9ftXAj$kY4*o;onuI9Rhwfk?_r>K@*I>7lOldrgc8^ zqbg2!W2aLm7=7$@O}0HFivi=QAsG;-Ph~qDFEeEiOi`AO;G%)e7a8rm6*jUMplHEW zCw_8&^YA1UvmHyB3p70qC`z)GYxtlYznsQn=aua( zjdwB=62Zi3r(aZ`bRwJN1C9u$L}C-CO(m93$gYE}T^3CM%LTCOPC)^8Hc`ziu36qQl>WPvo-CE!cYv8HQqCd5QO;^c@KXpWO9X&0$tu95at zERGv{a!k2@j8J@`oqkM$juTG7ODjBPF=?gJmHE%xI=Ga%3u4)Kn*`6M0iF zcb9#po#>y{^5_%a$QGzNqg!3fecbhzpRAIm|3wSt+q|wCOsnbfoIxH6%XUynjNm~P zoNoSaylmv+#9U1=Sz@Ki|k) za*^mSjn9lCw>)Ty)VN%a!x*g$b*2Fu?w_9OGN-mV=GNR=lYGt=J z@)81;A3XH3w(Bd}{k*(|Ixm`es82)#@P}#wu29pNm>Y3mG;IyjX*4amVJ);(0P@-9mILw5`$1btljTI-*yv zT@#Pz>D{8Ek%r1%=(?^1%Azoto8`4zU##eubZO(c+!7bA}FoWMdmsVUrqmmDUK!yn;v+w9g?fO>Y z3`2lIX5IqkN2O!S)Yq+$=q#OKTk2|bh`LJ9(X$#5Y_K8BcA&vtJw=l>zW-EB{p*P= zl%rG*%?x^ug>~W%Y!$g82wZnZ5A;Td{{nkt5j4%BbL+WMrJd+p)mt#3BGX;*f%5RS zCT}{`Hb}dxUjh>08TcaM^QfiQ4OvnHMU$BePQei*g{I@8lFSWdyOG67M@Zdquj++n 
z#A+aNstJAFTMsfwLGl1V%R2cx7?GLCa_Jm#mYrM!q>wZ;9*4uA6Om9DTRa9AU=JX3W+UULSA z;m(&;$g;zu@hFTMRR%em`3ci+=&r=QH@SOYPoWNOLysu)R)tIAqyB~)i=RW6zfcG7 z9I@X7^5AyL%vE~E1PkLDl1jZU4M5PEIM%n~A?1Ga7IQ22AX=!BhAPhLh+OyymzfS= z#1fVoxhnytg@yfuI56J9T)UK(r4!oYx%0l1wzY-p)ZR9sz~TN{9zclX{GEgR3R=nO zM9iM#pW}b*x^FX`_hbWBxQIaPWSPl-+=R3GWe`=Uo|)J#P(6)#)f%)SLcyx|5n-Y_ zp8!~Q(S88bWIM${?u9FD5-erjhhB6(ubJr*$O;qznWod^WQwhODXvE#yY4n*NcWBF zWL~x)WpuaD3tLlgd0ELAwd)VJxKpB|P-1xNm3@yqrYd*9Z7UvQd|jeLqJ*p`Hmw;% z4=@vVj76hWq2W=_S47k{permw^r)F3#K|YBGWfVRty4XOZ61wRy8IL4X(8w44&9Fl z9*iVynrsE`MM@p%Bt@PNZyt8YQ~S6E1S&6lz)|t`DLw}#xaF`X@N)0LEW3&M=sWUZLJ?qBvbI<@6Bx302-jX!$Fa3SUiG>+DWwtZ`>b&(i>BaMgd9Kbq z&d3~$NGpVUeUO4gYBe@M6r^SNtSJ=BBFJ~b7aj?a5_=%M4^zsBJzFhK@T8aQtUpAq z9;9kX=E56|nR)sy2aon;EQLhCbu-8)9ff81uNwjaSBK|h#YmUum2jG}^Pd~7NE#>YdAH79=-;V%@Z4sd_y zVs5Bizj<^lW5sg-G34hiKmdpU{OW@Mwr|}1^1|~)HzMCw!g=B}!R`pHRl=N_ey)fc ze{wC?`2CyELl@#bwRbLUcs8sZu>L=KAp$amPi7+lE}Gg?54fLk zJ^G<`*-)Lbm3q`mBKJW4h9jn}ehRbrR$YZUVWjvr4pNnQ@e*y3P{q!|dL!+3Z*XO^ z5RA5Ja@)6Y+Ak>X5k%i$(}-xAGf)wRM$j5EygTrn{Sz+V?E#+dNON7>we@?5ZC2I^ z1i3enL}|VSnrwuuQUbPa1aD780v`$25`19Tt9nXIO5^I$B-p z8Q*554@eFMU)waP;ee8$2b?A{;Bzz?_!V>FF;vw|yiK%kGARdVS+}*9Bn&W}3+iyh z=wvcM&YjSaqI#5==S~Ph5zRUX*zoau7RlyNFL^(iNPvt%107=EKEpE#E?^u+s&B@p zTZ*Sbc!lNwX0Stw7hqo?8v=~Yo?*zY#gcIs6WRZC1wp!R4pp;G#**82w3{78uGMJf z2-9dM1;(1Oa9`rK-P&#duq<_^2*!FdY99~@aX>-NWINW{2yi$H%b zAl$aev%kM(s;jPR6@oXbtU#7|T2G}$X-hyD-dF~*gW&-`OJe4ugDs|UE#YSos~?cf^SRU|8r!UPihn51Jkp0r!8CQvD* z0!1PH9CEo4skKnwqqzw9#^;rbJQEf}$kLv1O^OkqlCiZCYlcP&i4}h&c8& zWSJUE6e3GWOqRr{gzVe>e4Uy3{(jf>G50_B{m1=y+`sRC-|M1SzuOP-A3>wNH}oI(r&{Y>U!?-$JfgyEeAUWgk-`6`oAU zDL$;2)aJl*jW(R(td;QO$fx@Lz{adJ%k-0T4k?GFrAj^#WJ*nQK7FB4+yYiqdsv%-8<{%W|Dzs#)_~S8P zf6|=ype%T;gm(t;Nm3yZ3tI(v#p2NfYP(``g%BKL*l`nQa4xihoDblu&#?}$)z#G_ z9k3MUl9-=HOOVQCx{o7Wxp=8p@V~0J6o`WijxEbDND2(A`QLSj{e@y=@F=v7AZVuQkT~88 z&7~w0eXnqVGF3c+iq+NRchJn`N|Bt1CS0D}{Jqu^n8adEk$+An!Jg60Tswzq@GYuc zA0#wt3l%6ExuKtk-n|DyFPt&bw#_6Hc+=L5H+bZ_z*cq`r*=c;>47U!evfZs5;2W5 
z&y6Nj*372?k5QwU7-dAKwUt92c_Q^PEZLnC@NcH+bu7Ud7d1*lDF5}l;DC$!IC4sd zMYK*46vj(wylT&>>z>v$4Xg*z@)kf+%#ukL-_M%E)}RGP@l_O9wGLfdK>zc$jl)1H zg8Z^+k`*|R_*0WKB@?jas2cF}uV)GCcTRvC>gR6=6R9whA+ zVutznBMy|TmFR?Ub-;6$u=J>AIh zqu9mCr5$=eagchKT|JKaH?H~w<4?5(07VLvb92Qxv`X;0@-dzE8K~I0_&Nc>-^={- zm!b^4{OcXef+lZN7LFypx?Cp@&LGYQj3_;hNLMSrN=az0smQrB%6EpKRfg2-zyBV8 z0X>3FIgNj`)o4>YRhO*2=UXGV)taw-w?1nb#o1as@$LA8c8hz;4M!l}orz!N7SMdk zgZ;evq_RFngU-j8OrXTo9X#{y`9{K+#EtT|PuE5J+yYF(hw2Zq_jcQMU7}P`bhd{( zC!mIiQ|^ef6tF@oL6k`m0E!>NoO_j!53?*owEH%S?r1|FCI19G+>Mw!^BgNMZf+an zBnvkqNbE1@4DecwHYzdhDI-wXb5n8lZ^rPB-W`g0DVW7ZZl5}1n%K(ln9b13Q6a_npxXvYfl}uff&-UJZG>3`AelaK0 zd6@5Qza2#%zh|R@84Xur9doHZ&x%ml`l}OB#{Zyu*qz*J7u?iOI@i^N(Bw4$)M>)@ zP?ag*`A7{1Qx(cT@0ZeiE(*YjJ3YqIz_a3O@v*6jF`7%gmHCBR;E_X%E}EM~gV0TbPhYDY#OX_lc&2Qiuc% zAEHOCJ&Q-QJ@My3X}P%HJ51MGR#J{1mH&8}(E62}fe4ZncAxuMd_raeixlMK6X%QF z&>G2uQ>US`8-WPc3>&(Q_t#`17tjm&ZxbDE>fN#E3bhHq;&L#*32Ho z7KmNFXkhER9oJ(~hQ*3|Jfh%3>ShW1Y{VQrieHKE$nT5}Qt!?a;j-(sk_?w%uw|Lk z0lP`x7;S}n32QcK2U4|{d4N}!a0N%O=FyhR81B4=#nS7}Y3p0_LW(om=GfcN=i0ll z$42(?cdgw&))W@!^GwgZAHKTgYUQ*_zm8I?%F);q7sZ_hZmO0Cx>paD_O)enjJHWd z9Cf)7;P0<_eEoC(Zsq*Fn}aJatGt$V49y+L4HN1zzu2u*Bk5-U-Ka#wrVsyojcs7ZZ(*GwLmmOYf%c3s_%yEiWbUHq&M1if!wmV%(4xyt?9Gt zU7iBVa@%nkOP0r8n$Y&x=>TpMKEZ?bhCMun00NZ*fY^B(OL?8=zgvHqER@QVdvD?} zQhxP@22v6>DvLM=WgnG^);dt23(03ds9qPP+enRkvvJtxC649xuj}2=X$O7U&|J;$ zLdu%!y>`557fth@ob>_NzbCe^O|Nw0>Po-*&_Tb-%Wi$Th6NL{qRqxQT-Z?#M~z9Z zT;@oeaJRl{>lsmOuKVCk*WwI$Ic_Q$q;cjiz9*NcZ z$!nWA&RZ-Ogv|~rbLrjg_})=YvaoVmv)`ky@D2tgRxJAa#nmj3UFobHv7P{Q^m{Mf zHmL^wuP^lLuvhP^X@?eU3hTYd;gIO*Jmo3PV~@ZRDFz=XFL5-&uNqM7fHJ_DQ9d_5g6(Q&?NSU z*_S$Sb$EenKOR{!4s(nLw5FO?UMQ$c#^);`I~L8x6?8CJ$~|!hP(C^SL+_o~e1Ox) z1QRXcxffUh2xpOgz`{Zy@Lc-#<&ulU57kGRyr<#Qf;{Iz>5D zi*|hQ6iejCPbxY*#b(jUnMCq(qeYgb#%7zQ?GA1rmxB7UNJ&m@pimM6hI{&4a&5V} zsE4I4E+0yL;mhf}167a8NO8sv?AE0)8LG=aU8f*KzF8H{Qh$Z)Y6-K6CTJPxc#1@^ zAQVXuG~~2;ofB;-oq#ZFR+sh-7)|yYQEI=&J?-I z&t+9b@ff30>PWWbfV_JwJF=Z4HA5@X0ZAxWFX47dnt_^(5P*WG;AbnxB$6f*^mnfv7A`4h7fDqJKT 
zY-_oA+wV<#7pl3*t=z84>-%dFCRSd)*sWaCK6t12ldE0WKMPu@&fIaNplSrCfva}! zj)>RJiYKQ3vqTTct#nq6$Se@E?y^c+o*%|){rgyPR-F66JR2W3a7MIwrSleUci6e( z`F}X^&m3;=!q)`_*W>6YQJs(-9&PK!7JE2;W}VmjwC&}CB3-@r)r}qHaO~Wbuihe! z*~TpASHw$`7F5MGB&Acx+%e|&4C|LtB!0^Ra#(g={dngPySERz@^kPBiwZvIqM_|( zqd0L|;5PTBJ}NJeb*l$iIu_|Yal8GU)v|HHwE)(6^_H{&Cld;j=9F69(M~dqO-K~A z3O-e>L{k%$eTJbk*J>5ek}3^I=u+gdM&))Vc(#3KY%SlHI{t+wJ4;pYuctVC^-;08 z{3|!bohmhL_iuD73if4dOE*8x3*(lJmX3Zqz4M$e{fD}S(#>AEfBlECGL|^cH9PK4 z@!vRaSz&~yceAp$Z1`l`xF)+vwMn)1_5InyE%()moNs7XoLSj@m)q;w>&l(k?6sy1 z&$KeHY*<+BfB&QH^wAvr(dW%xWA;|A@nOZ0M}ooPlV+%<{lsh#+BBT}7gx-dqIBS=LD-&!Pif|T6@19RBIo-Q7s#z$Z0L}s^Dx5*E z?XV>i9YqsRqGH#GC%_GRQOePYX%f5)Y_No~v4iBz^GBb z`i}^}xcMxQ@vVezQ9sllf_L15zf^>yOZVu{EQ#}+ra|b1?qtfNvoa=eJtZE~m{BsY zQSAXC1M48C(G2^Mz&k?Jh$bSd57-TruhM;a5|trq3Jl-$cRE(c@Zzbi-qHWnpI}d~ z@bmhhi0se8AyzzNOXCkDU;On5(B^%4Nh}Fq-_KDs*vtHjl z(p2fL)DgTCOKK^!!%Eg{w%VVT8Uycvn%EAv2WUI)uG7mIlkV%QHA_qF9nkLb+1jchHgzX>kS7iSrXt+WVG>lA9g{cavm*jDI|Ey` zDze1Mmh0(1i_Mv(Yu%+)m0k9D`$=F=UU;9|Kzyp1`+OmJ&w03)x;1K9U%XABXncxs zNdgy=o*cx zC%4ID&(g`+OSYYETy~Ko{wsUYruuz#A8>D6U9fS5>p)tVg4*=}Jr^m&0n@u}q3k(D zpB=<&k~p3g-JT4SQs4MdgzjCR>V2(XM|}O3qQPlqEn)f=IOSQ2?D-qxpV*5Yb_x-> zda6iJyO3x%tj06qjVrsVQJt&z!~tk9+p*r!djr(WStwH@fF|scTZ<~FET>?3NdaSV z*xlZ`I_{di(h%$0BOypK;m<4}m(W>Y+EUYw)@qJ8Q6CU$R-y4&0Q{D+(IE{gcy1Y&U}=4_s|jKY{9KV$92Huc2;TN2f#Y4( z$HcELb^f>u%HoxfteQqN+mTL*!dhJ3)i}+jlxfM4~m#t}Rq*IH^k6od)!FmGLQPO2Bs<3~E+JD{;ZMf9kI9b^ zYMi&`Cp0kpC?Fhq*ZZ?X*7Z|lk+>^|9(>(6IY1-R)6?tfT?lW7(T?*$4wKg{UH+F5 z_3w{%FIE)zPU!lglUd7&h>0(8?%3AsLB1Qk%o7WT)4>cW;*YUjg~{g<&825xPYO`5 z*(N*QfX|Psm1N)qLMuO8XCXpt2aU5z6_xPZvO`G~*>9*7F6eA6T zx@xtJyy*4uHd3rwLOjFH@$@-;*v|w(!~DZ~Me#ZKE!!dh?5TwjUQIgL_!8=!dI0^^ zTHuIvTn&sY730$rKHVg%8kD)3kq(SEGz7J{#h>U-gz@VpgZgiZOr(>d#^n@1Ujivu zuL`iPC~>icYNd>dNodDPh}$Bi)2Ex>OSTnbv^v*;!H^3`Y3e_4*nEr_J0R}SuFy?L zIlLm}A>PU3K?}`iSW*F=@}_-Ai(Zw{uAbV2GeF9zk;YsTlZo;?x!Rc8;{bg-B|`i8 zF~2!yy7R00TtP_1E~EgY_!7~c1k(3nr=(Ufq4nhBI9zWNw2aGr0nuA%M+!-43}BKZ 
zdZ};O(eS^EK*DsWbC}owN)p2qC-2Mn&UHPXG>d#i_qPDp-2Xt%LR;Tf!5Ly(!lvR& zc3>IyKvY+R%WE;A%f|5EdXq{O>ySVPLOx3fOCuLiE|!vDe$!**5WCJJO`&)b6K=t` z^#F9l65Q>Cwv+OY7F~3ML|QD@fKVf+4@4wV1R^OS2zf?tA@ti~yL}1e2m3`U%VA%e zKCyh=+O-~}Vx}94sH!?+6sk+8#kGu!#nrbVgTXfoWWbRA%Q>>bQM4;*haoc4KK_my z@sw?<7Jr~r8pDJPP7gz_s%ni)dLMM*>nL?uFMFwGCD{AABf>d|Pv&Q0QQ!g|a`eG^ z<)&Gbyj+;XSbkmm)C@vyj!|D1jIlSiAk0fBJOk*6%2x@#;vS>^&DGTi`OQbYKubuYC%PnoIoHbs{zpzdJ$v_lXjOioln`0~#L##iH2pfJs~Xp$SZ=&4yPyZYHK@(c~`J+w`35X4}vLS2pos{>@1NADIlgb z{Yul~6|T&qR)G|XC!jt~0pc7LdVzwT=X)|~{SWv~0!r^Xh{eIbjM^;=p%Q!UXza-~ zVjka{`gz-mzVs9nYblYNfW{r6lemAht#-s5z>jX();xUBlKGjgVr7E@HBS&1*oxkR zb;T$uQltHXRwwk`#w?l4oX_GtAL?~xRFC>eZ@}3`+O3Rw21YM1{jhERH{m0sF{iT` zg@#RDGCTUo8n%-WIuv6Xe?nFvG&_QX-!F!UFxEIu=nom33dV#ug4OS@6LSk~8~l_{ zvUcKSiK5QZE`H2K;uBj!G96(!H zsXnRI2|cID3^m`kuhi*W=;yK__v3D;qgSrBXI#bt{Uqicp)MSV+}iJy*mDcIQ2O$N zNul}caL%U;jEn&Y^QA5yF|B}Zz2PjZPxe#Ly?iR&vfTa2DiOhilr66F>bND}QCP%;|RvXlR3F zqUW0fe7O%!`;Ozg8K1IjDIV1m5|-lHR5CdDTa@ojO&OLXg2lGJOa3DwpSnH@Owrxo z#BPFm*It%!b_iUPBgw3BN(`ywyMlmPgE$0`A6(8LVFcR3Ak&2M`-xKltb`)CJ~RRf z`V(vvCMU)t^Iu>GN8&y(?Yl7u{_M@aiMXFYbES4vo|XLpB@~p^9!H|~7zFzAa29g{ zm|ZeSWLV1xd!?G(Av@w`4Y7v(Z2ZwJS?ibd-a>Mz@G9oF?Nxv-gNP>H1XW61ab#Ja zF~v|lz6XwUt99elV2^N7193ha&U11&XwFnFRjqGh&HjbdkQcp5Cny3?2;Fet>!8v|YjaKaO^cj9LE(zbV2w-wCT#ELb}G z)oLL-+21n^B5Z>8wEN{!txwEMK9?-M$kRMCw1e5Pe#fP1G+TE zfKAiuiJntUfovtUI28@lbavUvu-JGWIl*Jji0EkHf93fFW9=J^@!!TiE_L?!6kl3N zi3M=|^lcYqeD`bn@$RB&H^T+aDY~->GH^nsaA@q!e}%h?6nFvU?Ca2^Z|<1l#34HU zZ&&R8Z=!vl*Fc@q0oi9v)QR!(E$-k~>0o8FqIc&r+bFEks-|fjqm(C96n8Rv zH`xSr&Mz2J*lbm|j=+A+6BK7I*pwLYkv+`w>%(jvaVl4Yn%x+mAO4ddZcR%w1Pf43 z!3~AzYPn~=Z#5ymTk;31fD@3vepniJJQ;B4_)h`sGUH^Btj(MEEEyxaCv;QPJYjt7 z&Q>;&p4oJlQ*nXyD)yZUXO>!&Tjf%{WRMovTirc2UuTgXXjESP5J@Yido)=C68vz+o!_(OZM zva+-=aQRMU*on?Vb)@(q_r)>POWZW(j7Rr}$b=IBlgZbIBGPDLHi5~Tts72V_(4nM zf{tLw>LvLN*vqX@KaqQT@94mLtM{(VA2VYQRcs%K3!y%aU_UyGNF`ecoaOOAJp}gN zmZIdrqU=~221xa3?O7)M8~z&h(k}@eMtyH+33W+j3k}htz6kSz=fjv6TPM9A769bu zUZzYI0+1+2sEe6IOXDSu9ALwO`WAFcUEBj 
zZ-peMa7g{X;9ibxX3oftx?^m)(YfF*_oBN56~7+DbYOVv~{iEFw5Xyul-w0CtjO7FB_NgYP3vz?YUT zUWm1obHzF>hT2}EAmGN0lb@tWB2NHS?0Z~*SE=!#*|*QJ-K<<>8ILOb=nubh?~-(Z z=CPAJJ$6|wt-JG@RCxkq%&M!S;l%pze#Z9C>2!coIaTSN>e2$LW3ks;X@$|bK%)=d z;^illGp&=GzyDmP|G=DUpvWiuJ@<+3ihc6Cri^k1403vhZBS&`Rw>Lc0$Q(obckjD z-czLHhMepR%6zMFA<&i#r%4%(jO{}f0k$eVQX&p$hz~kqQ$y{;pT*O&p*g$60CzRE z*T0zC-tlC62?R_uGL)jBwvf16FK{8_p{#MnxBVSj=-K$u+IZ}V>(+2)DzWcKeXavp z$AO|)1W_B#OXMmKaaRPU-R)5Q@QgrKG3~^waY^|(#ZzP+3hWa~dbGRlSjC6A*GnqeYvekX_cC={!wrc$}_a$Wn%o-KH1$Vv66d?ce>G zP#j@J2tZStS5S6UA?jXP&1U!hD@J20z@nrn^~8PB4=lM6w$a`#`pI}{J7pP%VhFuZ zIdf;u%?EAtbU^}2P>P!2&m7yOZ>jMFbJQBFCV>rlkjp7(aY5PkX!K4g@O;qOMs5t6 z5qA?AW4O?wAGSUVkZ3ie&Jp8c<#1bRy&q*fXo0zK#BCp{oE)p95pJ9nnt=iH*XbNe zaPYIJg%xc}&dE0uDBby*NE)R_%BhF;vS^72dB4ix+2|SAFGxS54^N?B zanc;hya0G1Lz5ncFfolYvZHUiph9vJDj{-Ta2=SsUH6CDxG(tv?C4|aL31ECR$bo> z;IoHDwo)1m4H}HV&_O+`X53YaF2WwITRZDA^Cqt*w^+9O&O+m>U3*w~**LN9ooD-LP$Wmt=}vrfMmDXnxFc(Hn> z%-e9GB*!NkOA4VCS3BM0S$B@mNN^OAhRhOh&AQKZqmC72Uuf}^m$ zs{w~$!$rb?`(`GjrsPf@gt>=zLGhFq%3 zUmT(-W`(58UTsAa5I7(9o9!b(=wgg&$jEoKt7vP4{qWZ;ifqHpmhdm17w1i`71S2@ z3{S8$?8b4+Xq)rk;^65HuHMKs`4!_66ajML_XBrVM8<0U6+_Mzu)umoOt>JX;LGH$ z=eWy^!9BwhKTXr@`#9XCG{vgEw1L!PaOvEu9@2{DD{ISW_Kd?MU=BlbE9Rr?_Yu%q z5`+WkQvK?U@gfH@KtU)1e(5rK5BbSf%;eC^G?XF;JTj5Y#JmYe0V68vm@Bdi)1YLC zz7#6DZP@~{0*>i9&C28;Ajw`*nl$Jx-OhZGCs=VtwNtQqJICSBFZoN~3tFH==I|(P zjPe{g4^`3S@ifq*OC8e!(iF3(;+HXC?r-Xcx}F1{PW{l0JlBwT0+MJzo305fh@!!| z)bXFBewhC|qwsJq23p}wbJDFYS{mENbO}LzunuUbLpB(yY?Spu4bGNIa?qf?cB(0Du=Xd-3N6jI({`A1mBB9RgaA8Q2T|uDtiI z?+4oYSwhk<_$<>`VHfG=VnWG^`6#}w zC=ZI-lh|Oj{S-eM8+-{5UzxR)9SCn@&B!gv`D3li*oX^AyX_$&>C!$lgWs{+#1MT6 z9Fm)Mq&5X39ZX-z0ejUmu%W~oSU4ejd7V&5L&~y0q7XEhVK+q({!W56N@!SX{OIa~ z!8&ogJSfm{7_89jTvLY=Y7xZ2UO5BsxfKI(Py0H13gvyc?d;#V#*_zHH%?4)LgFAI zB2Ip~DRMCb38@O}DZqBojyH#!N3;%35#G@C4ZT1Rp;|(>S>obeRJ^8uh}6@yLaKHT z6fpiF+k(L0$k`o5XB6wW$#kL#>XPT6OYJu?Pxika3E1^JnNL1ym<3xRAB7|^wPkuV zAdvMcU}1fcnLYw4c0GhQ+8D~6Z+}M&X7vxORm{~`{G(*blErI)xNyF 
z-ivixC4H~eVJLTs;ahG*hVMApb8(}82pa+2VI5h5DMZN?9n2Trg$28qFjSV8nkXSI zq}=kVJBx0d;wMZyD&7#0mooqXU?G52jlqpCw<}#`{!og1uscHH5hy6`tiKzneY|sL z4QXzm(oU=!Ci)Dm-{{vJ75Xc!Azkr#aN(Q11|ztM`NHi}H%|Rv1=imSuzB57;S)Hs zIA5R}r*zVQOJUX31*_F!C~p#_=4>)U^bQjVyDj=)NKTXV4oW@?@{dEt<5R-EldKBYdD4GMxb9CJ-o$% z^|>D(^4}r>fpm9?E%{ZMz+Cdr5dVFwV0_>TX1=dSIea7M6_GtaKRsb6Njh^M{7&F< zF3$8KL?`I49>guKYfHLGu#{Sz+XK_)&Yxcb>#EV}H0GM>2Y5acm!=qhfFATBS%yDw zBc^HDNq=id(e<$jW46aRMM-60eWdBeda3CeS30IoF|jR}zI{;Peri$HCUa@Z;e^9K zV~Q*gvNPGe1DUS+RRVU6!O?ZBB_3tpA7z_p4BV~c69do*Sej0taNI;ZTaC+E6n@6o zex?$d=uCo5X|aKUjn|su7?_U5AWS|6+lJ$Ji|9^2{-xOUg%4Rtu#b+MJ`Rb=mB~K0 z301E*E?AEm58|+IHltkAejUsy_m|!esPlVFfTwC}Gt-uO=s8RrJB@1~2%&M;gtrKK zao3&Vyo);#cfjl_mz?`3`{b!w+1yR`br4(L$gfG4Ct`F;a3Vh@dQu37?!yANe}O%x^QAuS{IjC~Y^7?NMDQ?F&7(8j`1Qj} zL7zTSrR5D7JN@Jcc<3`GuANjC5L=S3T#__0*u9j-HJ-qR$&j7uC88V&B;bzhxApq= zDy(1ov3!e>ouRQY-h?Vx0&S(h=nxQdjHC^rJI>X09uWL0dD8{$&bMq5B$0M`vmnz6 zBoK31Kx+p^HO)n>r(``$GpyMQ@|E`{%~85)g{*9vbFP4s&t7&VV-wQ;cpd{#%le00 zOH{hwBUYS3<+lQtCZXlvK1N|C5ClZXiv{5WG=7r!@;_3R7ql29FuuIL&MaxU3IbPD z?9iY(-eLTGcRe-DZFiYBm%GZ!si3oiQ`7?j*EFUWa4*p2{~y7EfB$9aDYKIQ&&NW_ z`v1R&zz!#26Ga0b!!7ZNwsx{SW6J*mbjlHIUIwT(XHWj z3c3Sov(ScF@y0{HOscMmO9xa{0u$AZ*3$p}_#wA<Vv& zeH!$o|HygGOY)x6_Nm=bSoTmgN?Gmm%kG|@``B}@&5@Z_9kg?q$WEbs_`ltAsvnze z8I=p|ksA#;aoTJY(ZPi<8{oxG-J6t+XT|KA2=M2bdm(b-?`uQP;Qvv6g7p#H2Pt1Pg~$T?s8UADiz}4Lv2wr3m)0T_{;8ymS$}NLGOh-OG+@O z%zrF&<}m~z>|LS5_#f}Q0vh%r4NiOxK2G}zGP?>^2lxP=YJ$N1d^vOW%{9D_*@}S zdXl((6`@B~zgt~BmQh8>XYLYaowH^;UesnptO{*dig<-n@Bve#o8$@|kG8>dP%K4+ z3Zb=C+gEymT>HD2n4|!SC#H?RGnb3=)PoAkAj-`QnZZeZ3#TGf&~&cQ3Omd0=he+E zEZfq<3|6)X!o$&bDJ8J; z<&?Ae45us$w5Kc!rW_{cIAvMz|NLWXWea|b6b*zPhv*&v8)^Uh_y}lr_l1-_eymY{ z9~oB0z|nOF9SQ!0(A$$dxVB6#J{+3ojO9vgT{L|ap(Uc|YC@(TX;;S(Vh{sFjR)5+ zg5#T#s(Uh$T*S%vy%j?52qfF7Y-fVNPxYo9!GxUl_8!t41F~2OG~7*HElz&KTuNz( zn@D~^{P1X5Yw=%qD3~j*K;r= zMf9%7&m5knHqCOtv3uvFU(>mYU};;jLeTJMD0p-4GLHov97JN9vDX*KoU$OH!bXBN zBN4~mE*a8AQc`w@#GvW`#f8%>SCWlJw;X1!xo}`7F4&p9jpVpC( 
z8dUt5$%9%gp)B>g_Zu;Tcx#?wB4WsHo=*%baF~PhP>CENTYU72+4J257X+>l| zR!jZp6ju!IY5vnMp8ibXD0FAU6(w*V>+|6ITMT2#M&xnPC>a|*umJdtkczQmoQLv3wyfd=B&tps*CHIBpg(B^^i4KU5cZLUa@}( z)@rXVVU=0rY6Nq0aNHRSQ@upBByp4Zfl5gT<26t5@GN~YKLCX_Sn0~1h7nUf~@);QeSXspQ3X21j?Nzz18X~dUSz(6dY?*g@ath6vP%qR=YtDa=nT`qp| zDHw5lfzJ(Mi=|+B7gJaZO~H)>Q0f@Ab;-R;gA_|-BZosL!)$3ALvn9TU}X!_GZO7{J=THJVzubA5)3c5o)36Jb1s^3(U0mtv^5@w_N=3 z;AQAqHh1Pjjl`eAP3hXJUNQJ7_gdtI5yN`cD_pbpoQx~yBGL%o++356C(O4kNUksl z>?2u-8g6jwa@;FIWTJS`s!itWwD$$be%gw0O2+6`RC2nZrEj_8caYrpd+oEKw`hft z_hjZcqla(E8Mq|uJ`xC51Q|KlpT)|XeQyP&I;F0J7j2K{J513<^*|e|RUSOkL6kga zplG}y#+E;Pt#J@ZKHDd78O=N8O&&TLvs~kpN0TsZKR)=T1VW&tyrcqGyv5}-(ZXrT zs?YclP0-t=Hn~E7wh@sSn&}Cc(C8IM9PD&7Zp;X7wT{yN;uWCN)SYn znrhxmKrrQJ-`@UBeQXmz>k@ zZ#l^p^m3Aj51vhOc-DCb;5P~Lj>esYk;ax{yi4Dewr)WzT|5u!TxEj}AZAK=^C?2( zEv8>=|U zS(@~2k)%^lRk>!a@~!QTcIl_D%-`Tm@ixMsU3dG;p*SxnE#!D3bWV~0hp%A#fTP!n zVBy7-CQHRJuOHZTHsp`c@KE@W^(B01u zULqeIXyIP?@lck=#W%xzjZ%JfbcG z-I%;p+Rv{aYWjELWV%9P-2ERxCX_&K5*E*++{9y8;WjlJT|Zt}TKMOK3u%c|+RSL6 z5xf;S1U41{U7hbmz2OjyBf$>iG@f$dpA1+d+!c4_BY3>*5(j~&>TyALnHU^~)G2+Q z(#>}OMQ9f3PgwXV7D@nRLgTG~pIe=hXQ}Dw2_aGUQO*RI+!CBXT^SuXr^&P>9eEf* zU_QmlAKVfJp5xy?eQV6PGCo4muV4ZXsdT;HWwreT5XW25!C7|%hZ;X~s1lano2b(( zcsKn03n#~$C)=$vDZ=iLS#tPE6zK$PWJ_rZ@{)X>pL(W}mgplS$u%5(g~nO8o}8m( z4%>W5B*Fo54if`6tT`$TmAn~?_h}u@QATsZ#`{Zarx+i%!6rLPOkZI&)CC+mbf5 zC&9)=thW=qYDG>R*g68MZ+$webdwcCu7AMUy!1*CzeEV?)(=9ZX`x)&CxOhhc%Fkx z=+v8pIE{`GP-ziStg2gQ zz)WnO{2p~;>@!)SpDX&KXPyxy_*$`&0m|(>i8XKsd!`C+zh)TvG|9RW=u5?G@wg7R zwwI36oauD4pcAP|{J3r?q!qEXdR@m-Q&UMIRw1CJvH!dt21}RaOyDG`JsyU%DHDU2 zSPemL4QE_E>2i!9uhS0jfJ8@hTLZ#_-BWD&DUi zjFluIDt~@5Wq47(;?~ugmQkRis{sbd10(eZw9%Stt8-y7Sk`xaNd$?H;_qyq?1`F6%F(t*2My zCr)89%><$5JAE$Jkrm6te<2Sev_x;;CfCD89@HsF6p*`&er#1DvSIZ8P#wM@_BSZzLgl5u`o)~7z+QDcF5!t7r{Pko|5jJHauFn6}8A#^sqF?o!xgQ%1zaECiW&pLZ5%~{6^b!sRV$`<)<6lq(1=GsOtwE zFLm|7#smdZL7JQd)@N$~Z*P04O>2#MiMkzT`*61*6d>xkYaI0MPAX zNZ&c0M@);eE$v{=YqL(wTvy*0CU&THE)KVbKd)&f_V2F9LnVQIWK-A5D1FlDId&md z%$0PX4*5xrKZlvZLbuO}^^ 
zShKcA3z1BSr8^N>lqf|-2?+h)qHwh2AvN{)5Z|VS_YoKpe zD(WRGn{gPSy%qi7=C<(u02o;kZv~tuqxk-G$uoGUc8T>;+ZZo9J0r<7WZsDYgVX#8 zZp6zg3W%xgP~==prb4Y|E%_JTzPx8coq2~X6?N1^QKSH=X@~Gej=^R^k=043PELgPtoPDLi%ZZHgrUj)<89QuBN7s1# zS$pc`t)_F?AI$qlb5KJ&eUtCok)3|3KlB*J!*pqC{~e50YF;=wv~GiUf$Gx*87}pY z`y2Wesx1zi9r{;Nn8snGZ@rlat+~;p`al0do;R|tvVU^T*Fqx(rGuC2@Gs{7le3nA z$h1&-d*rNT$*oMa{X2EZx?SSo8PQv}3bGS;5-ZC8PWBqU>OMARlQaHLx|oaNP{-Y4 z!{)mml^|rv%X3iTVZnEClYKS8J7!a#Xor+NCyZy8oxt*nlC9yoorP&u@=~ntGA%pq zad8*fVMnwDNw$q$C8dJ-%1NSfsHnS16uZbFEcSkzkJ_Fu*ec8G=f7>3xS*CO_x95F zXGFV3m)`F(#<7?&vFpG@D~WJ&i`$j_6W34nii>;vDSEeS&>D4c|IMxHX_|;rBY*^V zK&|7f|4F24adW~?^Pc4F? zOY8Ly1t8OIQHFQr<(z*)s}g!79uA81K4@-i8Nd3d=>1J%xQV&cp zlvh*UtwTB&t^4bmRc+0luaO=={zFUZRwV9dk^k`x~xn}lEu8jh-^SFlb4MUpk6-E zNUS8LCfL+w4Zx3Dh%?@ZcW&Lc&Fp(H&N$^Y0~rjEVf#e1PYNoQ4dlynyNS4LMx0zc_& zbI0JF%>O;>{T+538 zXIHLisY{ALp_4V$a)Z^*zxDG&)oME=9=f@$G@1-L=@vHT(mPn%j1a}5(QciF z26M+--AbD~&O~MQ^5fP#fP`$XEsZS&e1ic3{c(M~t2r7XmZk_G4xKJc^S%kah?dQL zwu}2_=27-6qkiu3A2{!3ArFYFTPJH9)xrzX?QI&F($ z>X{Ka!!2!@UL5D6C*Jn(MrIxAe=S@4kT;{)U-V$*Ld1ir?z3-g2AhT>k_v-GuzkYmm`Pc-q3>zO~kG*r=F!*&94AZ^7utT#U^>rB?PY5<5*5sLFd(*_R`rA_Z}&a>H1=+& z5T!3r5E&)T6k^7PdPbA~?D$hh#5}AiyojGqrntzu#$@!_&xDZV%}f0%5;m6V_*ph| z86QaiHZh~&-dg@!U{nLwFj!fv9w0D3>sclb_6&ZU5^mr*aMI|v=42?ORNsy}Ky$Dl z;#)~}%OQ#e(EQoshNz=AVtrn<>{c(;&qjh)9Nqf*qh~N|qv76Q6~kuqbOCF<^&@mO z*JMKAZ|Nb~>OTo8z$XgmuGbV| z_^yp;w)FncO>Y#aLthc}K8u=MSadpq#RBwsG{FxoEos{$q)~KP1~Fn$+#5hIsUf1? zlYW>(gaZNhbs12DNxih~s0<79j4|!wBsq(Ayocg0#NGud+Of+H+dt~hf@edf+-{1l z=T7?5F9gc1{+O}lNsbioZnJG4V0gc7?*ib<>SLqBi-L`IJJy#AzG5H-7W=m4L^C<9^CG$7IH z6_EsvVfQm#wm?k$FWuKy3Hp==01cOJJxnV2E90n_QWLF_r!VidHGp>Cumg(e1LqxUy#2q1Hl`It2{dMb99 zUHYH@=bo8(oVzi7pVGUxs5Mkb`AvPtSztUg{WJD0dqfk2e7D95dmkdf!X`f5#Q?!C zS}1lG$(e~y&xBa|?yMnzsNXglnMjfNK}y2+L{gFKogU94?RQs3GjO+C4%XmLRR)~) zf0zW{7i8}ZMWAG9%71<#{C^EHAWto@RCNK*HL_VK`7ar$@VRn+1? 
zjZg*$QJX|tsD#{YCtM*du{=;>HTB_&OCuv9RtOqAHbct7Wk^=@4bNc>^T#2X8x(-p zi^$F0PAA+4Pq-P(4dj49EOPmt@qH#R#*IS-b8GS+Gp~>yQSc#GI8WK+yTi2#*7zc( zltZ}~COH1YR|>s5F&L0TrYzPfPB}%wQsz*ka5ed*2x*Du(d}|!K;^vU3nbY~(=cp{ zt7N_D5LQYTa~xxQ2MeLkbrK#q)EmrS9eVZ{O@D#VC1QC>Tuxcz{gI6{fo1PvTX_`y z`K3Pl$7ew9%hQ{~+6Lg_82auRP8MNE{TzbFF3iCNc&-=H3nVSNyAirpm1I;9+>=P~ z@2+!|2>YnOAXCks)(3@+wXL4fz&yQg;q?Ao$)XG8p(D~A+&rLFtzD2`Ch9Vt6KWYG zDshn$9EKsaODQcNEs@eFVlOwquhW&$4d8hpZu5P46Lt!BWH6*- z27!vi1f?aePD-O}BZ6w3-fx3No6!3(CoeNd%|;9H+VuVf6c^kI2JRT>WA^LVhKJfO z&TDnPG}%^rdQ^XTJSk>SOszj(`#DK`tFWl!c{D&252OfEIzt|RGg2c=^xzsOv7GXV zp61WZylN7iG&fTGR?pRP(#dt z*2{ZN@^K@!)%8jPN-K0|ehTpzDkEDDok4H3?^1*->>Or!2Ya1;kTc02T#ngW*Qcuz zYaCl&L~tOi#RR?b=Q|SG>zLb?;Q(F8mXWo}hI?rApN$Leuu zc!!*BSWDAQp5p?h#C-=DGXR$da5RuwjEu`IfclkD43#Jy%uq<20ymH2u6i!Yuf!nxrsAR6ER3~2&k6vdR&l3RI( z<_uuj3R$5>x3rHk-#Ci@KG0H6|YPCp8KGPda)+)FcQSU?E~&AAu8nRW7pG~*c1F&MQh(D>5#xvH5a z*J+G`k7C0sx8(PDeXg<~1(0X-=lZB9>`-S#y?K1qA&T@QxS7w%AO1vQD;hQ?t^jiX z7{JolXSm=OQ+`}7^$;+H6#}Odh9bxV<2f{+$M5wKPJuR}5T>;tr$j&ZI;b8El3#o? 
zL&$&nQ7T^v#yE@m@E8S(;nJ=#C7xO#hX(Y?%vikjC&c;PrK&8#ZF1r(W;q|8Kr|^C z+-NFGF)0A-hCvrE1f6qcuHKvlIUgT>*Xp}}eeA~PoF}8;rb`f#p}tF0&l|A#orAQO z=K6^|bO7BHeUP!v)Agf>;&%j*UZ1s2;@1q=Kuf?1)g2SAqpbK}zh9^>*r9Cl>wRMO zDHZXvDds2lf-HSR*%Y`IZ_40);O^ECFT#ZsU_TB~Q0YrX)h4j>H5jx&1Y{=;oHKi8 zNumO>Q|sM=S*!&6WC3I-apgh5s_QBLdRGVI2;u84AAz=Z%ebt$V#y6W_`HZH8I(_{ ziNt6?mDeZ36d)ejN~{7R=XC2y8X#VazAafXU98V}&H!Tv3N(BTbtBD1iMK+Q>QRuc z;Fd1@VaRxnJ9$LWK+<#5Apw@2Ds^g5rR!nuync@f5xGC4mkMh1aZV<>7>iht)pm(z zFmTRhD%%W1VkS@)<;9@-vwe@d&Fh9G8XoETb4arN*N^|=o0(5?24KRJ-%qhfwXPd5 zkxr(WkY5A^eHP99wbOSIyG1>n{*A&rEHSZGw6b&+$?yM!QX|P$ca#05QMinzZ(?fK z4=tbv3o^uD(VXXpNq^v~EMhVNiki~Yzp zJH(O>(?3n2P3Y|+MIAxJ5+w<3PRB{IP!?dT1T*!*G+ubcDF}tIrsi;zgFpt;Eb=P# zbr4?>D_6r9xY-TZTgG6#ltWa1i{ei@hTn4by+*tJ>0k2m z4PigdSojP?-+c36TgA`rG8ZR3%6%Z*3OrGVn|dAL%PuMUAY8%_gBeJAr(N!UBed^a z^Qa+oBjMpwjaNs?$Q{zYQ8=z{|2dV}@sKw}AB4KhF+^N?fb?~w_NdhllvpL__puiG zquwb7VLjahT}aBFnbIQs`~7Fu;V*fV(qil_y{gz}X;=WwNts7X1ceu~?V~!B!i;ck zwE~5{itpcUx{C4A!cFng>_`AKM$+ezyy$6lS+=dmo+lrrz|Jj)r|~GtjlOS$J`_QQ z?}q{<=&GP{d;#39X-^nOw?MixU+{f~;_0ELSOk#4b{%LJ)^^Uq+ccJqW*x;oqZmnw zt$7D7gC;Dk5x3U)250MWDu_!80L)WB20|NGIK<0m4%NaOzl6#^XgM~&)ChaWJQdRQ zv3TDi);*@D7ge(~-DlCeHEUPuc$^CE0TnKOKDgx~y8gzJ{pm3;-Rs|T6i z#{!OL&!>2uI7lT+uO0*;ml(G2uCkFiIyb5?YGP%%OIO;5c3EW@FgHTFIDSsNEaGl> zT0Z8IA_v$S?s1teb_(}zo&TL7A`qBMs`MM;YuJaUah<-8yM2^9^7Y+Y5{GIVZJGM? 
z@J?Dx(bJSA&Hbm)pRbCNG-UqA0_AsH?5z*A zoKm?iW<8|BTxQ`DQ~1dR9*Iqr-qfo+h56q-cMdb|Kww$Y|M}Se^APBy{QvSwh)|aE)6MjWZ78x(*joea1h4QI88V1A33l z_*}<&r=z1IOOu3JUw8A*WeR{s5ggkrCMH&&br5|hRgBb!W&tKTM$v&7_!f1HDM%MM z9>wmkU{gjvC_5Ye^aJW_X@c0-@BJ<&`hKiMX&&rpBwBE{RU0M>tKjN;N(ZTGCNQrj zuiIqT@C?QnjMaehw5I3=$Fgm z#BrkUbIKUfoD-z{&D~9Zu+jbaP~OeKQCS*IgND8)G`mGxQXwg|D0$N$qDahG4Ga~D zK)MQUG;VzSIwIzfUlo(HPCK5p5pAK&dl%ph3A2;A>z9-$FSSSE&AqkUMOWqA9h*Ut z&O*<|o5GpT|Ix8fnXeQ~=L$!ZY3|r5i5fz0a0&X2F^n8l38~C9AGwJ()0?1c{;Wq* z?;)4Si-hhZJ?9#mS6s_BvYUXY-g-y3B;!Fm6~d#vO-~S3fDFYySsG>wcw$Dw%9MDN zf{3}goI#Lodc05%lL9Q&)r82th0vob6)1ioN#zPqYqX#%_lCGj>s4a3ODM2sB;6{8 zbThZ!QyKbugi1rn<0lg~Q{A*Us`{5$8XQ6q9pvdC_bCOabu@0tE=MNp3xxMmYebMe4L1~w;I4J$jBQ8o z1}1|KDTJ(G2TlNItv~DdCZ!IK^f^?6UAZ0L%v%6rB-|3e?l=LUG12oRW4j1gBl70D zQj0ces-`Q!5<%>ZEyA`oTXfT$Dz9WMJ*P>MF7q5xrBP|xCl;U`?r&i3m^CCtG$Zk9 z{RWJMStFh>`Ai1mOoynh|IVt`gn|QsYa(&$1k`V_eUe3KWa4n@X0V3>b1}Phg~Z9m ztXfyq^Xi>PY7xfn z%(5wmvE$^#SgLsg%#90Vn z+%=krVL#-s5&AM0}`?}8abAAS7=QItV84)9C=b0vApd~_1 zr2MZh03iC4nI#Y?(S%l#y}3kzlPdDG5)AIJr9d(&?lUn-RunANCvs%wvMnquT2&if zUY|s+iyNbT%h^iw$+L~X`zlSubE#mrpZUoj(kBNauHKAtSkt@7r`axT)fg+A;2Qq> zLe~`H5(RRXHxu_~Ny2PuUd?Bbh9nu|9^$TY}L9VXQAGgI&3@evZ}4U`&=so}Pp zF^f!iAlN5c^aG+l9tYFR;@-TxyvVXDV2Zbw`dVWP`_Xv4DY1)hcQJeE^RVc?OquVc zMkSO37L!gpm#Ez9pV&mPAabFt7xK6_q7-O4q4mI>7h&mGJjA%NYH)gzk_Z0b;{Ak+ zP@FvJnCI{YYfdZE(^gG1`68tkZyBG2kGLm&oB8|?5d;u7%py0WlilC>R9K>yD{wyc)#8vpc5;s^ka03%j%i<>8=3ow z))pn_`Sao)w{bAj_dG&v>h z>h-c_nVcrV)E83dp~`pevaPuWWa#4_V1>~b0TJ7r zK^lFI5t43l)T|nAM3^6OIvn) z4dzSzB!+CB`BIW^c$28%>Ah6GG?cZ#ljSG zOh!K<#`qk~Ff4Lj|Fpwq9rS*Y%t6Pe$#un*GYk;F#9JXw{jsy@F@?-!jD z8My7-!D~&#pr%A20>O`oKf22@;91Y_dqn-R^zl5``CNNe7hZ8OoJI6a@Z z`8io>>lfOgPce?Efv+k>{nZk%2Fa)rhDe}pL|7R%hBUUZfz;IAzkTsMd1L#`QWZVH zoJfbdz=7I^i0M@J>D{BbAmtKKsyV5hJ@sksMd=zY@Y`wEb_f5-ituZaJvYVUhzwt6 zmLul8mFP6ys~MDU7FC{!RuEmRP)x>^TuSE3eZAiMPs{@%UHyeg_F$b4U`_Epgw6fv zWabT1k&fzTGyXR#wEh39XoOuz&9m*c*fl7`4Kv^i7^N5i+-uUuK@fYrkJoG%Yt7tj 
zdly1(?FuK|cE}a8sS>CD?gUyLPCogC${;~f{RE~@r)M#L_{BWZjFV?HwK>GX<~I>( z*yTdMlXY+hQ+Q#d%Qz8!ycBJo=t*iT-xQr?3MAC3=A#XxpiKm54$Q^eAtFq-!qZSj zLZw9bq#5IB<>ybNP~$-=1<_~thliMz)FpK^P?*L#v)dyM_Go-#VrsUHw98SGNYkIg z*o&BMm>%nJe&}ZQziTC;0w5|Mf;p7F6*fjF)w24dP-je(eAc2tpk8LX?Q z;&Lf9UPxBqbPf^aAMCS0*WHX)@KC^Ua`kx(Y6RWO5icLIc|Gj-Qp_8cBO=U-C_rQc zh+}Z{;=DuJjwXqTHKJa<-n)4)6R5|?B%wrLAiQ7T^w2Q0^ArsWh ziuNsJgQ-q!6LmW1buC9MWM0_&#(Vwq_ue8K$k&D5HfQ?6td@3+(<_s^ZZC+-N4C;Qhba!l2`o#2^N4&L^-hS%WhpUAT*JIan^Z z9J_$j#An-J5(ZT>h9~Sx%;5>EX+4HEi@{@PuB3$dGUl|2CAtp(gs(1^hs;Lx4GE+Kf6*|vqk zNqys8H1Kic5Q^{7)cx%>o!Hc0%l>|lc^gR5u@1FHwP=9f)vI@`z-WV|96)%%?_`IB z%Ho?9lBO9n1Wh#3KU;>=3>YKtRL@t;QuGoP$TYvbo$$?F+fv2TZ>Sf*gK98}y>&iJ zCm$VbVsjHg*UILTE!w2?R6JeKzdlD}JyuDa@#)p*>wbZl<03j`MUga(-w?#yOHP`Y zbWDR&z5A{295!UrF;NZ@0kONAc$;ZC4Zbw8~C6jCz z-73EXXZJ|S%;X1>lS?9Xk>o7?k@9N@{=*8_Ev#TX+8#+9SDWibWV&$O(obx8&1}#4vlLsh;t-53Dq6~E3fOZdHeT?aZ?k@;xwCKv zV~pyN1Vf;+Y@aP9Q)0P$Rnvc)%`9Rr!PYfui;5K{F4B6g(N$Lkl9`{kg=jse&l@tX z4ohLv-rM^aJRlZ}SNwjSK^m+mg;W_Ii=~#1i8}|I^SbYp%!RYDRBWO7IX#Q{cA-x+ zX1M7wJ|OG7^jwZRD0r;mE|zU6Asf4WoGuYAkQKcTj0U^sv&i z#wC*+8yG*H&q~!>_c-<^m$%^vm7MEz4GhdM7RFQASsVsIMqOMaT6v>0c4@rLdRj1?ydm0#2g z$|k;MeUYEq3F3soSW5}fNvT}e%sUT%bGp(==)D-xoIr|Js&HbgXOE8(_?84-wBKw! zv2elGYq6U=xSZRD*KwP-*M0(EP8~LgkSpJ#<=TCnxiTbY;e)6Nr>h|K%9<*xnk&ES z&xQBT%<#NbipA(AoulvV*>aI8+~qL0{Rm8QUmG*T-Qy!h{B?+$5TTd}1iS43^Au$h zubkbdqG+CcYeb~OoT!(h4dffh{1_k0zU_OCMx}hi9vo;iLdk-wx8Yb*n>;Vw@G`2~ zP-B=9hcPa9LC|x8(a21sDV68F)0JX@VLT*H7ooD9+j?an*dhr2ON0@VF^uA8H|Dp` zB5#C2+{z1Z$0!la)R4v9KmHRS4Dv@cb%@(|rQnwrbAG$CZGSq?K*~-L*R|N`K~_Rh zKn=`Q(=7wka-r{c~{kDj%qlWLjJi->H@U!qjb|j{3!pj? 
z$O%7~9Atz1{`G0(QbN5cdCpn^tB5GLK0S*ILJUB!EO5B4cTcYhq(;&tj1UR1_l#Ss z$OT@+G(Wl3we`3Y+;r;(nL>gnEt)9}AwnP%aO~~@F0Md3poH+*2-YAUR}lHc;CJ-K zuG2**9OZ$J3${v_ROZdt7iTJ-nPfd@t63}O|FkX5K+!&7Q}Vh{i|kO*kl4o_$5TV zpg}EKkcg%xu{`GpJekjtH%(%GQ8(=(f5CKw%ZN zVBC%t@SoZty~ucfoUzz1S%x27JAs2iY;c zpV>%Y7|eJY>tMb)(U`A5ChD{%r7uL_#(%!T3v!5r5M+2r;A&sDQi8*MN9R zTIs!QLo@<{OM#8KID*1Xb~EnxY%-bL;`nO>1_7Hj6Cn+cw1CXy-EMs@?sQ$HV%&32(c7D1mIg+Q?0P<5%1WWjc0=sg zk-AssZmD@U+`M(@!Qo$T{`&4_{DXeCOp6A^WeqDjSJgPz-&i*OQF9LRWcE0adB|G( zIo)K{KrCzdq=irP$#w^Iw!%u5^cBELiV@7IgV1J{!I(xGh;)%Jxe1r5`V^PM#6(UG zXiw>*MJ`K^ohs`^hZz|xIR!x+qX6=0#g{nD(}Ut1q2I7QG|ag5lh3dsg^NJoPn~cJ zSM5_J87tXx2n8Nxzpa$$jT<@VstG2q#mMYMxf?coa+h(IH%MYd;^ThW#l+S+v{?q{ zjBzT2B_$n7;pL{n4noN!Y`$0{@N9L}{r9`TvC16HzC4zg-gFzS`$g>|ClV2CTQ)a3 z`XX$+B(3{#$XveS#Q4xJ){HQc*iCHjCKq8zlvMu5f=~UBj)ve(Q8gNQ??Lh%7bQmd2atxGc zH+5gXy`@X{uOYVaOPwM&%OHI}?rXQmy)m(qeub)M(rH7wkyxl(7+&@;+#%3+43{w@^8BHXiQ9|PYnaJ z_l$kqR2EaC8xs)`!FR`pSNk$@`X@`5rc|;7X%hQ_q9-ygTa<{MyI4vw_eu-L-2@9|U6coTg@}$BKC2;br2e zz~3O|&a-q==lqa&AYWd0p={y1PYVw0#Z;EK45O+HjBWe~ZPx=DJkWUraHKVSc!>G! 
zHAo(SB4-sr-|OiN8NOkX49m70V$BmkraOX(+|+3woz=g zE7sJh>#!F9K|KnslPWPP_^HY`b6$(O`Q^gkK$I-ONUzpT3vaug-rJX4#{GV=0S_~P z`V>FwPd`%iIRXld7NQ(kd63_>D0UHKG9FnsA$EV5rs@gUxEaY{F=n;PaP*kR^GoQvgX{PmugFrOj| zn^}E#)cA22n0s1*56aQu9Bg9`hH}33Q5Yp@uP7?;3%I3xTS%;-sTg(gwQ_dVp!us% zUL_eedWkAZl^mAeH28be!Et*VU2?5y_JIQF<(uuHI{ZvFR3+G@e{~>A_>&)EF;BV{ zu#}8fv&kHskGW#}ke|;ShB^A-c`wBqKLoviKaRj$A##a;G?{N40e@LHKfWn5IfLXU zoFCJqi zeWvaNN$zTaC%HO+=dh8(xPPIF!XE68Bo89XnObN>BF3iS4o0|*W)Hel+(}lSb*1f= zL#y%ijJJvA5bAIO7s-%xgM6Q-SNXsU-3TN04*5gEY2+8__dWv!IA6~IBkM5BOgDNM zKf`&N_}65JDwKE(;--%$V=6NC)GSExG$ zC-aP-o|q_&j&R#i65le8Q&U=a{6TuAn+#MLZepN=Nv9YlOwMgM49ISm!+wm!$X)a3 z4EK#A9C)u2i2}WCWlI7*paJ$VT&{-TP6g;1Qy(_n&uG(2OvGn+%@AEy1>@zQWu{s542Y=YzS`7RUIkR3C|5@*M`1VyqnFlKGbN3ckEv$`P?kG`u@Sz ziEdL$dtNVE@pW=AE7D-@m?2|Gap?A6AVU#BwvEtJJ1?8{F$PLrlIe~I<`MgRI<<_q zbME%gFgD~whkAAw)$^3oTE>bLF-BQX3-YxOct#4`R^abT_90P>VrgojP1G`9d7V5# ziTh2NlA^r|uzeSCo3kLZdY@!nGj&iL@r-aD66X_(0|w^q2NH#=s++jDe{47OqxU?Y z{pU4(AD$!=C+8&bNymZtl5pV%sa#vMa|0ChrceTUmBEy49>E zCBu36*=ui&=9*Kvtr}Nq)~}6j(5T-ORXZy#{>zouxhW0D?7X{2cg`1ja(G|k`+4zk ze_TzDO!d5%elLEb3|iC%Rnu^Gs^0DUjB?DX7{=CD_YX31k8aX3=%udUI%y{%s(O(L=IZBV0pahH%po0EvvvM0?Bls(&QXug!>$%`idmlX!5L^Qge3!9;0j)h2snuAM)o&rJI*FM0DbTe-85# ztGXAL$=-U@53zxL?R%Q-DY+3j+zZKrZ~;#%-wB9+mN-m8y;`V_B`c>^7iybmSYJ)f zP|pEf1-y;pD-JQu3yMqUa-}Gc8)N0;S8}ew-csyf;o5OYs*o$k)gx)Ut&i-KPL445 zPPr>BC8ewBs;H>Q>ECEQ|JKe=pEMg585kPg^Y^m}`E$)YcYhd=T-|!_=;1F@;&!dQ zZRat(a?Z*P_pVrOQ8h;d#O+t#|mvb&9W>1wKke9 zf(N!O+8Mn;77J}+RKu(|`M(xg6fLv^s)i;N&M}G>+J8OHb+fjob{}w-z~9LGEK#;= z>`qs3Hu~dDt{p#BleztVW@Fc2U4L%(l~!yPMT65dO&l9tnlwiF6qMIeo zVAd!ec~5Bjn|s{1@$TSCdQJMxu6lJ=!iaheh3hH8tf(7Dg$t~r1=OVXL~nR-O?pIO-t|BK$$@%xg$pUd z#g~8774zFD8Zgp?Hj=A)+tg)f`pWkhpc55z$M`b?k)MZUuL#U;0qZyp^qwq_5kE8y z`7BHLnHaLm&LH_S_CcxS3Lk9%E_9?V+t-P0L0Ou~;17+O&}VeN7WII%alc5ytbdj%9;KUD1~bagNEeUfNj$M7LT2{Kc;c3T^igRy-bDCY zDUMBon@_e_BCdpLv}3#`c3b5?UyEgOvcqaBD`6Em5)f0yx*tm9iBa#!izLw%B4oh` z#(^4|IMG9JaRV6%W#5j+6OxU~Xfn9H8HRGDT>z4~I}#ahl%O;+R+5VXsd8cAQ~yiLf`~T+!;EOA1Bf 
zVq9f!XD4L97(1%j>luF;X`6b1?H9=%8lmu>OlXkLC{^)GI1s5jPh;%t?Jt=BoPVFgIY&mtwN4d8 za!X5dtX6^<%+17ffJNVZy}dKkm7-n(KoTQkFuDsGSvOWVf5gi3p~@Tx;=vDwGR6I^ znc|omERR&^l4GyPrwAfURDvMmeqo{AZlERl`VxG)^*ba&acfwGZj6Wxq(xblcb{ zxP$1lB>DF&@jbOw6(!yDlST1@`ih3aNj!K5**LVqSn)zyJ!5rgoE8WW_80g;6(l?g z8nF3kjFdkry49pUo$BT%U@4*5krrN(H+&Q~f7z^y(3GZx{zlSX65sA}KyA*qWRtVb zM2Tqp`YnOOaX$4Es7C3f)n3$m$_CZ34Cd`=`bGPoqOG}!^EIXzsPemI$0 zVFok3Dj9+Do}~}utrv{2@BIZm;VwsUm`_+_GoCD19q#apJ@`<+V>w106mqb=c{=5XNkv2olTK|NS!U?7*)|R# zIMS(jSfLsq2g~uuDlrhZSIl6H%OO4XD9KwFI}6ncZKq?Ex&TO6!Rv%ZIjJf|iMxa{ zHGZ>}P^z@O?OCF6h{&boNkD|Aerj`pnAOG2#u*~caL=*>ABesEN8dmW%*%nbJIgba zXt1+oA)$Q7J6s_We}8gG!{L;DdaA`VQX0ayAaUQoL78zJ@r$sGo=y)OO_gI@e1lk# zupLOY{jGzNAJG4Lwa}1`SduQ=l-LKh5#^^HH+A>BuVEKt^Gp~!4NDqOn)J9Q#C)$s z!}ThE`Rz^Ke7u^tZP{CWf@B7y%xj_aof;-@JB`c>`v|Rm2F?iP3<8X8t%2m#J z2d{;X$Z*_O>PU(el2wv)`_15MSZx0xz%*0(QS28rk}rCHWjyX4pT<0vS_}8sgXV4e zNfRP*xfIEA<;I!$#8lzOPXR3q;Lb5^W2Xiaq_c6V`D9{9N)?~=CykA96_mjuS>GOn z3@*k1=!#1af_N5Ej7Se6=q6+1GvT;}pff55)u$HHnU|^Nl=A~3(O&%m#`C$$;Ci_C z6uu>iKA;+~lJus^`YjS@CVeQE%kB9v-n~3>sUWfGlIa_fXN!dLWyA$?psv>C#a=rCB6~36eMAXQgo6*v(^rO^=K60*-2pSwXA^6gHf(k4y{(Cq$A53fZr3;ti8g z2m}mGPh)&DiUt)YyCpi1Kd?9e-GsIX;UysEMz)Yew*L-kxLqhVL@p2`J8QH$*_h_eP9`rC+|G|(~H3?+UU8`_(v?F?aWkbLE z&`Wsc*dJ%J*%wp>2rH5jXr_}D(ej@nv1I%CMkE*{MYJjXa}ju~;=bI|F!_}a{J0^V z_oAQyjv9Bv;OqzjOg@KQAWIaY0+l!}BcoQ_f2xf&Pwx(|skzkGLXvbogBQl=t6CTJ z#zLpId)$d8G^7_+w&nH?Jrf^VauMKN@zH)#8bSXo`cC`6tntVJAioltj_VCFPN7Sd zz7{5zX=@UB-x*;!XqVE)?VDpnrV`kDSc;0u`jGElrt+jd@j9+P9&y zC5v`3ToA!3!vpq!W46eIlVMByN%4W%i3CPl;E6i|QZnC`p4wJ&=z_r$r!xsf(Hm@A z{A?5)r9Px7A-C-*)7u!wSz#cM7aO;pXX2y?BfAclUk$IXi={j8$cIpm4M1nD4b-jTQtWrG9lBp6Z!AG^6oz#dJc%Lha6F&Ryu zV%(tYALLDJ#wK9X&G|U4Bh%p5kEn z&*i`(YO{g#A@JSru8Sq5TdAPAjnEU5?QLx0P7?rvh9)hvpYfBdgw-QsO;YB^a?wB& zUOH-htUO2D4ISw5EIDjkl-ls5iSZDyOwb)UKA!WNm@w$B^uTS@(kjrMV^rZ8Lk1v1 z8Dupu){IGqNaJyUay*qWsU`+00?oML7a`^<{7#He-3?Zx5*avn1DL<{WJ2Wp9;x(wc=(;j z$OnT}y#x~mhUihMj&>%2u7j5HyUCy`8n2I~EP2A8t&p6yF@ni{vUSA`+)k||4K)GC 
z1}Odk8{?+u;Jh^Z?i=Ce%}4?!k^>|m3|+td=u#Ripnbc5+3VCO1IC32<=sc&?Dzz} z_P4_;QM}Rv)t+xTBoE|K_gBNlwuri=M{h2ML8A8Q9TLA9lOxC8aj2GZ_qsD)O)VeA z2@4eM8$y@u8jGG7=0|J(wu21!N!R)5;LAHTH1@gY%6Q;eGAK{xp3(F>kHfIiE&yD%NL)VUs zWqcWyVQ4MCV8-DV+wUzlu0}e-H5+rktO+@Gt9S>9D|SfJ9=5E z=S!GxD{(8WoUzleScOTWnIkJ!Xs7^FsK(OK`R@0T9AU(KyUABs3|NZA64GFL1G89y z5=@+c<>4v(ooqX`wd^T}mYaCOmota@I+X0{s@vc-hr?8qdUh~*W+&?gN|6@vXI^e3LPi>V#nkpJVTU4a zem|$%Nq&(+dXRryNS*4q^!;d|i4AhcpVDSXl1f7tYt-@uADG@Ztf+H%J$`~Z970il zubK#mP4l#&&DhkuT^m0botwX-84aM846PZm3%P(#sp!OLIIu%=u{-5#HKC z7v1RM;vy9)H6~U=?%XbOS^o`#8E>06gSmsPV855xilWWTG4tZ^1C0S!PpIaPi>2^d3L{J zqFZ9*x8=MX?ciY)y=NgnQ>3wawxOpU4SEVAW`ycXiS9{+JlI(WT_SSPo3K7Dm3`-Y|+^wHXqu zh*RDerhSEmF!F;QuL@HVYe*V7ZJhm>4w50EpXfc5nsjQ*CJ*7L^Fd$7cOU4%;4ffC z%qdbJntGa4qYA`@WwA$l444A9E@BlS>AkzEb<{nP* z;A}W%E-YUH$4H8bKhbPjBW2`BWzGO9K450Y7PGb2LkWG+vxNCfCx`*sC`k z)D`L^nYG|~L_YCFe-@o^!*!545cMz3P#hquK-r)zVaO6w; zzvO`GvAyjW^SwUez1H!8Pr^ri{K-(wbf3AgafL?9RK`CCY51|!90bbeSH1^wYs(>4 zhL_`a&ec#ydSU8$6KT?7!Ue8q<6w$8$Xa{m^*XOfY$TNMrCk`JFEHVPKN7I(7v=}J zbR27J3&eO2k`Py*Xu@?!M4!V6Wg^ckx(BBS-6dJk>$g5UqVFN-2+WH7sEDX2E!8fx zN~1-XpC)TAeiDdR7Ob`X;|4+4IOi{==cs?i@p%-6Kp0e0a`i1e?A@qxAZ&2fSU9YF@z zc%NBKNlqco65LNPLimM4nhO-CVc`^lN{m!Gl%0Z5wrBk70{sG&EA6Ajod@sh3!m_F z8^U3mMLs4nSx8CkG(t?H`OR~8O21+6u31I1gkn*~<$@<2mMhR4D}W3xBXoJ;C&TGU zKZ3L6b~!)d4uNv0nt2k)s25(H!9#$*tj z(cGlIrZi?5#|fw;g_?MlF$?F(0r*rP{q2#Nx;W8cbSbC@lL$JbUU4bT=d`kKFs;o7 z+hqm?3z@%Q4~jPb=0bsvdp+OeAyErjd7>BE+}&xuwI(j4ZS5)F~-O2?GEl1VM-Gc^yo~;uLR= z=L1TLhhG^V`p~XJ^?~;Qb$xz@5^_ zdw~1|2?d%cosha5saYv1bUzRka2otdI5sv4#qNV`Q4Svrq1|u-$tN2!veL7v%m&6} z1~Br+KqzLCwJ(nU%qKXzW14^5DN~gcFWNtJu-|>S+dT)BLCyU*<_#{z4gQtfqwSAl zxDZ$}j7^QgR-G^aC_ZhbIC*R0m((*O;|#LTuZwB=(FrF$(vQt1k}J~f9LB@NG7%qP zBq)!`kJnng+&=V)F%{Nk$oL?CXv}ZiCZY_mlsd)btToWu?p}?&xMidoE62g>yqan7 zrW1%3U(M84V4aQ0flsKtydY@K!7!n@PirKnNHe2Ye4gr83HRATJX{qNJlv-fWs$LkJ9rRP+*_Qx#xY}5cfuxCE7 zf1oB9OX}t$2b3I&&*R)%O2#iaP|N5w2U>xyd5+k9P#TK!${Du?I%HTX653cr1k%}T^;kktGoW6d|bcN 
zGcRtXgG%Z6FWb=QSp_f_J8SYK;4j>$sHo#&uL&;jq>>omRB>&$BhgdAh8xvz(MKrngGY5QywvaJdaI)n!^z68lCl)$*ZN<4nTyLn<5QmFCQW%H&h{Yt+3(m8 zW~+DnDNq`P)YKnm9@GJ+Ftc2Ow&KF4-bY#@jC9RSzsO5&Z>jRjTDvmR;EVi!e_0P< ztZ{)sKz&j+`%GPvaag1STgiaTO0gLO00QUb=cWGLQs0 z%%?bX5xpSVjt#J8DRoS$K9KKO@*m%d|GtGrXxCoCInMB**6U!^vAWEmothpXhovCV znS4+}@&4~`hMC$*kg*=@TZTEzzrPvTwm)?QavtURowBQNvY*zmsT(^<-%FrZWEIJt zY255t@@KOUm3y`<~|5@rbJf7S}QUiu&}&-BZhjl!R-4-dr$T6Bp~AX535GbwUSGMjs-+*Bb~;!qH^@9+5C+(t_tC4aC$+&g3TZ z=f(lC34^?L!f@xB;YsM(w+LEF%rVOKbqk&bYfG^MFZlc&s{rOr(m|y>rV4P$M<58^ zeA9>%env_H&6qoQ!H#eV1$smMV*YZQMja>7138EH{%R@@w$hMs~zeH)Fdok5QHovu7rSg;wcvMN#g^d zvHlZwF(0r(qfg#$%npd9u-?Y`{Z=Q8=X#lDm7hyc5Qim!S-_`LvF2_OaD3}a#<%zQBCuoa3}wJ=k~ZQe+SXE^Ob!fi?}Y-6614dEcjpE)_mdx|KM1NFCC`OCzO4@^_$X)0d7WRWhG@3)B={5?bTrnDLwqRA*O97JbKT8 zFS%(EK4j2Gcn*3|3SUAEOq?@|o5(=%kq<7e(`S~iE2Ov@-GKzc>GQGaG2h|KL5IK@KmjmJ=3G3*CRx zLMAvz$O!`oTLG5=5qb>kS+uSwJVVsS)lk}bwGa!%zm5nFG(WQLr{MxJ8}cNrK+m9dA!rsa()eRGDG=oDUtgAiW70DW(WL&f5jFbpQ4rdqH--h#8oKb^oG!H-z!*?S7C= zf!%s@3^9D(8#@TJ==@-JO{TFm<8HKxgZzJujtycE0|NE}Sey+A1spJV#-2{3L4KJM_P*Er! 
z!ZCYy-7}&9?e<|@*uqk9iGHBz__N=}^Ftv<#unik8}__MyDhSB8MgFmuzm)JGLBqT$PX~~na%6S!}viu(xr_d9hE6aC62>DM^q7oKH!GT{5 zrG=7Bu6slZj-5i|A6?h*WO1ON#9OBtf^NGN*0Mu=@FjbqZSh6P$1~jqOf6fky3zCn_BL`j1bZ*EYE;ayYky?(9tAzr;$iCKp;I&bRFo+6Wv`zQ=r zfI}=f%@RF&gXZN^9GfB>te~mPJfQL(sPTi~R!Xbf#-st+UI36GPj!t{PZI<}WO=w) zl-=~2iZMMHW$F-$R7!;+R!-F9psFfi~T)?+OS z`Io&t3I*6w9*j~;Bo1qyXsMsTj!!1S_Y>jsu*<<6*V%dXf`7h)nkh*LUU(DN2-~s2 z)DwP5VTBQWBaF2!pK?^(Re`dE2a4})pxPu{Roc7wz(Uje`bll~)*XkI!x|&_=~WmS zbf$R)ed`BQs$AZM)n0G2_0PBR&|UH|F~&;WF2IF6zj`)lQqGSoSs} zqnduR4p&qhkfND#WJ{x3G)h5m+Naw`#%&UFDc-I%>q^oj4?bz$u?9+$4*9P%WsYm1 z={O$1O~n6{|BLXbrs@9|d~o^qzu^Oac1E?RMjmNMiOt!(?yjg%f<|)q`uk0C*x9*b zSJ_F5*>5EJXZ7-Yjj}7VhB2U^vnBJ5_2n74dz7%TjH>eO+=+qW{-wY5;rYrhF9jI- zhv;)3KvGQaz&HFZ(cUjh*5fX*=s)7lMIKLgXk%Md=tR!{(qvlGXJA)>n${Y60nMly zm~;l}KO-^#&ChFT&4JG)Cz8On>`-}^gbBn$koxsnZ@2M9^fD{P=j8{y$_j$)I}Klq zZ+S#Bp=)TZ?A~EYq&r8Suv=;*h?)KwCKyB=fS1_iD<%&FwW7el(GqerosJQP#ZGgr z^xgklGAsaQMHD)S5vl!>U$|~-aD}GvARVfEF?`28k!O{_q#c5o%P&@cWavY=7sq`eO%3GqslP*W(jHM%D8u(GD zm`Pmj1a&0`V%>SQ@SHhEUe27@h%@tBG6!4scN&b>8v9t|RRC3IKdXK#q(h}v&b?PN zyDefXiJyq0BE@Qk-H00u+oE&DbD$Nu0A&EKp8h4n1*B=IDra0l$6rw_>YJO-#H`AG z$quQRXTixJ+4{G__2fe|IWmdbav}|qwB+O*RN$`Q|8Zg^grrSpmym1ia*!S3&+Jm% zBwR#h(KOPGGt;Tcdoe|!1T5AU0d-PC>`H7W0oITVbigOLM!g zqcR|MbQR#NkI)5DfLmDc5#lz&TItP>(FU%hZC=}K*Gz+V@)pSMqS)95YgzJf#x$lU zSKCb(8;@P_akztDiIXu^W6K_FDqdt=h2FPQ<*x_w?~9>kzYy>N@a6E zi0rq^-O!`t0Fo%f+oryU3C1DL1i21-;!Cc4?+$9}NBAX}G)|xs2(o^c>@G;X8)8$l zc_A{_7=4?ENQBurN87b@zJeT4sr#!gFXS-A+`~~D+}(`{0#v;rAiig_)Dp>=$JlEo zG^26i+fKEOrtt~+8af^fxI9b#Q zL_%QtMwi%ed>S*s`Ch+n#;NWP0CM`uLtiu;Db9O2cyP7oHhhp2DOdKMM0D|Z{_l;9 z_w8)OtahuQsl+X#3NrInXd*UVe;%6Zw<_jlHBy55_7rkPXdKHmP7R`m55DFJ(|@lW|LVx}$Am?`49 zZa^!!-(q+D#Bs}feo^ghb$&N5eyMU;G3l1^(uo-<+gEpPc!67Y?kJc@{vaOcOQrN& ziRJh&7_&VUjsUvKmu*fy5~eQd4azNp1uW??38|EkIl@p{y4P$xs+n>uKeCV|DB&r}uGzhP%`3;(SKX4DssT#j&M!!w@H289ve3)w0k>7$&;cyDi5_}tt!8XT= z*$-^G9si-)&Oz;DkUGA<&RHYmc<$gXumc^6SBz$b*Wnzq{4n13rMLXD-wIx)Rr3o<*aR#p6ZY^3cuXNm3(Tv_>L%68Y9IqhY72jn^)nT1a7 
znE|=C11|Y){D`~#!(gG_?pU_~&DC2G|A%qj!=kE3tv0)7y;;&Zy6npEs4Uemqwdj? z&ub0_{c&5I=apSaxqN(sQvIf@ebC0!Ei6@54pee9x!t!w+5h6g+gZm^#RZrKoH#l^ zv#l>?xG#o;#t8~oj2FrEuXelusrWq0il<2Ap5!NhI}&9+8~nPWI&q-|a~z1dH1O@- znu1jGj?=yl0R`tr#&+DS4v8Ah9;jjRL5v{E7hZ-OWBqdWdTIe}IV6w%R4UPb`IMD< zrZhhXz;XC}cg9q5J;?5iSh@{XeERMc;8jUe)Xp9UF#H@0)C`;@=9piv`#6npVkFqu zB&v^`DDeU8F5*&^3T~DtdVM7aJ9O-mP>x?5-qGeuH&ta0U=kH@2LT90jWyKh4%!kN z1IVAVWPXR-%;o+9oDxf*v6BtW!IyyfghS+=*W_hJN3~NB&Gcz=7*&P#ePT^tX|jAq zH{@66!7fwkkl+lWTiX}wh#kNS5L-;Ht2QR?KdXEW9aTj-cLIeTWUNVXL*tuV-G~0W z%>0=&@!1m>&sgj)Ga#*JWPG%@=;(Q{(7+C;SQw%==q%?d*Oyb>9kyVro^4v5q|bhr zidDRUTkdYtOL_~o-Vur5dM!Vg6U=Go{8}z!mFsC0uz|zUxd*4rTrNO~%?Y@mJUl>T5)!@2wkFbvX8_WxwBx60Eux>8qMKR{M%) z|Mlp9$`p||4AXIc+maFRNhff#!i!Z;d3BkuU3?@FE+keh?sE`Xkk=J!PIjk~pUgIOq+FdF4 zrF)BBZJc0|W+`HTm@F z#YWK!%O+jT<3F<99Yw)2dJo$>EuN{p>4Fld29Yg?G%;oXJF)=4U!n|2#}LTh67(l6*!rTp6x9#Ch9jO$60~`>AgRPb zx3IX((tT#HuAbf`3<(O#WtoKb(4+Kvf4Vp~9R)w|KnvQp3;s>$KaBU8J32brttmyI zRao(co%2DN9eJCKTjEIDn_u1qOR)R?*+@59=E!prL<@rd$KMT@vP*0V4LIqp^WYHk-?Nu+x?E^NUiiFn-x|XKNu;ifTDQ$SDe<|)pnztTIY!q1$ z@Kv5(MY6`v*hjLvfEsODjU#=5brsO$;DGNalZA8FOW1;n=LG4TtjJ=&pRa1nYQ_A8 z@5qHGN7}@jh~>B$(ju`}qa)^IMaIeW!kB6b!8%e`9l%*z3dEQ~x8F6$L!@4g0<%-T z3sC-4c<{U`&v>(GhK+-?oSgRQ43xU|;je3eV zXkd#wp~Gn`Q@{TPi$)wOUn!HG67u5|k}**lfDtsE?r7|8ODZNQs<)M3cbv38t$ou8 z0DViL#tdH_KziE!)_a6vG-zBBp9INYXs)aB#siBX2PnM5B-t6Qz$(zeaI6-wEap;= zB`>?(JhfF~jy@k31`SQl))B?jM$%$)+M;Zpr8JdYPZ<3D$2DissCbr^8Bof%aaHHU z`F=`hWg}b2W4yEr>U3=Dn5xtQLi94DQ6w|LN~$}^Fr@TI+Ja@SLY8hw&D2tj z5m)d#r5XOiUF!9yR3XD83c9XK3?_LC1DH#n&VTN2<`w*GxknNa#m&B<4~(?2Ofa*Do5P2AVK9lC@4IzhL5S?Qx-pkQ|3- zxztj32b82#@E1jW)6^Cq>OPo()$w{q)3^4eZIHJZ zEZf_t;+r%7r7cD-yf;(JQD{yg;{LLv8axi|V8g8em)Gw~`zCKu5$#p&8?w7_7GwiQ zNYizli6n(~fVm3gU6`$B^V{i9SM|bH(1->#;_LDZ%r&y1X0M85$`8pnNa8EuLE6xL zBq~hyEYXoH@s-dd8{4JGDhuzk`<$LjfI`LZ<-BBVpSUl5HdQzfXJC^tp49DMzx;FP z$&Ua7wFIv#&Np_)VzU|dtBJ9?Z6zMM%<2@K8Y0{RM=|CMp$DCc#p6PYY+OWF)U6ufppD;0t_fKbVu8BW`>Q)( 
zfj=TF2il%dOO{34;F&8AIq26A_mOOptowoI^R=0Smd>d7JSh^}ME~-L$j%p6)L)};NkeW8ot&hrUyeu03OXUKCr#j}nh8-aKYhX1ZzriXEs@BobUcro*ZZmIpsab& zO!+e&wxPKq1QhflYT{ zlZZf8wWi6F(;;%EL?@G+xXcc|*%+5E`>4)A>_7valMCeKi>Xk?Q%5ZClHwvWD0H34 zwtKsU#rsG%x3-hwch*Hxph{#FO|)FbDX@hryJ+Z!vZAzZ^z zBWUJlYhW-$a_qd-Dg)`%_G{-#w@%sf$+R+$3+?QPa){Mj~$6c(Vnwr zXlX0Cbh>fcAtpFbR6~oEh_#dg-*JnS>IBVDPn-7bzeG`L&GLU@=tB#$5@h~~p$lz; zm@fI*&nusku7SpANEXkg4@jH_^JxZiXX+=$|H{rEWfKMKdP)Uf4y2Jrrv>e^F1?-* z$v0EPVT83QZG~u(72>zyA44%y3bcH0j0%HaY&J^M8N|708i;mZty~7$uLuDm_=z-*_=&BC94X^$--L;D zJ%H1|-F3f_UvjapfP*70hG4~y9kt{dLe3W)H5k+!vrF-Ka2K0H`bUh}&XMw}W6qdj z6%@SIpzCmE$JDLb7tOZ`$DzL&rLME?JQhoB^+0Ek({}Ubn7exe57~&sSvP3S{Lh)F%5)pj_7H+f`PJChNPr_qK(AqhuP9?TxQOMs5-?c6oRfFacKAq`=qb@)i)w? z((qu5pM=c(Ib#KUS-%Phs|nBEFnRhj`S^kno1ear8Z}L4qJpj2s8t>_VB`8!^3=r1 zmnT%5nz;6o#4kI}K35C#@bz}s)%vPbb5Oga_2AIWI}yh+-?4eC8mixx@G^MsllN`x z4=(Q|X8cmVco8c-cuaUATSd%;!r|Qqa@?PTku!G;YUYA3-S8Schi7gLR2L<8%a94$ z7or;R7zT6z%= z_j6#Rv+!bvsK%iB2nc>0#tL$XJ-I}4NaiRBn>Tc9EugkdTNsR z9+E@f?(xs7ygUuVY5)A*ta48{j~hW|<^cbt>&w~4wICBCD9I?}#L@bGeW{W!Y2SKK zm|s~z35yS*GN`4e1hw*^kpJlBJq8q75tQpAMeA3^+aJ%sEp%=2U#Alr>IU;go z0<5aZllCJiwbFoiUzaRN&frCDJ+sFy9X;a`CVie z1S!W9G&`2Z&dF#yE%%)EsDfsN5{0uRlB)L_)4tvKmVz4hC4bKXxY}}z$+b}X0Q>$4 zR7GN^P%?7T z?Ii|Z7e|Z#+S32Jsggg1ze^vXQcoKQ8q!@Kb-izK>OI*srRD`TmquM6!-+==&V75?jU{r#(?!bqc|+|*Q+42%)?4`uA_kqUyG zDe>XHW8Rd(SS1jdFyz;jS?K@xIBV$AM_K$rQKqVT#L)vaLSeXkQ5NgoDylIwr2)FF_+SHNfnE zx&t<|g0Hu>_wmpzI7`1@^!La9>o+>K0u{eReqKJCsvwfs znVR*Z*tT~n_*RCORd(xp8@9n_#~*EdM}9*JqCYZU)_ zlfQq}k)(G+-l!ES%Ak|FTH&?0pIjt?WbLOJE?8C{?JfIF7Hsj;G&eUVh@;?4VqkU* zTKh>g1NF+z(Td>SJ;c(o#(TUU=&K(lWJxiK3|~dt5`7G%Yf-;k0MJsFKe{m^WHy z6}=`FnDDr9<4jN8!S^k4;&wfcDZ)Ot#&J@qUGc*qur|xat=(52U9phoV-(9vKv&+$lSw>i>Aaj$=Vwz~*W*OL=O!#(!>h{JTWqG)HGW z*7z}8^dy^l_b)(TT1>jpckS=$z59eqVc;Zui{`yOoa7Z zcJQTuNP4HHraG27iays^L;my$Fu`V*#({W}eBAu+W&77}W)EX`yX?PiJGlitDy2Nh z2NxAb#qo33(VZk1B7ZghBDUmP@0uBKEUWik+9PGP=ETY8^F*%aeIh4oWHv?CHpUTy zOT`LU>WY6#P(7^V`6p0F(^tn!za>RTo^xNhE8y$h~ 
z{dR|Fha88Zb>??`30{cPE5yeia9+$!u`MCoOev}xhe{5%K6xKKj(+LUkb$~|x93r* zh8N2ftrUg!_P0{zc2A7g_jg;|+j?CXHAPig^4%hU32VR}33|u8j{L23E3j{Z zElmZ&MaA!}*Tr||M+Ktv4RRes+!~*}b;7Z)vi_q!xcr1~U#Gu*Gdpw4j!(Bx z*GZCJBIq>ANu#$adhbwFPCw0D;ikmh1rIyk zprsaDtQP`q_CMI|+^~Pu;l5@V1%3oDWn1!{rSnv`^4*M3S2}2+(l&T7CnCE+biI+5 z&)r%r#ZeJOmSl`PP4(Eph_0nO9(%lOPFlSusfXRyi_?;ir+)g>XaCoA`1@BKFB}8q z@X)Hol(9-I+K#(aqog`6@ovmJjBzQd+KxA?3y6*5Q^SIrx1F>bSCG-7>CAHG_W6I< zd-HHA*Y|yRHE7%oDk@UD+Ek`eW|9UCL^4}KQOT4F6(Y?cX)u%_GLM-Rg=(Y0m|2Qa z2%*T3@jWklw?CinUhO}=$MHLU$9uf*?vG77YkAi5-1l`~*Lj`Sc}|1*`eK0h9)ZK3 zO~@zD>Z+0@&r`Dy}Rm!>I!Pa}$q56P5h?JLR>haeF{rE~IF1k(H)gDQ50_y19$58)RdA11!nT901d=!+)}y zpTcvMfu_iZOB;svQOn*P0FlXA^}^hQw3)_X%_KKx*Bme(b+M)O5Wi(xmFS79B(g*! zCX%SjWg35y#C(sVBcpjC1f7TSvxk&OmFDZG5RW|U4`ZAcy1#3VX@tUC5IpPZ4r_ee zJG+@o->Ldy?gn8*poECX0(zo+DJ;xd#x*@=E*j@tZ~BXoGkBu&NrJ2rNX+b z6*VJiHfu|l@$2sKIzQg~KVN@S{Aro(wZb8z;TulKExWYU)aD=%0G?Y6h#(Q{!L1mo zI$_0{psti3uQQbd!1>mAS%I@2OoK6`3BG2pJW+EfhGN*VdW+X7t1trRD+RDD+j1Q>WF(0L2RhhEzOGbYiZ=OjEYHt*h=R#VPL#cN z4x*DDrO9~<>2bmgW|W(>m`-)w47@^Z3&7%U-unF4sMDXn|I_=I4;?1t{>J0}as@2+ z^5WC$)Eyli`JBAuWK%w!Dyk*)lXkG=j1gR|XR=wgLrFKsU($^(Qu=EeHi3MVFf`nEvJc#6RANfJ23 zjziE~w}-0r|M};7D?^ zhw*MRbyu6d=RMX$t!+F}pdI5Tt|-BmVnV_HO9$)ix)Yp&2sT2rz%gIVig+h668boe zIFib$H+#;-bmKn6KZdAURm%-_pwZDt5f8 zY!Os;N$8;(p*wkNf^P=sseTv5Cf+kxLEpC=LELf13pY>qjWBXsw9( ziK?UOek>y!Qmh@qws7D(=5bEK&V2vK^=-1s~#;>nN`W zr;&C0F;=d{<+#uY9x}iO+r!ifuSSIEv?LNw7sCLjSczk*TW~pR1HTuT_&R2p&NIi+ z@)g)bSp@54C~RT439rL|RW+WI4!pFd%cv}^^-v7>W0r{ zhC0&fXc5lBHg%TY>AY;2vw{y7Qf@bEM5ES!{0X@I6@As<=KwC$&5Cu%v$H=AR_Z~= zt4(LejvYFhc{h+5FQ} zl!irE#ATd~f@Qb(yv9a#vxb=0wm{aCY#uwk1)_zU^+Z?t!GK$hoKA0+Lg53)#ERKi&T(T#|Y3chtv)9_R-RE|5F* z;AcO(-44HHDmlo(dq5T)nshQ`9a(CCuh6rDQqkaYUfe0a=TyCnhQHIp2RH23@!fiOMZN(u$AGa(hc51 zE>cY{$f(}#EV~=CM-C+HhXX%;=&!G;=jo{Ym~Hc=l?2t1ze#X$aRrBlhSpGWzLiv< zXzbv90P8|qe+Fk>8hR2Wo5W2lZ)0dF-~Oao_8gblya)8$dn_Cf#=KSI1eroS@%8{w z8fYMlnu*}#EiL_(HwBu8la^ zn{=|B@So&^%CXE;emZi9=>q|{ZTk9eo4)H^iuaYP0tn4>ym~1?_QI!E|9GdrzRDN# 
zI(s>j3zy!Opsw9Y`(m2%{EBvX_;`Eg0qEg@fKThwpWTH3hSu36h2XA@Y0lc_b?Wvi z13fLXt9PsjC}(@fax5^qTo_ujE^p^B%MloM;%+9s{Z$OIMNB{OUx>$_7xi)axY)&9 z9{lG^vPYhH63~tFjAYoM4if@}p=Gd#^hVGz5W05hv*udU|_jB>i}|zrM;R zPfW1aQH@?_h^3UzK@zdTuc_H6>FR)E?jf z%)A}{wc~hdB}d+1o$<`T2Be7yS6!f4Fzl?qT zW)ojt;4+}GnUZ|XKdOOG9b@@o$?tzFFcQWZgKGDtwJ#g12IS2>)69PITWse1v@`Z! zg~eZ&ARqjbh|k#W`O}M^!+JRV@9K-ceV(za_=h&$|ILeymaQrK+ar`8u@5__QXavT z;lTYd%JoLlf?D$}C#)tgg0`_#SMHy6AePd-%Z&eyGl>LGr6^*wnyTpZ0;+!a)=I zM0ex4YAEKV7}fxi?v>Mn_fNaQn^RJ{43n?Yw(Eaf!(U%-Anq%CCL36i|0KOrVHuJ? zAbn|9q72Nn@{K@s)mwZ3)w(gym=sUy_>YKiF=x@UP;275wR@! zm3-gI&hYN>Zw~=e`5I^CD1dC1=`Vxl(NB#YauOv6Ua}^PjEubOC7_)LkJt~RBb}pT z*030RU!@kMQItv^9eMce>;ziWX7+)PY*NGmrO1!Wd3<)6I))wyNr$+He>ZOZeBz<< zIam1G{375;@~dPlmP;5t(Km;n==40jgDQOxBwjqaw;n9LxA&@84w(|`DSJ0wPX}aR z4933hGnC2=FpxNQszu50rR?E2+zbiQ#(e*!;g-*&go{!)shy6 z`_}QesXrdlkFWAc6BB|i6Cjib6eIQn@D=-}Pm!46DkG|9lRt{9z7+Fz8(9Ir-6303 z&6R(=@~^LBYvm(|o<-7?5Yr{YrHFtrza7xP*R}OSY{VGUIZI|T$P!J(fq}NCVCiYe zob}f`{P~S@$5O1i1`^|-5!$wXWVzfx>RVH~NmduKN1oP2V~sa4Ow!Jc(ul4}1gXM= zwMSwvH%@Tn9HHEB(uP&p&>SJx6Ge!Tv>fh*_~e8na#-`LWJ$$?2I;{OU)R^x0vvN< z1gdN>0X>f#kaYiSLP|i&Al4XWXWXHwd<0)dkJ``26aGM$XN_3 zW8!fvCAWMZJCP#TQb~mlEZZl@!Sh6! 
z-I{tjd7n3{oF@$3Hj_}y85e}s%ra$#>z9u9g;AOv|F*q(wgQ=EYk zEBr$}f^qsys1gdLCd1s_3%x3!OW(>)vuRreVHB{|&6iq?%8CHeS-^$*2UE zFRec-D@(2p!T{8CBlA7djnzp%c7O^>MMcG6MEFE-Q}dt?MK-m>L#M6;Gn19XB)yd} z{nH$xo{<$tTR$l!6FgPHB)tv}0WDFP-s>Kxej{UPJ6R09jPX{8;tWxdgZftJect}H z5-sfy*my5x?~}m|M4}hd_|-(#vg?y{nEnHC)lha%Iwi z{05o&?<$MHSpG_;Y$ikeeo*EP{2H=kc;!|2Dv%&i$lbel3t9VbCDP+{j^@%26 zq_faDfP%gMt5xG-X>*PA7>y{&xgz?}(>^&H8PeV`7&&4+Nn*p4LeH|^$ z?`f9l=wPskXCgQ+=D%NE4)=?YloT7cW>8|*Z8NvqB^MEXx;JBBg9)P~fSFOHWNqcC zpIHBU#;)-}3p#JU;Xa`Kxo@kUb^Jtmkv9T8=L6F(oLsW|O6OF@G~HbTxAiXfv0v!F z!w9<86Z*$Led>HMIxmN~# z1kB0CSjzolMcil!pZdUEf2ez)g`uTHI!r`*UIe_|lIZ}oHX7&313QeHPFRq~4+O{{ za*PSV5qJMy%8~a^IZyu@K6)%P{dpBMo0&a6R_>Sv@ssjZ91RM@)8szB=ys;X@O*Lb zBZ|)d&dhh{k`f!KQK~NpA5EUhxImOep%FHBhmg;%-xpUp&SB)*2Y#$u`Oqj!; zFT=IbAnM`iCe8-OEs0qx7%TU$Z!tN%v~JtH28Z#^n@A>v0%1%uedy}Co1?=soHteZ z*0&s|8~T>pvM>{Gc`|k-7Z}W2ATE6R2oL0hMrA1n0O5ssJq(L)fH=vkyFYVTfd~UY+hIW-qPvv zoCK>;=M6ZaPdAt2$K^|0C_f@c*W-ryl3_} zJu(n$O~Z|2QWT(ezv3v}9Zz#)&Hlh@LBYuu#zq*}0|2L1)@PNovUO5=Q8Gbqd|UO@>%BY;~nmdp#Bb8u+4Q-7_OZLFUJ07Qhv|r@obC>&pTDq zC0^!cvNLiILO7sju+hmROKB9cUh}%f7HU!Uiiyq*KNe|IV|Ql;GKCaVOX*AX$0oe~ zhyf-C5`s&_g9c7ZqUjA7Z~FX&LuIt&&)&2QtZ+Vnh~R!8-&;&3i%?NCITw(%tB?PN z;olR$b9fuy9$8vidOYxS8@Ef-VZPTd9n9`vfVoS1TSw{}Zn0uq;=E?Z8xA2&=@O7j zdxULMF59=2q3ZwRUsv>PEwr{O?-@n5GOg!C#@lWCo9pL}m;d@!0H3iabGOrIKo~=- zZ)H2OV7o);yB}@3ZIbrKLI5bd{wrqJM7xv+Ccg!Z#k$B2?Z-K_EIE|?QlLZ310=Kd zgl&EfiicxGZ5U4-f=|ko3A1F^SjB$^Xn4)~6pl)lc9ujWXg2n^6lD>Z*Z6?BkAq+b zH2KTed@AY3bPErbc5z#bG)krdhZhIGcQqzj)Ff@`V`+D&Zl9Ro99tQ0xP<7aZ_sTJ zG6*Af>D+8CSjukP^NJwG=Kiy6)92K(G_&RJuIlWV!B}60;jsqHl3XRPmE80(Tbqw7 zHn?D%Lu48`67Q$070N$YwWfvS~oC?L-c= z4^>OF57U|V%mIRL|L|L96F2G&!n2Tp3uN6kkeDX*U+?RuikVmDAOj7C==`J^T;&Iv zbwPQz8KKd^=UJuKV`wz+cD(yD5Y$7` z(!|#}6^J}bF{^LyK}a$_Ufg-nj?W6y(TR|xi1E4>~?qdutM z{3dAfcwM#3M1KX;PmpLqf6zbf^!F$1yeZ<{c!gG);lPc(mupA5XM50(aRG@I1Bwq9 zV3}2Z#il0RyoVp(kY{@37p>QryL`pl%(C!Taw6kBPl;rI(TS{^>$)_UOs=Xu*2<^g 
zFngur7&4#cMW?}LefaSaR32YQ$vF-63rns3`*q1D&PFDmj5MYHrFPaFv~hHs@d<8XodIm^aR)!QFE3gX=6W-{S@dK!RiPtfBU0qK~H4Q(OizPSk~+h zG~w0TJdY(yKnu}*HXl9&ZIOw>1Ih(S?eUl2??E8sB(mV-tDUIF)-U^orQLC@6W6vq)-YQRo~cV3 z{i#Phc1*00f-Xb|2vlwT(V8)&>2(EN=;MLN{&Htw)(k_#D~=Tnf?*P1&2l3dDQL6K zbD^UN=!&=PBh0VQea`u;qL2DnE*U--Zm>Vm|E z7z+d4sW=I8ZI?iB?g0k;jnFcaYMucJO1j;gtSC&7EpE#c+1_g7M6-(M=R`0j8=GgS zcmiDP4%~F|AWjT?op{K%l&nwLaXO!%Sr=d*4WI@WY)K(`!DnT=Ht>KFV)gY6_*gDD zPuC^0R7ni!OSoNU$YAVxCy`6Z?6X0@@Vnwz0Dp)PvIp2w>$Ytp=bxiEPIJL0uei&K z-&Cu$Xw^?(I|uO(frpwKcvEObcFd%h&TR zG|Ua`-!g>zN`rUDYow@XA?OUEV~_HSSmJ|f0q)piA;5Ig3UML_xa_4-8VupSHEqKz zwkVDA^aBFpBKla0Ez9ex)xM8iIgX{>mytAom~Pm=38T?VX`pe;vg7u;h{85cPtP2P zm~he&?|s5=%m@wac6 zRA8=mHu1yB=D@ib!y@iSq}DObHk%i?c;f+A7>vH{2Iy$qGj{XYRQ?$Hs(`X^U` zwV3|j&1U`IZ0h`rXa66-vwy-bVEhAj@ZW~fzYU{*qXDAkzsum?W$^DZp!DH?HMjl$ z`rKe-LvP3UqG;*SC8OIm@lMQ@-?;k;W4c4T0onr(@rOeufqQ~=dq>CCojX6vv$2cJ zRvq!^tI4hpY%5xSqM=TvuShn~rD1Etq(l;-^5=2pZa`r{M6^$^`}1H z3O_aE3I3^j+wPw^PeuOJc}nA_w&ed_E-mq(fA22chJP2szl(wHz}TOE7X#gf|Bpn5 zcK$(0=PvU7vd36z8M>`E9Lcr31s&C0XhOj-U0IR5S0Isim^?N`v)mM{hZc0Nwv*U+E0nEX z8@UOUqF3!NeWGK}D<>9EBW0_lFSygn98a`KP8F8f$;W<=pY|Q7Nf~0pOfuj_ing9~uBYCWLls0CM zJOb`z3SM=w92u>AtJJd!BiXVU+~TqCG7rOHJ_XJ-Ve}Yz2U(srgZWUr@a;I~0WNS~ zXkPtEFEF}LOef3DSg_{Z>8J6nYe4`Ar#J0)Wrb{m$CWa-qD-7}Fyck_JCL zqiJbNz$7pwrS6jTAe?pCK$$2^tWqcBotUB1_X8PF&*HK%WO>-kcfF=P8jqz3=N95jMMe|8y&fDd+$^SP*o_U=7dW(5 zR`&wU_ZhNd`9*Q=^cqJ%q=gKWugcUT} z@}?KVhzTEy_v}5xHZ6^QHOb)do@U{r$(z%9FeBBgCuzW)g$vjG{tNHvCc)%_8{xiY z(P!I9r1C`|OIxGF7gn*+uMxxy6XGy=b->{;lr*NLWkS>OfP?T~7xm{i^5%5M&9K6D zJV^tNo@r4W03EbxX^!a1{o~Iw$%6o@Y06J}Zy)rHvMBC=8v7IRpIC}2FUQ+6pmX-?*aQc1Hc~V`%T_c}3bK zTvcEWK1%DV;k zm^b9O9B^PIJ1MQ8b%Z0?Ol`ZSI%JQ+cf8-j@^Xax`V@Zhcu!y0Kmx_G(F@qgYTzG$Y!h%;i)0RPAPR>O#eLWVazc>P$` zL;H`rR;D6jd3X>8O!>W^jrvG!Ex*^|?{|32lutGF;H`ZZMWF|!t{Y_)PD{T~iaee8>F(m%f~j*qOfmVr(KrgXL{jucF(eEHA{Dq=qk)}i{= zCm2DA<8-n|M;z?YUo0bi%NtoaP<7ATTf6_JvNxyB-ZMAkW-(gVg8P^P8Rk&*!W*(j zhR{6Ufzv11XA(mr)hzA+rB674E__3}76F}@HI9%=GerPtfoDX~YdEUh8oQUs>JA#u 
z0Tb7T&aLFHfS8Po>WH{oWahqf^Tkk%J<3vqt9gb^1>uVTHF!^zLMrSIgdnb3I2pU} zdT&?XDWZ^^^(OKHvGKm4VHO5*rvzy$wcrlC7#YGRP=;JserqWD(&B}9bf%aAd}$UM z=FQ|+2HURiH9op{CRo%hkf|!JG|B1TS-#|`s&4t>_S*Zq)Ro0FDQTtKU3PZ5@GUg8 z)5|fb*3==+kS{E}UeWbnSGfyTu1%WCNlJAg#iIyK8ROooAqTHGZF8>%@65trb>3?X zwpl@>E?OtGLNQgaHTc%eQ(*Cz;^%#{K>QMguEjS5@cm*2P}EEoQx(bL?0Jj37+q0oQ*J0-+c5tY{D@*gizhf|5Sy_DUCr--8&>x`CfR8}jM6?j z+fGRr6X^l}fU@hzmTXrS`Ku={ZZuficl8QJ14h6 zT<cXclsww#3J( zshaK9qO>t?B11^sBI_a5F>q12fkt!4B*(+&Z$1nD4y6KVz0GQcXCg(i=DasJor>;# zIJRy2i{i^i`XYIzud1--T6T67sSF5i3AshgB!_K`5;e`=7wL+549=VDHyd}P(gvy- z2hQlv5CfAaf|OPLs@z8P6|txeaPnzw4{P4|L7xm}Yx*8Q8Gj%Y4`WlLAR{hK%k}X^ zxEcs(GRuFXx@Je`!Y4tA;h(@p>d&4q+G>7=jq`cl0l1;2#?}e&rp{rcTsayxd#T^K z$+L>`e7M%$D~EDI%Euc@2OUbZpxk?-iaU2piDrR9-bm+^$(C2(lY4O3LnKk{s!)^{ zBglz=9XDIaL=g8Zdfm3^(ZPf*L&%*umEp_BnfGkj_sH9=>v~mJ=Df$k%$MWT-GAK% z=l%h|{ut!~{R$4JiW?*lU6TAC@*BeQjJ^YIKkO0)~gO&d5tzBm7`G2 ztvMqrqE)$ow6D^dRWEEq4mJFsC}3Rfea_FjNb`xzbJ` zagCQZ?38#_Oa^cqOiR0;JERHT8v1Au2%4_%AUO8i@uibi9FsVWyi} zzBciF2YkR@suY+i_E!YD!+w&4U`^SvOoxkICg*k(Z_onpT;+w@0=1f--%6Lsq&5W z{#4nHxt)Roi#y|2;iZmC5_Lt)UQs=m`WrYE`QnC^D~6&^kB-!E3B1>4r1UaZjpsG1 z0Sw?r1+Wc}5TE@sMaDk~RvELWQVP);AH*qkB5ocyDj61qYR13EkC-8_*PMAMa`7ty zbm%dc?@v+EEtff&5QSeEx5{X^lBCq*c+6HIThxHfn0$rX++sNQF-`P-smoif9X2Oo z>AX2B?mgQP%up2;sph;pKJiFws!PhS`U{L9-Bg{wMOy< zw1IU+w+G#g;K|$6OH>~(b^q`O(7g_0U%oYz8KE`aIouA_PT5v&OHUpPEwjXN@8C&X z)O02Kfbmlh1+P^ci_3+Mff^PEtY#ueC9F1dDnm%qx|mKp^4Bc`bfBM>wM~etGklSl zsrBVwM{dQ(6dx=H8(&r*B(o?8_j0sg$(hG}+xAX9&b6cm_7ME!wa1&;*_l0hhuH2m z<-50qd2d2;Eo9cgTxE8a&i4I|7MJ%Sr3n{G zt6oOt%2AbwR*^7%agLTiuD`$K$%Q?qhkJOX?yeG#rg=BETH}3u49_z7NCi+7)9Sln zJYMX^SGN|Jwq$^0qYjN}ema(V&2B@rHWYQbY$n8B4mcJOL>7>y@v`hJza?f+bo(_e z1fOF*vnOa+&j%eT>*vu3fhu%LMF*ILy)1k@>id(peLmM zAhR-kj?pgyGoMB^&KAVEk{UdgF3L4b3@0mYrKY&H`IoUTWM!fvai+Sf#Jf9`+m4nb z8gGl(JziD(YqtqFs0Wsa#%`rS!rpa{Zc)tU_NTGHv3vkGe!-Gz%*3^&K0bZ-0;eL` z3^Vs%S18@NuFz9xge-nbanM}1EV<%-Gxuql@zd`>=ZaNkMT-Fqdm%n=2!Bnlf?AQV z-bc>PX^ibPd0A7k2Qb6oI`Yf#VEyPFOHn^Q+)alyuCw%&x0rvgrA43u{CF 
zvkRWQ#}m=HX%-5&CDUNCi%n(sSps#FM8=52Z!?As3pqZ>5YtlY zM2`K{M@OR;_}vBqQ3|GQtzdlyiGq-IbyO_ub2{LQj(W8T5U(C4JA_ z)OLFFDP;r}y$nbbCdWUl%<(rkZ+4p$Y7L|gi!sDGtYGR?!_4kPI2(}q#1v<~T~HOpgVI_7>*vP)Ht)ysm7oH|Zb zzw*vVeHs@Cu1dA4m(zQginJvhbV^CMcG}e3(kVEq;JCm%FQk)9HV@ZtUUchJ$;QbC zQnf+nRu|fh->gKq>^_V^>_mQT|HLqoYLd_M?;*Mb@7|G{`0HA$a~2kdTnrDxTd|>8 z_57N&@C#p0%XtkVZREFq3nMD83;T2TE@!CfM@MIR8yT`_TKOlb^EmLZwy+F*%;&#{+%+$_SgaIa?LRrvmP49}C+44t+U<}tF5`5&!;nMOJ^f@S ztCUw;TM$MI@;bWewoN=iAh52ZuF^>LuLJP2pO+~v9-O!DJbTG3MqDOVTH=~>W*oLP zm>Xp#IJ&M1f4=GEeUv5TX_mF4W^JHea~|SYLh&nYRvQ`Al5x_>>;)fO6aA;yM)LlV zLzOGSfsb}5eca^F87aM!Zw zU3>>{r&U&sraBmkXGDf#@B)K zhPfBdR#*QzG>tJ5h`dE+J+dV>!yV^^ayETKSsGDHj2I1C7QY}EnaUQisdY!xB5mi{3(=32Ru47vUpj)=%#GuIh}$fe*A7140GQf z6PdhnXqvNkELYQWGrKn{T3w8mY9t<(=*VvL)~k%^L_ueG-0}R$MwiYSYAi;y?R%ndi=a4n|=dms!|`{OKZ>Q@8Ba_G$}BeA1FQli$jt3^{l}g&bUZdPdJ1 zRvd5_t9+id1j&-n$wHLfp6)djbLdox#Or|&hZ~p&!V|K>1~U)k3!OB5ytECZ^bEc3 z&un#Do%MWE5|==0%%k+^Dd8DNv6I%D&zRbK<@c#bG4li^1&T2em83er45B0uc#hdt zOnD?5c68+|%Rq?clZlGDg&F5CW$~Kd^g?}BlzIZtsD3vxUjB}x*9w)DS9ZLt{7u|4 z&&4fwJpa>WZh|);ndq-v{%1Wj<-Q!K+}80fQ(sI8xx6>x;^o{f_Oo6R!e0n2A=JYR znSF5>3aj)(NH#1;bgm4GQ!g=VoV?H#lPzC1-2qmxT5o@yDnS#+J0;JgUnA@6PkXlN z2akLQeljhMpqZLSiWVy^uE1bqIfg-O)6W>P|8f-TI>Kx${d1r!9l44hx05YefDy2E zhF*#CKKACRB!#;dTGTT7n3oIIr-?h|4^>FLy?ts`j+RWMyu>&5T0^6>0** zHQ18s32|J{)&%FtpeUZ6whabO8_y+8xX5jUY15tLi_r?b<|pRMY+^Vo<=Ms_Hq$I@ z$`StKkMnU*`nV6~_%>X;+~&sfnEOn-yWQlYN8SUbHb}^XVP1`F1!ib&aV>9+C8g5o z`-`pNqR_l}ipC7u&HTxWcN?xO)*RLtQOB1oFX@$W@yU3udGq_3^Nn{J2U4Cib zIOkS1%ha8MSsruGOw%=s3QKa1%@gjw$~Hpd8XaOH3-t;~2iijLBPKasWZnayqswSP zqf~3`!CzL;wwr8~v&xyc;US$%{05AcI-tCmWA$$Ix$^$^OA%&a$V*BK#{CibBzH(Ro#ryfu#XbP^j23d?qP((kT{P_Z^lG_2I*>IwT#OWrinZIc+CIqTHdO@o9DT>AC7x3mndDV)#Jmig-ezjsX!lx2^l6hy-Qt?iW^xzdr8I!c@uXqwO6fTN_q z(&I}sRo>x*1s$7Exnk$7YiyR90`wj`sS#?_z3rC=c@rJ)=$xFt!i~l{bd9XeAI$yL z>bJ?yqCBbb_R8=YC(;9(=Swen_6snA#N0N9-a2`OabdfAD3E+_g>c|@3r z0fkI~XI?1ga6>+&;*t!H>#SjjMwrETjYwae`~L0)QE{7Of1@j5*QPA4#|rhe8J!Yb 
zjgK)cZ3FV(^l^I%l~K*bMWvydbAPmXSHrJwfwT)vUufl8J&SXx4q(;zWJhXkqQh#Y zdPKYKmfA!u^7P&rHrx6J$yz?ixStI`z-g0V&eL+l>5b;B$!Ymd8V5!O;;F&!MiR&Pz9?boCl5th5)SlwB!0QOi{-kP@Xn z-&cPGVN2Q+}5j$Zr1^QI@`W`d9LY&uz7yJF2A-cUW*tp z(VWO4R|ZhSqA#bBpl=zWj5)@lgti^lu~*NXbOCCpAw_AZKvHtgyFIh_m#oPsne=Ky z0dAFUq{xWfc-PUxgH(qpE}i*Y7xC?wd9_(L7U+TZ+K{l$&lJweMt-vA zH7q=-ii`AOM%`n?K04}dcJoBZk#Y%gTU6@u*NF%?G5a0PZ%xf^d*IXZLQ$2>7GyLp zoWh%U!u4L-s_p|Hp~!Jp&y4FYWhR>H_0R)X*O{(;C`j zOW3m;FU{e89L#n80x5^cL1cNPHlmQd0|6gXE1*jlypkx!Pcxq($cL zNR$#S9i47DvA@l96Nz@enE!qpL+HNucl2YDUKDR46S^C6@!LEWo5Is9(uEN#`CTSY z$@PL^AtnfGs$ZhcQKO?Qj&P^xX;jt$Vn4>~pjDu}Q>;RRt+t!ec%2wA2s)|uM!sEw3TT7>l`^m1Y|& zU~$^K`Vw=2leuyXto1bwkD~oMCmMUG@|mOS1+Tn}cWN*EM!7_cwnX!Zbj;`8)V)c1 zWJ-)A+MT5rxex1%9k~^XGos~!8^wDndswbC?Z=62`_e#%tCM@jjueT?43 zels0t)HZ}}e*J8Q;2Dz-gX!v)G%}ymQh*F(Z<=fV2-se>_3JB)2gQ&L2L5<<#Uy!$w_z`N2c&uG2usFp2n(7 za`p~?-Td6l@l8bakS6v1qPAG-Jcm*9jp$r7Y|CKkdeePe_Km7zpwAX}Nx*Gi;5LOM{n57j}rAN1$S=wWq2j?(I3| zE+v@sd>PxU1Us}U-4>r*B&KE%X-`ms6#0AlO&3Tph`BpholiSZ3G|Lb>_*3i0hp@L z@n}y*E^}*)69&)@dcjv|oRk3+ED9N1O;dS4+fl5FA}2f7siP=1Mf-dh@&X(yy6AM~ z)qkuxX1<1%Wf!J%7&o8UL7#AdFeorcixukE^@xTS#0>_mdhf$rg)kcSTe+S$-1`w|~ z*)WSe%^o0R7}`8(gk&-exp~eskx{^tUWl8m(7aha*=t3woY!k8dNUJd+F@NJuV>N) z$}3guNmZ-s-@GWi7JEENR-*SPj@Sy@v`lcdOu;ZrH)h&>*tZYiN4arMeaNP9I)yw5 z_fWm@f>xih^61l7Hh)63{7Ek$ z{HiuNP@@#6%b7fPb~cX8(ZS27JqnLC&Y67UqiK=*1v+9pB>e@g=DvrMhE8|9TwV|@ zc$CEs}CrKe75^^pxO@$qch?sTFIf!}uf( znr0D|AJH7%j;_QZtY9xK=Bfacn5Kwv@i-Bk2iyW1N$cTBFPa5mI4~ol4O`IYxhcaZ znlG3fN~wc(3K_y=YR@HnH64dRUoXSSeJHci{y?#iYxV~Lz> z#cQw|KJ5A9K9AkFM!^TTanwtA4e9^V*yN(u%j#{<8`$ zDc-vIk05=f>0o5d`xbp2bbS0XZkYgc+D?<+Hzgo5(rArPuzv+t)39tO3V;fy=|Bt~ zzF}NJzg|Q3vO12z#`;@6+S%m;3zW;}VAY=pu%wJ#{Ge7oS-lAgX5NA(pD+)on;b23 z85%du0+z@091WaOPCDOZKwX$7gQzy{98PcaP61s~B1)w&#yWTwrG@I&{Q!%Szr>8i z^b7=^2s*+Y?lV-~&F*U1gBfkc#}F7_jZ#?kDZVZJ`tQvTG=j#@VLz ze6Cr?1NopRZ-fiGnn(H~B9j+H+?XJ5uGVh7UufT(6?N^PNZkbSPupY=E*v+g=`Z?h zk#%UEBzAZil0%IIN3dPDW1U+^r&Jf<{_1hQEnnEvgrr)xtVfg~q%BmTXg5v=B~$0= 
zVTW>Q_WtkK`=NI?UkV4@C`!F>q9y5`Jczuc6mk3(n~oq@Zo0!|P~5xGld4mqA^UI- zk({EZJVr@XGDpKYZk|I%*F3I>RTt1<{FrtccQ)$vDc~9{C^?_c_$6k4WHeHe%1tEJ z?D>T30UI_?GICzWnY97?D7}huGJUi#IO;!5VWhla@^Sh$3Qa5S2CpTYs?7CBhR5=? zrzv??dHmQMj(;jystmZO=4>h(r_QfJ=^t;@j33k~&{C!*pA9-f;o$o>8r(-eX8%&% ziptZBccbocTfS6~qv4=p8Yqg6mN#79mV$&9!x{3E;!7ngI`*9*7Y> z*%)Xy>>gBkZkC*+*tW;5)QT1zG6TH1i+bkf&)FC$h_onkRcJCHz*6L184~Ex!G(@E zy&|wE#kl?W_alGM{k0P1dH~^lql4*0O^$FEf4!1B-^>-g@_93&__fW38-~7YK0FBhK($ntqfw70xDxf;)iJ&TKgUNSc|pHK+m@3Lrkh)9IcpyM290#tVw zqKS6&K{9aaFiM@$>H5x95!H&k?L^P{T!AiyXUX&`14N5b+!3#b&*kJU&fxa0P-gGQ zBv#m&o$MDiRoHW8-`!*D5_D0qLagx!3e0d6m@E5yPgW5P=`~;v{z{yhd=;|WDsSXD z9!HTG+vNKFLBR4uq5ItVV+qu=O~&kHhPp-+h$9 z-=MPJXdnUfdn}sC!~$W{j4M}U*U;P!+G3LJHugD-pJ>}XnPf+_EE}9+5y0~$fpAc*$Ez{V1#C?Ep zEr)iIWqMUCq@g%k+-L)WIm}>`yen}*h>^;KNFe%N?+ytT_?q0Yd$f-cwBPl@7Mr$8 zFkRQ9P62=68(=euR=cCV@2Q-w5IefeMli`r_^442iZX31N@$<%q(U!ufQ>N}5B;=* z8+y+*9w|KVnv;3xt__w)D0*L7?E+3HVUoCbi;+$TvaRy; z1G|a84Iw$IuYczQ)N!R)oOh@Yp0q)rHJVVw!OPAUz%2?T@|crG?)F{ z)61hz_8mQ>d?&JInPjlkGq6o6MbLiLXD?d#q?4G10aZD7#kMUvx0hB7mdG9 z=&qoSnE1B5leNU8tMI)r^YM4 zU=Y{b;hwyp*(F1;Qd@Jgv(A;(uyEMdf+sF;QW@o0@;Qf>1T~ufMq(zB_ZR3~`4UsT zSG52skgxKi@eEHj!9@b0IO8Ftr!v~tiB~#EBWY4fIwW+5J7VXyy$Ti+Kcl}G9g0j| zng_*UPez)KJSRidi4;63ZQvUIPG>jgXYd)GX~Y@LZ9w+v>#+8?;2k}*2J-9X6TDZo zpfFIP2=Q%K3uLYo>CE48XAoQ4WSnwhBCunLonc+X9l{J!LecAR>Pt+@Ti_6_K;5?%lAkZ*o`W#<;~xM_u7d~ zYkT;}=OwX;Yzpgb(aa& zw4<~Nms~=EP=5Z_^EW72S~;9J(DL_5LD6Y4I*}2X)AX|XO8+Gt!PA%+@%fySw}D7F zrf5?r<3zEH_SE@iSpOwvHa7#AetJ^xRK|0%QWb0EphiOg3PRn-o5U^|<~BVeD?}N~ zHN{|Y81y>Z(0#67P`Jdl0Jql%DCf-^7Yv7`etma&`PH%P>_x}IGt3KS52h$|BAP}k z5Kxu*G=60E(paiCbq6OhH0~>29Za#T6?dp4oZWlRu@@RA284?_kmU@Jscy)m?Yh|-=iI$wvvdgv;pK#I{jbqS?n>u7jHez(XO5_zWPpXet` z-5@rG=WF!y3==G=5k9ieZ^>#L-AS24l5baV#^&-B9dn!EN(9?;5`2qJzB^+I#qk4e z^BLDVK|ZwjB0!~K((GayIchbuDqleco*iODf?$HYnE!ef-Nxnxg^-E8?e3QoK57*4 zn}mYkCJ*O`@G)A5Lc6Ul*jvf7>{2DzA*DLy<__4lZrS90mW$ zLTV-Rl&GuF?FUN_w;$`nNIqV~=u9Q_0N)EuVDPg0eVJwo8iSU@$AHIW#9G0bYBIgt 
z=<5j1+&e@xw*e0_91CVO){uPmi4M~A_srwpjfHJ~RF4*ObA2u6{^)dG{`1^c8zzs*__8eOshYSd5dy=Ljn@w;X zb$H+9Wl#|}{gA6vDE2O2txiA4%mLvCFV35kW3;S1=8r+Gd|y+1#<)dFIB zEsApzG))sFM^Py%19rC!k=u}EMbPt$K{kd)cGH4Bm)E>aEom6~6BIPUfR}QXN$gQq zuCI7fDqi#6$Fn<$L0EaKqf_}2=r*plqX&t+{lu|z(??{TILyy4n>UR9>hqaT^$y*= zpmzW8+frQB9n5z66A6jUupNr)HPRtOT!@3nnL;PelP@R;Ph?ijYt+{c{F)fP^7!b$ zM6GfkqV86qydCSty|I19epZuz_gsY;noEctZYE6n`>*l4%P77(=ogq%`FWu25?(i^ ziy#Gsu1hJXtith0Q+IT>U`}Ai5~u#VCuT52b?cr+Blq5KyG_3dL~+6G%T!eAP){eZ znNcy2_C5)ST@-pdEhI}g8eqorUMp8+_bC8yV~6a^_duu2x-yNsKcVv7lLY6qo<1PA z;;UrMse;5oT{QmJ=obX$Qq+879-Kv^t2DLnvT|(K^(`o*o(P3#X$el6&>WkKB{~=J zH;MU_928ZpQ;oi$D5B26_mdO9uE{&mALa$guhJ26t)uVwpZiVAO`q4dUwn#@k`ka1nk(=+jnL}=C z$jUoR(ZU;tCZ=P>1{Ux3=|DcNAhBmWq!XAA{J zQO&xjOS`ShHLWa&T!gn#LcBYbdFK_*_SoYtg}#ZOgm(F#eOsPTl5sXQcxL|GyZS`w zGb`geeOZFDj#zHI?MoP{b&S{z#NZ4UAyOgUt#JHw!36o5fx9>( zTW39Xm2QeRQr~Malgv{wRavMJZ?YxmJFraenUe3E-(2W?<+EUg>8F+w_PQ@X?kj83 z0Q^-s2uAt7IXyaDb4%+Ev1o9^;y-0xm2Qm$G{3|v;RHiIHTv6wT@&@kO4+6-1LrkF z$p_7|5)%v-6HLxu@K!3l9hZ%>4K6@PL z05L$KMGC2>lRYj>N$aNP?^s=d7j8o$er;a{hnCJ#9vm_U+Q%N%W=f7#OK=Jtt1lu_ zg>c5FV2g1>)cSO=6x2zUpUzm>-1IWzGr*n}Fuu*p&n5uT2}fH;q!2z$Z1FA^x`+hI zA7(UeKE}-R&)%n5Yp^n98V$I)H?*x2na1!`5mYloFiCnBty4XM$Q4GZ`t*e!qdYwd z=OqObbLR9&Kly@lrP$0@O=ykP70J5;cnbl0{Zsk`k>tOoq^!Jd!C1f_foFtCx1u}y z#&NlMhRasI!(+PFx1In@8)Pn3HI--&z^7qyn78Ep`F>oqF;fuTBY-8W>w7)L&qH~p z-I;F;`Sut*v4XhV2fB+IZp`nU9ZutIAohhKYVSARziWMuhImmgu#E@;ylQu*qh6ag zL&v?DQ%;xb&qC^B&I7qoZxiJO zCT;H4!G_Z)Wtzu%(Krj=+vS`Efk@2KZHs_+C6|Q=>;puX7$?I2ik$ZUY=4!DY?Li zZ-|p5M%$7L&*zd{bxl=;KZTQk)K^G_N38@y5{a+>d0vw&8Em|oePQ;-_aa(qRjDGA z=2;uCr$TQdYoC2Od7kh0Gu-zz-I#2)yowO7bFDL!-kdDv z)r%wn7g*xu3Crhh8?MX%wsdUBQEE4@)4r&*&k_E|mIb>-42Yh#GH$&a*){i>4G?Fs zO;nG0uG2sJ>2G8`%C2+4K8(RlmuxZ$EI^MX(XTc4ss+&8Qjzp2*JyDdK6!H_^z#a% zIcU+=5-5UkGNLl8@mD4JF~os9XdPMLHRrtVhl^Ot37@-{KmptXxO=2-hWBn!z9+vf zo8+Cb;Ciom1g@bq0+GZFN1gOvpWG0+8F1M|ecYZ@rFCrkhDfX4xJnZh->&(r;Td#< zML563BfCw8N#=+>#XJCHDKFgQ-v}c~nr+ApEUtK`fK}%g(9IPnKLpiE+4;G%{k*-$ 
zYE-#b%|G|ixe;5W8YjXH!@Hb@zK&6VF{j(Y)W8HAFqBiT1fWzds0It5cg}>3A>q-o zx+{TZ*0k7?FIw95+XuS~79M|yndRG2Z$-+_F;aQEe?fzFo$3PZKZ!Pz9e9j-V5&as z<;*Ia*|MWfURb)~n#=R`fJL&{y$k-2$~ZUqriS0XMiq?8lo*P-zLvWyJ%jdeS|-gr zTWaw;bl30LBpBiE+O@mKd=K7KbraDP-*t`STEm>~5)|bcf%(yC70Tcc)RkZIaX;kvg&!yD)40I?lUBG_5 zUnf*_J?7MBDu0Svyoi|{D^J4(j=PK3tjp+G-}NM7rj0c*HK%HI^enKzGO?zk@!zvQ zZ!>%N8U2B7^b$74ts0{a1$=Q^%BMJ)(gO_(tFI7KG7j*OHPvCxEVpUU1d}+X##@p2g5k;ZG5h4dt-B*HQ8qS@) zLV~s~-skG9voY208%UxU++K=XV~-_e)~TpUW*;SQX*Dj%;4ka8Lgi3Jh7PzQwY$%@ z<1o1;n8o{1PfPITA$Dg1l9cO>C#6z!!(0xs-pShE9YS$LA~28q4L0Bqyu)7M{``%%jlTXL=k`-O_h%3v(%=qjUZPJ(e!JUvuw%k(8_^o9xj$ zP@2fP2?`1_yn6v|zlnqhR($j>;mU*oS$x^UE?5*7Krzz;&Z$~oiDM=V&xSgd*w*Xc zvGc{bo{L4%)uaYIm7^J&o}TmeRGRZItt51DAbtT?|5|InS` z^M{!X!%JO@M+x;l`G?GMqmPJFF5B_wuE>}t-G2y`7UoDH4QX2Vuakb)$8jx7B;j3h zq})L-Q3b48f7nE7kG}XPUdT6X6H|8|AIGwGY91=7z~_k3Rxch9KF1RUuFJc3J+jX& zXAQkYjDtnCR}D9}c0Ovb?n}pa^XthKZ?2Sjx%dW$b1iSxh^4=H+Fscd-2sJ8=+-Om zW+w7tP^wa6QI9WSj9HqA`X6@vHJ8r`2VxBPtXQ9I>#L#?k*S)1XI^_6kd&Ljle;mM#t@=GtznLdQ0JALa z2=3Clnw__~tee#4=MVW)^IeTUZ8)4mrW#76SlPRbp=kq5Lxa@&&wo&u4KZtn-A$hD zaGH6zSYIoy-jqOy9I^yZ5K7}Yu_%SI0mxZil_)T47rR}4B3=7$KTQ?UqCD9#k}d>K zoCv#wQlhenl{m+2f|@n`hLGUNyYFz<9m0p0yS0oCpzXa*@#?9Sz9ojJW}FtCCd)Le z!E}aGcs0>|^?hTv*{SgKpBbb8L2Bcl$_ypvCcrgL&88qPq~RS8R2&6buLilCngFJM zyjv)+4wM){R#B`osCUUjzv7Q1Z!icIidol9Ed5k>1s&w*n!iHZf`)3Njpzo3nccQpixfjIQ8N(08mZN{phe|3HXbP`211`XVXSck-$ zQup~aZ0L}g?+u2Rhnp)F+(p>Tf*Ac3!dx{>wR>c&MMc#0c$kS~J`Yiam4SnFQQiQGU{x|pE%>i6K zv7I!LLt@MBg_-+m)~ro@wBz^bG9`7F#NwF{uZUNKf*EjXp5C$>mrF$?W^L5F*(1lG zT5FHcEyF;R+c-GvzKKk7=IEnJ>2rQo4($1z4;}vpBsvse+g3R~4lnX8`|IMFWLr)c zolWW*)h*Xt!G_EHoK!C7YkO#BI!&00I!>}xKNc|71e;xMz9JwKfA~;p=5}Ik#Da4@ zYe%2ukFknf{}<<6p!DdUe^dX0QCMb^*Easuh4mdn=JT3NyJOM%hMW)EiY6`UfIIDL z%>+)1b;l5#q(N~>12alH>0Fw&8k(M7$Y6Z%q9L&cW1i+z1{`wLv77%5RM#8z_cIx> z*T%$?Z9W&-?)-HBxPE_4{}^-rJC?Z@GqJS7gg-_V%bqV;b@}$uDe%aGjjD|BdqJ2* z-|_=%?batbR4S@0vYxQ_;U+jQCEi0-XYfjuW9W7Jr-58Nk6F z!I8fh;knE!7b&gka5LWbz42_V^qX&>v*OBf#;QA7L2e@Z+jIqNpCxpYE){7^H532K 
zmNFP1soe_;a#@KX-S6cqi_hheX!-PefC(WSk1DWpom)q7mf4+>sL>=0p08vC(z0%o zMMpjnMl0)C*Mj=$n=p}P-sds4q&o8PLaq*KJ|gPLO4@@Mf}#5i2J`NbOMj=`I6m7N zrE@QshIDcLH-{4I=1a|3d?exu?~OF>U3W)A3sH*%%>_TVTX7a+O_*889973Er+c4s z3BUyk@#u3VE(KRsU>U^=-6dWney7g?Sv@5;ycAA%t0WRc+1bUWid>%rYFIA2{*xP( z6UUyVXRx@a*kxruH<78yuwN1=IadU>ls`ywHyg~8%D%cn*DGgh~ zaAV|QSsPlxco=8&ec(MN9|mqCx;t}a*In`livQI|iuNDwG!Gz;AQo`a`^e$OS3~hj zmi4?_Z(G3cX_2sSsvq!6=@`#t_C+D{{68ckaZOR@57hq@LxuW35vgs#{h#{f9eYpT zvon6X&yW#raoaAf^yDJX;`6Do60cV}1ZFjqJX4_=LAs{AP9SJw0EcCXc(5^g9Z1q{ zKaG?0?(ml7oMbXkqx7YBw)H5RPT}w7-IkpbL~QWH41KqdvAgjNB4aiJ595@%&!>{J zr_VbCuQr1IHXHXA%d*>29L>;ho77n@;yTWcgFikOeYNqKLnaT7TEp21$Y}LWqqghxfnoA;y5Hw8WGH!bN<3nMG!560 z)Jo^60-&S~f6vhyRN^nnljN&@8Mhy3+M29)5`33s>hPX#*q;~A(pTWyLserBpmb4z z4O;%xBS@n;J9NBKJ5|lAbG9ZYa@{1G(+~Nh4O8El7JOrwP=s_4q3%nle)pl)n*#@Gi|JOWFg4H)M%!Jb=C zb^4o(lE4AGwb_N8@hTH?uuoaReV<+cyHn(`>q9h8ylm=Ts_)iq`SldClSmYuab}Zj z-iF`t?qVRz+`!TwplVT}mX!D&3 z;~Ozey|>9{WBu@oEL4fau!JJDjXC@$5)1;CD_0&Y8@W{tcK&2DsTHuCmVQ4eWV2|* zd>QxIHYhc;?Fn9Rhr;*Hd3$(Z9IEonCx_rP4y8#CFc09`vWD!@VxGAJmEShV+``Th zF?+OLf%!M;4idtMRjo#Pj07r}>@NgQH-=7v%;L49KI~wopI_7Fyshfr-!Al@d@b~8 z1|yTDDI~y!FFzYnzhi=&*MK9OA_o?kvZ2qiEYI4TFXf+#f+&fM7)O_J6c3P+HJYmP zxG3U0sy|7nTuZ$Kq%c!FvrJmw>rgOZHbFpZ#LmFM`|4@z+Z0>rA?Ye00KudX?u3JZR%Qv zM!o(FhtF5ww~f!&5&mV0ljQK44jR$_%0)ilvsa3rZ%OSwByeFi4>t^8_;=_7h%s;@#FUBfW;UeM%n2lEeT zJVM5^-@UYPhAcel?_;+gd|u@TV5gG0sUw*i1~#I#Tx2J>^|YM;D{*rY^-_?uRsI39 zR>zdQAMy3|c$VF)wNOqrh8Dxsb92YROTGTy&uQ|CYuE@LJZ9_iu_foCF4A0cVYgUUGVayrU4lWM~d;+eD$aX;MzY*Xujh|-q4X>qoTz-OU#lnN#2+(s!0+i>w@ z*Pr^2Ov@TKk46&0XuJ(sQrOs7Wj=!yUm(IzP9c>1RXW`*cK!>Kzk(PGEO>_WfP}-% z#7~j}L_28}@(KGRKkI)BwrqLgXpMcnkD3p<9&6*thF&1Z8S&tb|eQqVr#<7uXl} z(yJtoge}sz0Xy@rvVv3RZhl6Oep#mUKsq!r-C0;MljRGfg=hVeIsMRL<*p|NCa-#h zi4Gv0)9(lWy(F#>=#W8B_uM7&n;&~b-O`PBX*g?=r~P{8MVbS3-W~yiueaTP#JbNN zZ38JUM||-q=*6aMe?UADJbbHUdkOceP#4$z=C&z7*HS;X=muOS=SW;8D`4RFf?fo# zh??}}z0HsDY-8V0^3#@6pEbXA8sUqv1S5@a#&N`@Efj>!1TBuNPC7L^bjT>D2r8U) z6~I75zPgh}qzBF18Ub+u^Yw0- 
zI<`nUUq&jZ=Dm3}U!7HKnS040|N4G~J+p#=PvT(hK?$j-LLkD|_gju8;P#XcVm*m)A1dq%{ppG9Zesf9G$h#Dgksj?^< z94Sj}rclSta5JuqxfuC;{z51mTD)9&Lwufu2qP^J%R8gH8)6<$*tA5Y>vuj?sEi#Gny z?trh&42FfaZHGdt!kWS-vwtD&B>jcG$`=4duk0&AY!mq)jz36Dgulrm^PAk~d2N#z z;hD_C!?{h}gdl<5@F)#k^zm!4r@8|^=}5k*k8~L;k?&#F8?H+!Da7KFypW?AU}_3dng7v>$H=|J7)-U34>w9 z2d%RTiKX84nH}8abC?LwiCR3F`}Z{u?!D(L50%QQf+x5g<+edK$aSBhwDWdU?tE?|gbhaKgA~H{47KOg?`uoNbbh8l;*PC#3^V&UkNe&jKJl%7x51HlnB#ePoTRe41{SxpxyUAY=3=`d zV3HmH35lpt!UYgd9QFC5tRut3<)v6l$@A?V~ODYEJX2O1T1K zuj$Xa>2eLxvg;4c(ov?!VO$&~e^T>Eae6h4t*9jlahCa+pSo%SUA%Zh zD=t2}CAL5eY`^6l+Cb~Oh>0n8;f;wnh#LW2S?~yD)3HYnB6@(($_Ui5y(tZ zRF}f^+xq-o2=Mu2l8yo?(r^O^`eV=M78i{-qyJ1RwPIK$i6}j2s2D@Xu+bPNk=hw` zcDa7^vOTDLL|xUy8O=CH`Zd zkwcK}%8ZRrvD*VBf_Q@hV5RP>3$+BO!bf*|;$2?^-<1#P(Pt}Wcn8T0q$7bd3a3WS z;c*g9TzS!Xac>PY0`G6-#mGxt)>oTxqERGoFe`)=>3-yl|5W^TEHAize5A~wh zzA&KHVF!<^HHbm+Gw;#cP}8?26?Q4V^3&YkOMmq*1?1?S<- zLt#`l9f~zgFf;#*n7!+**cB0i+QH}=hH*)xlWWYT3b9*u{brOvMC{W2`dvaBFhaK9 z()Qlb%`h_Z=6xl6n;JpFqg$Spf5R+B2}-aW>HSKA4KxJT-OCXPe$zwyfc(1FBT1~6 zX@}z8Ky!O50u9_T&pYi`D8D5OD)}mhV*V8N>#>jpL*kh=2Z-hgb}-DQN{wcW}KcdwJ)d8;#tZZ4ZYpwpIwY& zDB7^Pz-zky{;pSgsdHkacK>~Dve*Xl48Mq0O42*|N3scIrv3Bp@y9T|c7M~B;@*OT zmV4DLOI(ZPB^d&?RBDotTlTAdaO|w*?6zqA45pXgQ>ef~2!?+C2`j0$o?uQA(8dMP zvEoT}CM%GMO3$}H^k1;}YUkaFPv3HFoR|7`44b9A#Qi&4HZIMnc-&%h#&nh6^K~e= z(S*~8%Ys3M?3XFxJ5HB2D+epJpTUs&_85v4J`e6xr9Sw(OVtKu!SYQ?67t;u0BKJl zzsgftzMSHtw)X;sR&OkM3DPf+WK4;5`WClzsz@d!=*erACd=1bA1@kxTV$(fj)MQs ztMcDp^$X2VvQ?1SBKvF6!UyxX*#Y8ijRbNfOD^}u_T*UQ_Am<7&pNzK{gMd+8>&!g zX-iLtg?iwL)!H63@v$Mp?3}TtSi=1TBui87x!KS>mF2f_ADzLFR#W|Zv=L_RDoDN#rg-~t zs$I!B0r0@!*$ZLk4)A+7%ZHRD{_Z%&ou7#gf@e+vGkl!CBLtNag7IE>l2Ik1ejBJ0 z>&o!$EK^VjrxWvV8C)!BpJ%uc4Q2?ROuBa&XE`%9L4Oa=$nL}&evd9%`DgBuD$x?3 zumQ@!O5?1%T*4d-!VM$0sX?qx%M?8~9YSR#pD1D^QJ0x|`?GsMs`ApEKJH(D`ec99 z`$(xuRCHt__L?!`1ZjRW{Bk^Y!Wi4Bdo2+utc?w-HM=MP{YST>>*4+{%&`tY#rr9F zA#IE3I>=?0TAI(@9~X5!juK|Vaf1!eJW3zVBv%N#(mqYflcbtFt9`t;;m=rtqscw% zo8pdSgzIEBEUv7X!>08#5s?jXbt9Nx8saK>dV#C>Reugg?!R=yo{3{m)FGBj8gj%4 
z#ZBhV-Ndh7yB5uSR8GJ=7k})vmrL9et-|?6DB$Lj-xUw7r*^pENh)2IC(Fk577bMY z!_;WN*QC0vd8!+cLEWDC^>#E2uk$2jFYW$()~A4vs|@yA2wZ)ni(0C8&fBJa>fRUJ zv}Fo+#d@!kwjL1Aw%Rn#Qblr%v*@TRU49>~HR$J{Inqn?VrWWGUb6W0BWOlnvq=w~ z#Q_T`*jZ_1pS@sfR?+|l4K>~eU!l%Bo~64bWm;=qQN#Y!!8`H=1z+o2{6Ce zOr$1h9VqYkjF-CTj>kV82qwAB`H);A0?y~1cJ2pvgRni=FmH{>@d&RD0W8DJk479x zZfVHg8R40>`#Kb;Ybd-Ui?Wz0??5u3Ist-I;#Ph}2IFrGauX88?FNQ%kE9?3GrTD= zqe{hKsf$lma5EK-B+3$ZP0%K?OZ;}ky;-%0EPOsonC?j+pwrfEcbeo6U}&RLJ*BC|Z0fy3d=_O{7BcGw4nbO1`?e%zNQPSzHBZxn|GbqT%V`7 z$E(1^9?Y_&z1o9cn4q`UbtegK8#f<{tKVLH_9(kWR>rpkMS^0cGE2ZG-6kzfU_qL` zo->++DAYOy??zpLMXENfo}#$>Ox9>J?OU7erDkgNr7QJ;I80&zifO3AVz7a?+5@+4 z?AflyD~JNbshqT@wF&BR%Bh-?vivnK(Z+=Wu}@DM1;Nq?1?^p$C*H69Y0U12!(O3< zqwkU6A#k>^J#ycTDq!h{((=F2S`K%7zBl#V6eT)SJdwd8-2hc1=-0`Vi8%)WZ_1y2*Hk^(+;$khD*fulS}gW9Hn&`roU064dVPWoZcWjw3(4@mMhu=Jc>y zHIqnPEpgKe$8;5PCk<0l19pTAip-zDWxtoyFqD_|a9|^hy3#cEz(cZ_h5gViW4F_| zMlSV4KA?pwO_9)c&SHTCu<7U;=*!hwypedkAOA6W*GC8vfyspCR0d`YJBGpGBy}H8 zi6vX6bK*wVCzc=x8Lk!&G)Pm#SJo8xaxPgyq-*+?uy2P)CW1tIf?;kb(yK}QOr07l<{f+Y zsDID$>+-V06zXq~mw{_>)0v$j3`XX*x3Z8<4@0DSOr`@G%Uf5{!OTi>on@-s)Y?R9 zBZ^qrix2KaA)skqvp`o^woh4BA(+_;i0 zW?cN=f0p6>|0W{RGx>i59O)_i|GXHqpT1W3cZ#}UFb2*?lctxV{fL?)2`$UTM%naL z*mL&?X=-}x5C*jQx(78_WMU(wyEPpMlp9QHsdUH?7*>G$d=_3DipN1IlQMyD>Y>mYxxQ ztmKJ6qo2xMq!-D7>J{2cR9B?y&$G4S@C3m>jDjwjwCQAQWua?hazvVEJ;zP)uCbs1 z$LVw}fn%g#ia|LfpM-oL4w2{G%SlEI9cC#TJhiS z&baxW>Ij&@>LS{PQBH)=&@Y-57Y*bqCvx=am3oTbQXqo_P1Ruazeo}v5KsSPo-#JrYyXI%w{Z?1uZ#1lO$5-*W|8ftq z7^_HTD%)iBFr#qx+Mm;^8^4e8P93t&AZFIYgnbhEQQ{KUQx z#p!>~z?)T+icY02`I{eq{*V9!R3Qo9fx5z#^9thMkg|m`B>L}=ag?S-HDV%`Qr|EU z0jO*rWIx^h=U)(z(5NZrta6b2603|hdCtoNFIYvy+T>$mLW1NcdQ&JZkmCr2-}Zz} zPgv5TN%Pb+n97gHr=Y}BC=SjX@p2@T4281mx{<#Mno!6H@_1$Zw~`{(+UOwC6?%CobdzB7A~(mn>bATe> zU)KtJcPRpl8Aj{1zd&%RnP^>T?boD4ExGj9-&G&JB>=&e;Ntx~yr~ycpMHO8??7L* zl1(iqc9PeOsMj{LHp(8XMtsFRd)EY6s&kvLG^+^#Lj~H}@_q&*yhNU`cSsdgp&WJ? 
zNY=a*-VW(nZDSp%;SAKib^TkXi7Kc3XoBOq22}31kAC_&T}LAr2|MZ%4k4WP(GMLum3t!S51-IalAF9xEjbzB+`1}8j}cNPv_#h9L9fEcdU zavet8Z-;Sgx$jZN8(3?gslT%@Wv;@{4@4ZeJ!XAe&0Y3yb^kN1Qd1{HgIJ#PA#sNS zF(@HT>!Th;sGwP?n3BV*hz4uDj1{;9;EXhT8bFKo!#$_QnhjKew5ByQ&cZ1eiX<$-XTdfFx*JWYPR|-V(;^SWFU(Obe7V7w3%N8JGJN232)!S#=|FL3`AT(!(^ z$)D%V!48~wTy={3Sw8u`a6UD9X)2TYf_AF|mazAXKdO-J`sqDu9MWS6YkrpOi$44Gj6^7Dz3=*>={bBII) zzs#D$Zt&n-nLWs6#l#^m$yIO$kZ|5B5rPzEUNlvX{Ya2V9lY6JfA2tnXQ^mUrcK>P zXISVsPhE4rz{$y}Ki|&R?0M@U9@~D2AFYp8W$7=t!ptjbHT!CuI%<+Snyk-Vl&n8L zigS7-v`xh8MerT>V_mHqoxaLf7`ha1c2|Eo-k(?T&*;wIN9!1gRSk|fiDIgNx$di6 zV8THwCu~lV$bgoHfc9P&+Qj$I>-f_SCix#Lg|5j336s#XO-^r|Cc_Zb zPeCEPin6T78LLB_9qT8237(F7XyeOFSEBhKyTCzAyJl$ZsaXk4LI{2@5;OiP* zMQtO?WpFo;6b_Vkq;mm@uS*Hm&u*fchl(I;==JuMKOmnrfwn^DNAM1gRh(vY@_oHP z=gcWG;vr=v=b6Ra0;b$BMNJ85&SQO|{TL7YpAQ%Nf7EL*ZGa*-0~@Hz z2LE-_sEWms+n)+csGsK-1NlMs~0PYqH`p$@2qI!Y()J*i`<*Vu6t zDIr##!XCp~C=7~tykt>9OOL1TU$QRu+m5_tCO7CNAT$YR>oFCHQWUVEWZP`12%{K@ z;%k4#stvYUqJCjTGLNBMa+)?FB!fUjLspph_K$kNvD9#2yFu9=g+ zLfQvR;f0`_D(z%4+i=2JC@FdM{nf#nIW?shVn6!w^uCtGP7F1U*@! 
z?ues-$2O(?VlfpM#Y$0mL+I+tTalB-Umi6)U^12;Aa5oQ14Lt^nVH(tK?A zI0z7}Vct>!#D41c>Rxowgh-_`xOS^i=iJVR0!e&-JIdf`hbQVh#gfQnvqk``I(Y$m z(G5EZ%SssesNDLF9wn-aZ}d3(A*k&$T2H(AkG)>D!-q9SvPyNGg>s)%S5YPu>Fqqt zFB=wiKO*sK$J?-};TS%_u?5?=(kj-yE*u80o(Pu)vro`t)zznze~cDVjgqP<`G`Qb z)|SiV!aF!fXPoGekG_femPVH?D+fK^0w|cizYu=rM)4DpU1c`hlN;WS5>QE*N~iM& zznTqPliN^o?RNLWYb6r}0Ayoe-IE-F+bL6;doJ(xCD2}5+W(C)}1l6)}M`4!Q zTdvrJ|GB%wM|q;06X%tTE--f)ev?y%!>Z9G9r?;fJIwE?vLDqNw{eraAIN`$MKVwM zj-0*sm2?SL^t}Fh;y)-l;78$cTPtqc#$W_e3FYt1k6y*-%KF)2)U|-`7YH>aF!T-A!Dle=Cqam?w+{)_ftYaX%@wLFcJm0hn}M^ z4$ZslpDj^$c5n$znr{@4Z`Dzb#lX57tvYU}BdbJ?j(&c4O`Wov67m`b<+E{()Q`){ z{oh>a+BSP({V%H8RgmBo&7X~KPYx1~NLBHy#c zh|VWsEpLi14mh0MHP>V_+)J$4p+XhHidy>xi>C)GxY-s?AL54rAB(YyCYNA*oI)!Sof5VJ8{9XwO4rCiczYBk>7%V+(e7#Y)_vVT_LEUkE{V&>r<#cee3eY@gXa4mCSRL!;>-3> zf2FVz<|7{h3xENfZfrOJr)0ec(_d`A4beQ1IY z!BGb5b)EdKCy?4oFr|H~T`Yfa`%?<2OH;a8dPQifEOCw_$Csb~yEi1{ds(TPR$Gg7 zDf*x=4!^pX#~5*|2Cwuy;*aDqosE6e z)g|j%(RgdHlA9m>xWu$C8Kev$Uam-oKP#-btAs~)9xRB-?R!}w)*fmdPf>&}rvBZ& zXfeM}=UUo}cI;`*+!5*+DVeI?49fCkO8WA%cwFV#Z&`Sfw_$y9_U~c`z(l~B>sQS-k5C7zfv2}izprY$(0md~-aw8<<#Wz*yPmxvN|H?e;*l$6 zdoOo=5as)844Kn5+Us8lD~Fq$jR==7+$drfXZ*B%D)wFE2XE*yH*>udy}af`KDx`a zXV=y&xaPj0Vso3e*8U|O9VpTc1btCf()8ggvc#r;`}U!BC423IpzbPBKgV9k!PUv& zej^KTYFi{UeB$mu44{@scb+6y7<0i#jO;AVt-N_XpB&nP1?E>9DS;Pd=8$5?adtLO z^z|#l-rKx`N$YmfgX36UGcR9PU!uUX%wmvosnWYkDgMMz=6|hCkR;psA@<$7pIFHF zn5xY_;xDr|Krae9ccD3RpGEx2!%0E@O#7;_5nKds6mXf2d39gljJX``yR;j0e{N%& zvCL!Fl5}q8FItD*Z|G(9@Re6)_=)a@3J2z1$R7NvQ?GOB{{S@KtKv{&;>+|JT z8XZ~1%2)t%S@ro_LMdxzGNFJ?5cjC^-Kx$JCs#`osYYc4H*2c!M1?{2L_uy6hYr8| zBqF`4&;@y&k8p7-Mn#-O%V~|2R*E^X!fz9%F^`SeyQG}h4Ky3GPtcSC=GiyAo-%Hg zN7+S_(1Y|~=V_En%yu=uN_BY1V$YF1SVZ=xuf1IS3pHM}NYR<)KSr;4O+2(iMTmk#xXNg?rRR=yNP%^(j1Svv+q{Nsb|Uu*U|8fw}tU{1@H?`w368I z;>W)bl)?KEf3rGe0+a3cE{~_oFV`i{qX#{7%%~ zNY>#NFb<|fDGJ=XTPOISi)&`ef6BvTY+Auxk<34hO>eWF3Qq~ zo5B>t_R#QCRQ}Z9bF9JqFS>2KU;noUm7(ryA1M@-FWif;vO$u=J_>^oJHWnm*P}f= zyt;(|Fz;a5ZM+ygQIC!$j5vq2HJ_ocvUWbVo4}cp5K>if-H(lYl7$=G>GU;Sk@=P< 
zF`w@!ACuB%^gbD}+75I$vlHX4e_qQSuD$DlD^CSA(yRajg9~y3hOY#V%qGmR_{Sh1 z#G~iTh%+tz!97|6vF-KyZ*Y0s7n3NI$Na));@6W~2vrilj#hnO?=4wO;UF)QrENOr zNMugUcO%34GgIDw%-xGS+^D$g87E`O<`akc1P;Vy8nE}=$83?t^iq7}Y9^eyEgnrt11?7E#oBe{=L{R@lgsUXraQ z>21MWa>I1jW36jmm{X z2z#o0d*)GAgKS5k%5aYN7buS67Omhtz*30VP+{@OW)x+x?u5Qoj{Y91GkJv<5gaDo z!mYw#YLCQ|#xrZjRADbaG|3^j;pkpIR^*&fy%|dB^1>9O-w6F0^*2-Nez(gEr3>e^ zMZTE>ro3pJxu)(KTdv4rmlj7FC?s~&RO^)|2%2aD{jX2HOS1<`l-<0WvyPNnM6MLR zaA8uv5d*Mp8)cD~r%eMXv|Pqgs5L$oYWqm@iwjGt&{J%tkll;@33bH{9!1=HBBAn+ zH+Uf+5+J5=c&FT72p(ViNy5hK#y|lW!kEM(AzGEfM^12y)EPyz-2*tO*7}xg+!vSe z3AM|VqG|~qc}`2y;D?4M9aJ*A_uT#hobwLz6mZ-a$Z6y4n*B$fD^3kw8>Z~5h@!h2 z3d0dCe5=NA0>KUI`WwGqTc%Gvug#np`aqmDB#aZ|zQ@)v5-H+F?oMdu!*Ue$z@=#Lk2TM0*%dV+g=VtY zsKYUelcGAWA$z((V^RaZu(&r$&Z0RjM3=?^G3~)uDt-F}omu%?i777=?#K-<{@NHx z!f`)KB^e;$wKO6%gtxqN5Eu5*bGsp#R_6=Nqyw3d62n2FhYDb!nIo{8{M#8MEQty- z6eFOFe1+Dekpu`|e%}4kT%o%Jk%{{(AE?beG9N&%DW7Z>b@V4)DK=Me`d09sn2I$~GFv`KXfAX3h_t^z~cS>enkcZukRcBQ4Zy>H0Pkr0?RaozD5Dt2O z&tj*0dlHYBpoZz+Du6wiY`;6q_iz;F_kc`p2@Q#gC}%&Ze8ZKBbhaYyDS2bJt^#nu)zVPdrZR77a(b6Gx$qTmy=}+FU z<+P6{Xc%3_>bzUyP*ACzHAJ;nrTias03B4PLJt*Zn^CtSe@1=}s^`|7I4 zTd|nj+OqSChIacjmbff-a%nvgdIG18g>4DuQ3UbSO4sAn?IuGTx6R`6t5;khA& zKDtMHa8s4n@vs_4rPTjk4s)J>sqCF|$hXoT%YCB+%2|UCdpsf-L!@ORT#irVpOx#Ugd0)l*%L?YeuN!am5{T&Blq9<@2oOk!n@` zsnas*w(Gg$@OZ&Q=Q@KKLpOK*Y=qG>h~&iL^CfoJ^^y>OxA0a`YLXg_Ztr1NBr_M) zg9bCs7(YKY=*I{mL7ott)dnY2+07~XKAZV*Ip#h}@vx{QFN7JMyo} zs09>94SeNy!U66Tg~MiT^X{Fy_O?@Q#a2uktv8!s${kS=i4<1=h3sA@TUa8pbXopG z&Nrf+;EUgs)NnTVI|c>O)i_opuOEDS59*U4yZwIQDLItRH9cpe@0ZRwZ~eXhxxLz~ z|Mx%(F0MCgccip8?S5t~nz|cxGglRk0t3sC1KZ`?b}`;i`U_-JIjibMMv^?*<>Iwp zsYVsc6)j#e^!#S%BYOI$@qH5i(b?wam`AvO-$A0 z&mI^mKQU}sNQQ=oo?aOF(gW4#YXx#b&i=7nT9VBO3}EOP|ItH%oR@y#^i<9AUwYN* zDF8|7pDhvIqqx{BnELY&zWljJt0q&`L%=+wwx9TzT{spUUrE8?{g~#{n!RD z67&f3!Rk)VOqjJEiRNFE7g;|@U^?I;hYLTro%XXIVQFAY{oS**E8l?!N-LcFd0!>g zcA9&I6UA~Ax4OOf5kV8XgN1B;JjtZC(rfkkU6jRYH}`lVzlMm_hHIP`wjeMtO*7$= z@2}N69s$nJ90ppF*ZRn3zT!2h*kvCrxDDsqMiT?Br~eY=R@f96gqTZAXI=K}T}nHK 
ze*qLsZT$gm!ek40qLZfPOszLg*5)ONoSauYHm$IetyAY-uwcd+M^CqbJkBOt6^5?- z!cTJ&0>uk1(TiUemrC^7)yu+?XwI~ivzq^q6KIzx;{*2@kNp&J@@dAPe zcq@i$Cz6?>#o>JHlB{qbt*osMRLb+G0h1T+><@QH0I8Ers#xLVweht#pFBCguxIlTrk=$l=`K_$8?f%yh1Jtm7%s@ z*-U<&9vsECRa0xi3`11~H-_$xH4ccG^C$i~&f7V704F(kmFE`{k0Y?QaN&^88p;$K z4;5U1{+xT4E5yOl*2(e8L`}3pUU_BN|1MaTBVW^x^NbCXbo$gpW)~U~sHlB&&FoxEimyVvc4g$3UDwrG z=O+>kr`EKnHpQv?t-%_j)*YR>B;;Y0))a5YmE?Cl2#ZO9%bL92hjuNkqcL+rrlQEU zc*Pcqk}^N7e^ZtVs*6vv&-r!;f^vEI!wIKQm(62`jBk(&C)7no!s;FOkGf zGK=uu;~p{qMf&DskI1t%SSTmrGO3-sbyW9~{9l%zLTN_SEiZ!)8dRE886q&3bD zBL?6111jttMVmFRI<7pwLBuY`zIkl=m3F{WM~NX4ilocz6UBCPIZY!Q%HoXLT69oaBV0ySq`( zlu|F6lwj|hhPp#7jgcn*{KkkMqli#lxN4&%a50ygws@ojAb%=*D=Vxp6iBts0SL z1>n>B)ikHVlag8kKhB*adbbT7(@EY-n`GG^K8QHshH*>cZ?Z(#ZIMMH**HM%d+AU9 zL2;Of$UAQZKVnm9^kUllr|6BBaXt7OvNKx8RV2B-yVFN5V6j|Ut1LG=tHDC)m6tev zH9VHM<+Tkd6{M(JyWjF8ab9QeY*e!jfi$QBBz6Dw5wJF;`EBwu{;K%`QmLXaK*a!a zZK_G2kyh@1gJrfrx~X`hWJ$n*QH_Jjk0>DN)>D{H<$c{<(etW=tUw{)-Ws-gQ z)=y}siiJvu?0sYe7%1JyHeKITW_L@sRCi4bq(0=L7C*F91Hzy56L}}E^wo-WeIuiT zos+Bl&XT*yH_=jT(@8*gDW-OBvm;XxLXu9EOD?(x8M?U%R_G(SV^y4eN1Q@ORo%!n z)#&bhfD8aTx{k*}&g1&lf9b1B0x~w*TM%J5`_J>gb!gETvHbf?uDMMbJQ6%vyze5v zKV%wuY43@Y6SGHer9okl29MnUbask`UirZ0J3o=4@*(DXblratsFKADH98gy0T84= z^X1I;SGtkIY2xi$|P2Q7FYLzuEy z45w@}0NU3%<#k5BCqAts4}zw+$S*95b4R4P=Kyk1haXw)gx zeaq}Cf?~;~gb4NJ27#5+KJNHcSzpbA@&E&8tl^$;gFWk3OB+(a(j5~Qke?z0xUu%p zvyo?QJS6RXIeo=OC+gu;B_FNMCxeyeEn$>OHhQp{_N&-vCX?~9PONaUed5iw%OqP_ z*-WKy4_=R2ZEXIt7lbm}e8Vl-ZLR??)7#1#1X{=d3y#q17Jhom`BdY;7J-?$0u*WO2jur5R2 zZ-`vzs#%10R%3_j{g^!HA#!wpsvt^p;`7ysgvioznSfTV@DY8otleNvsJOv>qQYlf zfibsK=*I=p3;`OpRnvTo$~LGtUgwnqSBW9p)X}G$y`PclD)yK_7niXSMHiZ%p>s3?j1mc86r-8yeEBA0m$x`mN4pS@$8y5Y31!tvXOr|}(} zO%5v0xAWK@$a+yqo&Ty8yam1o7feK%HJd;*u9`&yjp8FY3qPZJE>1RPpI5Vbb{ERO zR%U-#ZwA&?G;qIlT*7hMZHcU7jOGqx07tbvXB1T^BhUfvuAFy+w%>YxMVsnqwI+;? 
zQON>`-h9$w2>V0)@4{;64UD$SU6HVAg-}2&?Tg`}2RtJj^80i0-)&lJPi#9GP4et1 zirEPs+p6hH%=zLW^66ZQ^U@HYcj5hW>5E^ z4-9Ls5IN;cS61x@2#ekbE!E+AzAq}^?~T#d%^wV!%}vo>|ARExYp*M?2|mb#msTTf zJY_ME{5mGZA~a_GcA$Lvuj$0*716b+1emt0Yz{#1q}ezSou0J<-*`EGp+eV5ezy4M zL(cprY>;Z2HBH2pxc37&W)V$g83rdTN1%;C{gvXe|1cIBP1CTSn# ze1gQ&O8ruA!>Ic$F~S`0u=rx=ik&hOPFWr1{_eBp8sZ}H;4$yY1xqU7bLX3MkFW7o z_GY2S_x{ugwQT-|oo)g7(WT#a1<#)U-W9%U@XTC{AUFlEsq)VZX~9?7K>ZsHC7#_k`<>N z4zx}DM!fZa7pe`FnQ}DVo5q5~RZ*bisgl1fSrPABlR0&t0tip{D!_h<2Jo7-&trXI zh6v9CPVVa)0bJHJcVP+aC8Z8dguXkfhhDqhaw!u)xiatt3(jd}b0WUg%0G%;ZWR$!H0t^L;^j(gy$g!RR3$RCDN!+9WSEK-GVmKT>{`k7YxTU-OwB0H z_f|~#=krM7Bvyrv$uy0;*hXpPq~ala-I-?c#Bd#s&7c66_VL)25ikCSmR z`D@Hxx5A4=weGlju}&&jw_`|!{dwmIfXv$QAxO~pWKER$9epIS_--vxq>!65_Jx@% zz)+nMQ1qPO@yfz_5B@8|Zr}GyH?*5wY8o^ZX6Q>aqubJa<@vpA6IIa5Ub5A;6XS

oza{GWkeRDpJB^;S=i`SmG_rWxo9NIu((=dRU)5lvMh0*{ekr5CRM?>Ua zPNEhHvSUpt7r9XNYPH51RSb3Fns?^aLL0q-Zx2KU3Jwf;?d+bS#9lj!H;D?O*eF%X zIPez~4tY4QDyO^3=B!DnVACdK4h*9Ks%t)m3+T=HGyZ8r6x2mjd92iUcEI?^0SO(C zzDXLMu5%;T_KE!eY46Mbsm|Z`WvG*C#wjFeaT>B##?mGvCn3qcMAk|N(IR^!8XZe2 zBxDQOl`UH;(-9*3u9WO+c3HpoeT;d}`|$<ATHzN29WQmL06eV{`5ooF$C-L%cM=_lw{;P?$Oqb69R%ti^lhUH9pFLD7Tzja zd>IoKu0CJ8_yxu*Pk^j$m|+NHLG=gFa%p5#LieK`%o?HO`7qn_YFcaedLH2;3xYPY zXCsPQSj8$1oJh)2=izbu%bd?o0a(9|WsBs5)5^gI37NciUj!Az!;> z1%eh&5F`-?LXyCe@RElYj#F^kWm66ePm8c#5B^D@(;M8&f|j-be;)tHYV|?0QOU7> ztDeCZ+!7Ag#>{8&ASl6y!~|SS0Is4>FL98~1(t!&a^_)>8vo0$hj!zv2KPZXjbg2l z*NXhj#JHdE({R<$|pYD;9%j%AS)>!O>>#z#A;SDVN8;=roAW9Fzyo|JQnv zumep%Na;%1_^ta0yeCabpBjXojU=~rJnguC@kBYANisj-NHqAC&=7UlLpZqw+f(Kd z6-A`9LN)m)sSJz`ngx5Zq%2$S8vI7p=_*J(Ohlb+$kZiBLO7HO)C%l@405xbW21nvy z8?FUVSJT%>AZut{@2J>*<81PEx5W8BMy1-@lk$Emu{YkN5w0JpZ0T57E_QzF4yC$K zIa_GRs@wlOt3^8SY9EjmOp_!XPgo4HEGA_ijwwBRUVo$@i7<`nKl=JX%}2Qk$O?W4 zWgLC42CQN(ZJ34vP#K9Vm`%6b-%Njp2|oWxZVlR+U(|hGw*gIJg*4q)^K8T6oguL} z5rTE0rx5nwFwX{%?%-@^IABOft#`}O2nnR*KmOaMIxaN}?5I++`(aJfm6~h$FXziG`^f*dain$$6MN4ZGK-NO{Tocvd^EL$*M|Dx? zBzbsViyr))!vjgH+AoVSu=v#PTY~F(HUs#yp&B4wuSNU;%cXYa&To*7Pa{D#EeBOT z4Jq|wy?K8g1*!v8-Mm>lJGtja1X9%-g{xe3Hbw7kXkTY(;*!qSa4R)=aW;&k>VB%` zyOxCSeR0*?F5Hntqi<;GS9P1yNRkAFLj}7x9+r5@G^CYoN)uF?3&w!>@pv%GH&}dSW zBiMNCqYuYnv5eKEWT$H@ATEyz76B!W0A_Mc-{K0vO5-05p2Zk--}Dlo2Ff{V?q`%m zA-&Tlcb5TVsA^aO>6Q|mn!isqEI6g!RjNT#BRj#^C&uS_vK~BGQWMG|=wnePi4rjp zLy4Jp`iGV%>cdU>68H72+k%(DZ{YF@AYEeN}bBW{gv+?JDGPh!GMbMAzoq}tGj zw;Fjs>`dA>ns@$m)M2i4UVaMA4DM009?wC{Bpz5TVmi1oL;$yXX}$Thl5m3QTW|3Y zRd=!*xi+e-YCe|LO43@zQVQGT#5 zEL&K$x(!mYhVt{@V$$o2q-Gpf9v!xH?O$FXmSQkW$7uaXfw|#Lw|l-BvWrJzX@pwn z3oVV{Yw+j{*kRTK$K7BU3<4W&06QV@LI?e9zg>PbDsqheA%m=#V1DyFM1bT(hLjke zKP-;}!U#oRKjet{$SU}Uu*G?p5glFxwuGt(M%55tyWR&f#@-;Sd5B)m@i8cS_Z#Ln zq=Bg`qu%+kuq`(LTxBn#a1$UQo13CGBFeAY(GHMkEf};vNfKp93hw?nLW)i2`1A}? 
zaX&JQMQUIxTz&hW)s*@<1O1Qma8n5jMFOZ$eao=t%jg%OL=}t}&Wv3G_M9PTC%x1q z)`h(nn}Mh_XpM5`ILt;`VZbgX=@cNvTPqCAKAtzm@D?Rgd)p~+wM5DQIq0Kk_F$#auv(#KZJw0#~Ld1 zshlS;=+-x2&~65vEPc@-vqM8KRO9=UOWWSx+frxhBNCjX=E(@1=fNpDBw=z~G_K?+ zMVo$cd`DmD&tDX=ym$MLpn*FtwX4@9PUU3}+W;A)pxN&_$u9$MqBLplGrK3A=I zPR&8rOnd;6c)pT%$r~lQIySD~bJF(|c@FNvhR?eicDQHLH z%tLEb7AEKZ1aZpgY%VKNC<*Ji7+)k=Z!E7n=o5aB-^Nyxjd*{@(Hydowrjw(=^#pk zvSka9v(+*xp?1-DAPvsXax)4aIo2p4Uc%(yPzYRhYr0n7$P zGqz*&5F2&xUvC?(Ol{0`LjXgH6zxwnrgMr!KDk>c{sCp5P|FCss`Z1%5~>eoLDPCJ zgU;ShUOvflMPdMo03Vjng!&@N)s3O>v{Q178uL+61GWljUnJF$`mutKbhZeHy%lWpPxaG_t*dne@f1V-9 z9H|7Tg0~IF+=;w%kQF{|eu&C1ap+?oTLB4YG>#$;z%MkH=4-wKGOzn!xUrKH^^OM2 zoxm0n8N4NMpU}3XYpdHoWjGP$);S{2#ve?Z3Z=Wu5aWlDM!W&NL-Zj=Pel{1l`-6 zf)O3%ir7G4k_A=_HuUrv@26o$kIl(&F$zIjlWGUC#HCCUyJ(&d;}DcDUEh8h<*hIZ zNI_bdNl@JTP;_fh6=}iR4QdRv3`01{^g=Mfwaan&2gV~M3dBRw+dmo~X3wQ58AVHk z!Ouk+eq_NSDz0!dA9W`zwt+t@e){;4tk2fl(quL@zyrkR>(Mh(1$i$c);9s8HxEN0 zr&cW_&T}=xO)R;jssLYh8*uT)`JwHUs}QKMJN>kMQlr5ruu+svWj(j5Iza?)8+ z9IbYcI^s~zf~3@bfV5NrXmO7IAU7DIZ21{N2`K)6GjC98Zx0oMj>|61`(RYWw`toS z=2V$Mz}3-0NnNA`P@ZdEQ}O}bFd7fjkAYVC@JuEypn05XFO)sK3+jTTSX8tDH|-M6 z#n?Mknbd+>RSTUdrLQA<4TFm0EMGyz`W(*h5&L;09w8%y{TLtK1%?EzxMiOM^bZ+i zH~avC|Alv?ph*}V23lpp+(g16OwHY(9BUWBZe&>k)_8)JW0Z_@8*wPX9&fzueZT`^ z|1l9TSx@Bkj(P;RQ2rN5s4-kaaRzH`1Il+L+|KaY+SH90`VuYLNFZ)x*R%Z)h+<1H ztT?^0V(6GsKj>FD&T2*yMCtLH_ZTn+*f&&_imlc9BZ@A=FU=!&2(yoDHIccl!Nt$85>Et;51;`$%Ki;#m1l6G!RE&Cjt3ixo>ySN!EsG0%O8R-ZLtI317WL# zqNp#BxTFeR33kKZZ0@w>_0$wgSN5vpF>VcI^mPMRCd^=sI?A&`feKZNAg(?YHYmrE2L66u_V$qyk*{RuLz65!*%QFlY#_G!||CrXR#| z`|hUxkNfZXR5~edrHTc|bz#G)a?ZVgOr^E*DqJ0cyj7f&ko~l>b{B3Q zXYzaBtZ(EdfZIf(zq~Ev+=FRbI3POPM|#`4BXJ;sNQn6`1`kcp#<$e389*SRfU3#Z zPyO8*Q3W;$X?(Wr?izVP^?7MVp6R7p3U=>$o>M3ohTjSJ3+%`I0pj5E+j%TMW_F5y;+6bNJsIYrfP zVZ(e`(9UNQDMYThw4}WGavX*HVau)WL8o?r^h7VpcBkl0c~<+x>Lc{){MGFrsdQ2? 
zD)T`arT%glwHCZS^CWRoXu52wZ{qEKXpKg-nu%xAF+yZ}GNceR5+rd;ZrV-aA>=I9 zw*v(y(ZJ+{W0mDY*!dG{9@abxw=GDJZVcb>c${N-8aBh?f|seBdoeVnwddq5Dk1g3 zbmU+!CmJ_Uy?%(mRntp2EdUweChbEc;nc@1(gz`?#e$+KH|B%7h-AC%Uk2}N>4)0? z8_XpI^a2aPCMXsp^r~Z*#zrTYz8gS-=|^U95R?JWyF=5_q8qyDR7*%AC=NkNaamrW z`tAF~lQ)4A5lafxuhaSNRK!7>`2jW?S^4k=tJ^V@W&2Hm&CL+qNVz_oWOW^glV{)` zZl+Bm;)s4QVJ=URf8^BmK0j97@)7U>c(ZfH1dDp%`Xt6go1(C^%st|6+So;aM+`=f z2(ow9am}+d2r-85sW^}hgW`ivp%DKk!bz+O|J{>_-+sl0cm5_~CFD)x~n$i%VNuv*Nt*3nO7Y&!sZ0X400+alzS4c$eJAnVcPIMT?CbLtzVJbwmS z9Rvt24xZMechNhb;!A#23C&3+>kFm!$Vc)uHcX9yn~&S6Nkue48a|&<38ZnY2hIX% zZRqwaC;&$UGMBkmTQ^7aS)9=(1xj@ovPJZ)e@>yIfDOAP-nR8S=a$K~nt%SfW8n4q zt18PAk;}14xU34BM5UZIi)VhJ?d#lt&Ts<+lJQDV*%zP&Qy=IXe(*Ba$@je)4?k|q z&^?NrLEbi+>k`yqkL@~MrqRti{q)HE+41QmQ!XOZueJG%qV}NpKIj0d;0<!?V#n4p$c-sh|uw_#Y6*5!IBNE!}kC6{f@pJVW$c92L9<#pv zJiLwk7`F#2RR4Oee{h+GSsW8WVWKrXyc47l{VrnNRF$`nzrmw>a zCPQ>_KHWS3XM39K)lP@c6f{Eu-mZD52nzLh0QZB1siB}THkR!H|(?sdKcNA$#oI#L@{qS8%Js>_=5U?Lx z*+bV94~IBsXdboJ%4-|mFT4h6F2MdMBF8mIWs9hs?FTCdkL%8BL+i&-;p$e&0 zaoHm_Zu95L@+KkUC?o3-*H4vaR1xHT&Zd(7avo>eQoHak(}9ns21lUd$&5;uBJo?n z5lwU89OyC~-dx4`a_6T(pw5>w&Xk)#hw(AHMDUWsbt@spCqmmgn;(8iMr%3C4Ru%dr&Q$%3Siy?cIS@{6RVpM4zCDq3-AhlA35IGv z_GA0#5p~68A3t};#B78DlDTO1uNw6&yRR(6$UPn&c$v>o$LZM~k2?n2pY#aTu!Tf3V;ChAjUc%= zIlOnS4hmqyjL-0hK4R|=)?m}vh3lRmEs3<`0Ih4&uMS8{ZH1A$QVte>8J)CiuIa?{ ztnW?-HDXT#fuWb>yI_+mS6d5&qB?pbMAvG}o)aJ%a76zOV9=G86WgG(a_bq`LgW(z z!p38}<%WHXzbN|&p=B4=HEE*z(y_EK1m8~HHWcc$%$_aBtxcA1)P!9D6OX%wy;L3Y z!I)hFk?AMl^lOw9y|&*V{6aRZx=2@OBjL>!$EN|QU z)&V#b{2!N0MYv7EF&fDfsNJR~$3(oiE+L~E*D#SJ&}&a#NGCA){1mJT;WE*L z2!H0${%?SC2QpKF^8ftaOv)0zkFgOIY7oi|ex$oC3K>Fge_?&C&*5yICf=SnaN-en zd?S7yPyfD=Xhamu3v}<;02ihonp(^HTS%O#)Ny2V`Y00NIPUvQJW{vnJ(sx>DeEe{ zbCuM0C~0C=?pe_F=J;H-13LacIiMB>4$B_R@G8(P%;JePR5UH zH*b3$6;~V9FJpVt?7qNPHWtzR~j-n|riB&$zo zXxwQv!(~qLPDqZ@v^W$02w^Fl>;ZvxI862lRy7q zJb=$mZaa_5z@c*?7e|4V^jDHMsmgv<0S~P9*KJ-`QOYm}4RM&k0Vjxx@lXdz2z^0M zw`Dal$ok=l~-OXTDIW+^~?1|F}8>1E-lyy zV4FI=oR6XCFSak6S$9Jexx-= 
zWKqGGlp>h6hz3iaRsD!&x@@xwKeWFLvQ213R0lTld)ypn3ZX4>e18W)qNDFe0<`K< zgV?t0YmxMjRAxT9q{buz?wZfc3l2>q;duLryT9#<;2MOQT?Na!k!mXX9W26Bg)+H9 z%C`KxQLG~1m`oS+z((e4?1 zy7ZKrc;zO>Q%v@_*no8T@?=50ZUJiH{YET3WjP%vWYR953XU;J4*vj!c!rKPN;D{Q8l0*3$>JiEN~vp971oTAnb=V zWi6;NlyVl4=vDj{^75Vy!rmPsT3K9PsDU)mcGowmF*>o~+ICZ*T76=f8N3cOi%Yjf z)yxY(D{TIO?ZYlwAi}_C4HF7kO+z`Wl;jf&X-9IyLyCpBj^}Q9-E%H6vRA-DtL&2u zix$}WJZ1iV1NM6h>E8F4CY+?&M$Ep>jmNSBLDyRB9t_j(Qg?gxDXXxrF^1se1chDd zHLrE#t5$6Cuo0k@?g5EJ)R7nd1kQYr6Audih%`X?~52>*ZSQh~iP+u)Q;d^$x}=A>wq%ZXt`R&L)c-gn)VS;FrhPypa*Dj|rZm;8T4W+?Uvb zPDX36RR?co&LNJl-&RC6uD;bvTE>f5#a=eD15tbnu-@L|J8LRnJ$DXvE=q zG(DoNfZD9C&X}3OGpq=*HHoH@0BBv}QFLTH6W;}`dF7mm3zc=|J@jgtIm6_dOHB6i z4;&+8{VaeqJ!2jT&4frnbxY!NbC$=89)4Z8}Vl z@d3$JnDK&#DyH-Z z%j-Quyr1^_iOYfr=Bi*!n&=jDyTVf-#ATdzY5sYf_-7!1^Tg~IUO0H5i&jw z;sz9H72-Q7(cgl&>N*RoQaF9ay?X-yBd`MWr#G0hDZzJc&d_q>TNWhMag?xqOMN8g z$3dcqyirDu->;qJX1~qTrIY{4Qs4g_Cf(k1b-3o?Wm0V0!HuH##8HfoS5bO!j zomX(Mi*TJp9E@?Dqjm<-wI-lZz_Hs2|il-J+&vh(JZ zq|Al@@U>g}U1wY`c!2G81<^agp8MX_NkCXtpmsRFDr8T3uu2YzqFF-x)Lt+$I`tMs zPOjy=a?I$vo8fg5w@d38E3gUlu&)uuJ*Cr^Q~34$I+O0MVK$aNu z<0T3&fa+FCjXRftjPng|1no`in6{Bmo%C#w=Tturv%yk!;RaUXIO}b(ZsA!OC{Hra z<`vEuda4;fzuNNrCCOHR8YR^7oCE4EG*uH%C_O4yh}&$3>U4?mn|U-)37M3IR+NK} zK!f5o(s3^!njqk#dYI;@&?gJ~Q_J%|>*Gd}3-7eXcJ#3GTvQp3bgPHysZ7Jkfa}hnwc;6$P6e*GHAo0O z--&B6nX7Lj`cFWK185cT>oy+$R4pl=5`8f_{ynD>;-~>p&-{-FI>)DEIjr@SZ6h3{ zPF>ka{GH3>^w{MT&O4;v4>6VtCGz^|j0tb$>6!AfCFeT2l=FNUlkaOokB#7ATI)f% z;?5lk=ZT(ZmNQfrm`$m8esxjsn6$W-L}7A_BU*E{80E6gIPL`Zh*KQ9U%_NnXVsdp zReVWx)ybJH7Utf1qLyl;+%8BnN{Gzpy+Vy%JPz6t`PDeL(Z*q1mpWU7MQ=TDV6@c^ zyn#+X9=2sY6DGe3V)cr>`bPf6Nw96<(}P=8ha)PO?T3eoagGAmb{g5?Eq6pn;unFy zphWimnoRhJY$(B^bZD@%gqh!N+NS(0SFGd*=`^tN(Cu9n6FKD!(#LT1ygE#8L#GZU zRl+PI(CddLrCpF+-4SYtVBS-^Kkf0@6(b}07+wF+nJ6smavr_qByxbb{V`jHwk~J{ z=o@yKxygrABJ56UloSKrcu0cm>5YLwrFE@_wo!Ej3_+R)_MfY8MRZbfor3XI<&Dz5 zmr8+e%%=yhRfh+=d5N?z0L`y5O#W}tW*Q!rBs<&`rYs&2SV(GL^Soxi&&IBM_8c$Q z>VtIyyhXI=#BqX3YY7Zy)opgmr0P^0L`(8JjYc@c=Af+9bqoJWJsYBto7fln) 
z-%|ro{vn(jx0yMb_(@&=L(YMc z`AZB-%AVi9;l^>yo;>7sx?+7=BdgB_6#JKHBXNW@G-9rWVNryj088AnQ*ID}6v^HJ z`9N6a7lD0lUXwHeThe=7{ee(AwB@v`f!H;c(bPa{HDF z$Lizjv=)vjNo~OiF*-1*kS_~-hf}FQ(Dqc)u0_f9>*P>KkU|{62J!^$6dCmHtjA%i zCxH(aDfY*A=b z6jNpv*DI~HkymqK8sr8N5xY=;0g!dVcu(W9D9w521^`8 zG`Y?699|F&5MuH7S^DxU>L}IrSyWcS#Qhfu(5*aIbQM7ZuX%BnMvxtqWZ1Z&J0&{* zMUy+D(N7bqx+=y&QegUKB#_^Lp7J^Jd?X&{reV3mu}{Mj?0Y0}{k9=I*)1}44%p%3 zBQF7}-g7k{IVY|%?KOajJo}Rw(A(7vNdesEHAAjYB<|)|yaiZd;_ONfcJ2;jILU%w zG;nTVyoe}5DiN{C<@NfNTihG!rRlCFNQs9-08MyvY)9`hHNY2Uhk2jI+`tgWwqt4p;jCP|dnbCbK${td8bsI_@r=|K30ENVK#fxRXZ>*pc9Vx= zz71eUt9fOtf@5?$$^EFu86n>3>wu+PI$j*)6rl$dYb~Z=x=Jk;>`M-OW$`X22%;JL&W;tkvTQN-V^Hsl(h`WIav4rqXYHq}RB77l1=>Iaf>#>=Lg56B#4a_OW4mq3R% zFC()ISku7bVuZQ|%hYnVb4e4yc)NUohIq?JJScYQ( zwYlqH20AQE5NO;cI-_&#X$@;~N`{i;Nz{2;nfScyIfSjkj94XmQ47Rj)Oeq}kJg13 z+AdZw*SLLmJ;B@f0A{Kmq_QAQ2U$nbd2G0tUE&nlzhC#3&3A~mQ5j9*KT{}#5cBq| zyKm66%3g(j$loq&{CsnsmHsh9R0URM!qr{q@Pr!-=-WUG{eYnQ9Dw?R)*wZ%)J{2! z0~nwV97X=B9;*tQG;$U;wVw1$6@K}Cvz~4-7@X+kxJQ&wOO zX|j&!#2SgX+%YK35Q$fWSfP*GEA*y`%n53xJEF z%TJWQC=;e12Q9XX_Q<=n!@QvmGEY6}7bt8$^uqJ`k)?D`3T5`kH@`3EnYO*p-G04v z?RKlPV+Xt%g^SZWk=AkX8%vD>l)7yvt&`D22*$k1nEpBZwPV5F?;?3-9k;ElWRpT) z4@?GYx~$Uxkk!jA0c!U79J4sbhH-;duJ^6djC^|%x*(}3(wXSMq(rsW9Khlkho_OF z*Ykw^8OwkCBm=wPQ-dSyIdEqaZq1YrTi@M(5`Tul3dp65b8imXu{+CNllqG$=IZ&5%xY0lwaMhlo~mZfi3$X{=S|7(V}LQW z#(WuGp*UT?EcP;b`NE1nGCYehXK%dsCQfmTC_JDTqB2O>il3_c%-dRN%|0SGa+Olt z4-|{X$Yw+RmdLveh-L^xXPW*bQ0}qDA-UsdBthC+t`sSQa6CjRfvl$YK+}_Mew)Zk zHM@mkankogk?6g)VV2stNGQ8!+A>=K^^9JCLUvg2GnSq&$$fIl{4O>Lkkv1Ad`9HS z%Y>ETqY4}GW!O@)1*Ev*-nNa1`D8!qQ^sy;JPc%kB(KIXy!O6bU2hJo zyGMU8#$t7i}Y~K=rc}00`O$zA&+Lgdi2m{4XDngsgz-!CkC_#es!d?usqfw zPmlRP4-)WAXVQc|WTHfLg-(Ykh5};n5*Pw)E|Lv_43@++mjjx&wzFl7M z$xihS-1wRhNCq55RJ7553z6C*OPoB&tdenUh7JlugH zmV;GF9AHpV=t9iv_8fk}e)NUYP>~24{rf>?NYEb+onO7^@V9;pr~~5wEEht4C2yPm zHZeZBKi+e+frn~L5q~{++)u+$#Tq^EGbD))Q}AQVkL!UCks`__r$@J&VkrT06r18F zs}Ga-e-?(p53*~9I=U9ujw2VSy$Lw|(&f@Y2hbK9O&rYIR(~rpCc5P_P;VmX!cl`)_%ueU_jw%7Xbf%q~I`^Hf)l 
zOVojXzv$%HkzcMy5lv zvW3N#r#u&vwbvYbdeG*Y3$x9(tqM2pNx6FrmCv8t!NejvHtu?Oq$m8bU4-3lscWBt z`oy-ys#2G&`TI-FOg1Q&vhDA$>bqCP|Ni%Fv;_qmfB#It5<^4(TYr6hpzKXPF7wxy z31w5m_P;+vh_a&p?~ho+iq-w=6K>sFL7{c|>l0*T=sf=){}{P-b8~ZBS$qOp9_=?% zZYZIZ7QAz;iE<;;RxCzj*3pX>_q@{2z5V{Z%nojDE)kK9 zXHzx>2M5RK=Zcxv##}x3>CrQ%(VsDD@#>}mA-Cc@i;ASb<$G|}CmEEEc2^CLbSeDk z>MH#9js5G_uQ57Vd$hAI`u_YB_U`@lMHhuXt7k2-5W!4VIw)+j$VJ2EOe+Q^rgu>a zAxfH>F?CLt`@SAa($D?;_KNk2En@3FJvvb1c*@>>|F>`7R5K@k)f~5OEvVlglA{o^ zr#*MFg|BYvwgf`VtPJ5NL@=lVtsxMp4a zUNk-0ZTtJ1=;K3IRt*miJC64TmDzC|c2qJ870b)ZV_31Gw(ntPX6AQ=2D=uc=Lpgu`{M}80G%Gg?^N(sXZxOrx^yyai!;Y&34DxCl@}0+fHyw7| zyYtlJTWhwW5|~hmrh_(H z#V+@%{78RNkB_+NlBzZzWarm=#}b;+TO2UA58RerVPzZ+x0i5(U1Q8 zbQJ%5NjrafZgw!g>vhLS=is10uie5-pFrX@=NFAPR&H+l{o9Z2fcaA_>BGZLLhiEz zBJWE|;#T$sIJ{8i)z5SFDfQ>vE+N4YCSqOkE!*zPYg^+LtU^mS?$PC`bHs98i;w5K zv1-fhwl+h2Cgt9P2VUYX6NW!N1oSjz@V(N_Rt_%b)mXM{ncV5qrJbezwiEqYmd%+f z?`+sD-CY?@J}*HhYt`A5*Qp^V{%pP)`#)#sg-By#<3mxqoz7!DSB*>kep!7}d3lO! z#||0mPPG?S&6!-gcc1KAK=64iOrIBh)iZEusO^Z%v143(eDSFvBkH_VsP>eaSyr<5%&!o6KLHA7`GMpKJ}v4Wx1l2rwfyYu_xkV`8dZpEEqpa-WpvyTdFFClsYF^sTP1(+k zR!M3PCaKkN@7U26;jwV_<;#~%?%h=p$1N=_i}~eG%lfe|7Z6A))0d5{P1M;Ke(1^# zRci6Gqc^E8qIAYTD?)qf;zNpzaT?wS@o6Mm)bH_abTDuoUMB0ywv37C4vUq`ME@C= ziwKS3n$wbn*dSS1Sss3VJnzyC{FPGi{PZ~b`}>E=1@d_gPxRLD z;V2o_ztVec*X`Fdnq}2|wDvEn zoL>oRd3HTR`k+SeNlHq(4i(aydxb8}PZ~Jw_WiT1EB-w;s^@%se5TrOa@J17s>N^c zV;7}4f4*DU*UHSyXZhNlh6uu`$ySeyGKPe*ZQHlo4m2G|w`x{5bGdX$$l>=lt-^Tc z(Pc}Qs!p)9Fe1%0WLRuu-?Am(@#BQlR_tRS-Gs7=irSOTU%z5k*FMldo!TU18yp-8z}qA3S)_^7BD}+>V1zBRi0&Jzu=op{cDM={`Tx^Zl(zZ*A=J`fOC3 zuJ-)Jg`?8aG}YDBN^c#k?D4Yk`6-iwt);OlFNbhg9zJ?h)Y8K7{mtd0<-xlzYpbZM zZ}|N!+w0}&h>pgz^B*cJW%czr?%us?XlzWodGqG|X4OjvnzL^D`ud)-GdoHR>gXuG zbIafVIF@o`bkxYo$~Sc1`8ZbrE^MXHj(@aseJ59;N2bxJ(Ru?9E=fs)zLZ;WY!((4cR3~X2P6XGU#4Z1;d5HtjW} zs3+ zvYLZK|J4zFY>u-FZ<5!al=$MR%(f#+O81&1p3C|cp-NLP98)X3%epTxm>q!;U(Ia7 z&9Z!9ZuE4N@qFvzd_+tAaaDahV^romkb^v@#zxAS_3ZbvU8hsaGJk)68!w)f`lGu$ z{?pF=f`Uf|nlduj_zoS~jFtC#{(QUp{J2J9Y8WTN7Fm6R?`Dyb-nw|MAo&cvH`-NC 
z#E(Wuxa~DAS&<=`b(5L%?9AA&hj`cbXKiDDHoUpa$Ii~)*4gRR({sW1h&Tl&zFDrQ!cjT_?&qzYnICym7gz}H`T$!e$eKrDt;Lj zl5T^5{@o3`wcD`HHO+(ir%tW9!OZEWm3EGIW$!PABS)6V`g0@-$8`7h-o_)+jU!RT zeESz=n_faa$!Oe38A4st_-q~*4B*K7NvPuE$HEfd%?gL}&R209cAH6m-#M5&$$>Xl z*<(=~a_bg@qqB3-moICOoc8Y9cQjn=^5;}j1p>O_HIig?b#+Yz9#G|g@R2M--G#^#up zs*naQKZzhZ36>|LQ=XI*25diGrYno3xFxsw zzmm%1!%oUpF1gO*pPwK1XVgAfH~Qr`C$F0;0*+h4L{06DQr#xCj~^fGO7U292UcL# z($Yd6mOF8x5Gic9N^-FcRrGpV+CE)9y)`%(jpb379QwZQ&KQUj(sXro{hVTaJjzNr zcAas_-6M+&^C{+aRLh3sC4A}$o_FpbrD#6Aw1<U0R$S0XRV1#I5A;LjrvE>{)VhGEaR50eSzT0@F+5y=oZ) z0Ivy8gH%t>&rTli`a)=q|{PE+(n5~M63Qmud6>k-ym|s6<)4p>R z1*xXxHZx;pL2lXXr(gR~~|xw*D>9UirIlYLobWl8XEowD?=NL$Y@et&bJDV>*1*fM6^ z-2pX}&n{DdA|opsJgsYmlF<(ET-yHbrrJcPxQlS6RWqH6O3fubCIW*0ZO?-OqAOOI z9!adnx~({JB5pHdA|pDB7WJ>Zv$C>UCe$M1 z0Q;HHCLEKOzun|sCSO~&|H5Yq>QQNtH&gl8TLBgyg46IFM)Nb{dj$n~$2SFiY-rfT z=|07UQ+t4}9S?K^M=;*lZlB*8xEJ_)DRbHP3@EIi>5d0xp(gD=#KDEcX3B{Aw2==>Kai|QSx2) zA3O*^uY``M(_?X#(;P)Jx-a1T*|XQs=60ZxIL?gmR@c$#}SlM zP&n$~AbkJ+eH(-{`A&ku@xE)=+3!xaxSa0N6tQkmeM@g>rW&ull9p~o0e)(@@q7e1 ziD)uUhL)CN8HU>5)h*0+qCOWPcRdrlbrOa1g@`iB=(b3q3w3cp>paVv1@h-2ZC5Ze zGiMm*xcAk(^e;;Rye)IRXlC}v0&S@>D$=LQ%HXD^rr--3ePpW9V9G~GdhAI=*W=@0 zY;0_H_H3lO-LYfGehs{-HT(A3Hr8GJ(M8~fqr$}*gJ!w%{|}gFZlvn z;ykrHX0ol2Hh0lgVqwbo7k*8~qt-89&U76F4!Dma6L(nT&>@|sHg{r(eCJK0XV;GiJx`HnjRs+j^h znTnm`%joE+;v&Mr>H|*5*Xp84(UXpp>ZH6G`Cr*ybStYJf1{0*f2^UZy2Gk5HLA=G zV7XQz1g~~K&v{(lDxyBuby~T%t?lWhQTR^Lk+l~Nura_gH%Ewk)X?N) zDQaazqkQOdLjkV&4CA>ZV#|e7RKJjgXLy4udVe0OPz?)*xr%G?HG8JA%z1 z+z(DcU?!Re;3OWR5~0?;N<9s} z&V6zIN`iJqoT^1a+04YioyCROB-L(kB?S0)r9GvZpnyHxZ+5>)Vs>J{D=SOn(xprG zkN8AISvg&YmSA(1qw5SWiMvo0AzAuR(0B-ai4j&>el~`0IZu%MmzXo#!@sWHy!j65 zKDfOc)$WxXhcyQ_vOE&8el;;yQBlF-ub1b_dg8!TA9R0gI}Ikuf8=2ZZ02l0b}nSgGJ1KFXWq1yASzU8`X#N!KebavX#PFRsV zhVzTw?1m=)SI4}&(L#B>55vMbfoN(icq>@ECY$)jvvAA^T^vJdeXu9=!B}0UYC> zay?&ERMZkvz!7AEQ-R;b4j`S;6P_ov>hBArn*85)^R+LkNQwi@gB98ME{HEXsWmdHRvpo zSvHHkdLxrE5yI0{%R2E>mIl)TkGGSpf59W07r!Y56Y 
ztZ}*y+IB3Jy}N12p+korK79BTFGJR|@`*STnAIR-0o~WDy}iA~3{IP=CFyT@u=Awy z>yu9ov)Xl67K4xSL6$iC`RNgARfyaU%}mSCYGJh{N8IF3o!T=l`bcO*)zt5!TvRoS zzgby_c}=wMPw!txK;Z1pqmO$lLBbD|Xq;tDplH%>-OEWQq~;%m2pFtF70X>VPWgT< zqF3Ivw;A8FsTRUOqU}fA{WTkRB8$ke{=-SeC^}E{$7YJ-SAq>+pZ+NBB+?+V1=}pR>62ah@;Bsb=wGc5aFdyN zMpYO>P&JsZ+yZ39C;{sm+48Ote!+pFrUpCtvuu$o+fnc z`0>q_lqeye;C?2mFQx? zvolIcPg8SGg^QW*yF^1n11U6^r8*mT;F{5y?p+OT3TnvrIn1jm3 zTOg&(6+C2RWZHlXx>6D@83A(xj=aO`m_|StdWQ``m0JH=2vM``UuK>j!CriA8B^_q z)7%uu-=3>OLqm+(YIR*bJ-4`y-UxKeK`OVgvx~pJ-YhsGf*IvVY zpvn*6XBA=e?uObgOKRMv-ignAPz%t$s%9!gDWe_K9GKdzoz64bRHkt5+)knq<>VX= za+BE>P<=N8y-A!~CHj;U+fLcU`uhO^R{?o8gV*;C3DNK#(0REsKyK;j)2B1s2o6X) zUuDkDM+gkpVecpE6AnVanvFoAQ9?@d{f+w+BP4Qjg4HfoeELKP0YoQW9+;VIYG1xD z3B?)1e7dZR#x77;q%tl+1Z87&6`P+479p#hHU6@KlUTw>a!C@ z@nC!^t7A_`NI*dR+AJn5EqyN{B5CMhY;0^xiGTq9YYH4_iv2I;9F+1+`!B2}pak*D zR246?tq^Ey2+3&M-n~j@tVf)B0%~b>!nNM|6VA5|#54 z5)y2BK68P1)#z%y4?U#d-8)*~4(j4LLTCJ1`>%Yq7UlT*jT=RfQ2aS1)tg1AR7QLo z#S^_|VP)kibXI6$OA(>ev)MtFwY9ZdX`JG26`$<~mw@xVOiA}u1?X_>sT=_Bbo*>y zi9aW^hDQB}E|p@4#uv!nDYQ~8#CHar6Wzqv3nY>Yt$Ddkx$T+Ot$yH@&^xg~96jdj z{6;<0lrTq$+8JZnx`{wF=HILpA5=-qqA~z)LRDF_apTKW|NhHV7vD#Z9&HmB&s8+N z%=f^{i&kR#$6D9XPa@P=P!-0|d(i}}L^(VI8Xpo?8|1xvay!<)ZW7S}SgFmFbe&p# z^yVt>&`{RiuZed80|P}mtX=SPwm&|s8SAZMy7K$mnWk>^5IDtBCMG+~AsQ9(p-e<( zt^8MAtz;9qco(cCR9G{&Q||5(gt|Ik^(1!<=Ui^$4wmIl#9eNHCb_Vs)}lFcH=e)T z$<%~LX4{&tuk_*_Jobd%y}M$^&YkA#Ytu3_OMu6c{JGV^nwAunmn)dQ#pW(dcXN{7 zzEd-`z6bi^$~`f=(k^w$-(?q7j?c@^&c>FnL4;I3ee&dTZ|!n%=dsTXNg(3=ImDHx zn$b!x-?%YkL{~pM4l=smmP2%)?1S7$R3f)+#{$x5AkoQKZ~r%o2>^`yA(oB%8&bN2 zZw3XaW&567-wBNzdf?LXpnz+5DRS%vIP{~q_#$4$+8e6<7#Prty!%pUCENZ29Jj#V z?~+ zjpZfh7+5cMUMa9eALvgS&)t-6_PpjGsyeVh;yvZ0nc55s4R+B>ORrwNx>xYa#rnJ8 zA-C_@vkFP^YxaBVcQ;lJ|9Tk_iHalUtD=t-NEncUf&%K>lB^@>e<1taessXXypCWu z(Cil3?!R7cL%D2+YHO}Ar~le!2`&AqT58%uQvLYb}uN$y$QV4H|oOcKXj@Iy?-Om*>ldu8^?gpeR<@DIf#kB-Xmn;ZR zE*773Z3XZQLv-o;tvkZ`>P-1r;`ey@l~z2hmIERfzl z04u5mjIFL91&-?KvZD?q2r*chm|O>-B_0AiVnW;c;3T3hLoiIv%v5je29AQm;b~!T 
zJ~xgZ&}EQ(R8TGhslCplQ7>M!;oS*g2KwMC>fxr%o8L7xvB5f$oR)SJE`rYqTKnjR zR!AY)60Hu2i%VR53wmf|gyfVI0`J?}+K4YB_%wI;I?$nzeqTEbu#pc0y5EK(3HJ&f zMwUnkAox(X72^%RHbWhyMc;H(S(ybyTtHCJGGx?tQH#b@?#GWG+gFanGrm77NcvFl zX`thnxK~?QT7E|0r&~6%pp>@_9vA(HPHwofbQl@ZS`7&)={*CM%LB_Fx!yV z-2N|oB2|}snNT!uKR{&t=j;DP)}); z7a&mh!S(ige0sJ1r!{-5IVf)f@I$nuWE9m_mB z1ok4wB-9!hR-aEOmg`>gUm?(w_y7oSb#1)L%T->^_^o5If(H&9!-*qOJVgpxAry1I z%P0TaVRt6h)YNo&DfztjK?KdxbSKFtca2+mgv8puS{_Vi`oPW`PVW%wP_LN|GYan zkpGu`d}G0V%~=A#1_aDc098EuUR9NJcH?o?VyJ}6ll<{l;iXIr4694<$7iR*+nm$> zdPx&Q78aK4%nO%u)iMgMzOg`;e+}e5vd~_KeoYi}TC~NC-R1OJ_PJ^s5MfX>&o(3* z{TmQBfRGCT5;qH5ddh73(yR+;fm-PeMVF}NNO>p~JqMJc5|N<`K75dS_!hpI^}<)2 z%07I6Ovwp@AVhQR#`0Jeld78%tseRw17VLI5t%|eJna5`8Z5c@XjlIGt&3gsdKXdP z)|Uh_x8=F-`$zKrl5KasB20uC9dT{*sLYM37pWVi{#jU{lbhRO*U1KL;*Y&x-8`K5r#ni;aaC)SRLV-{TIZ*xgwKz)S=$DtLIj3f3 zg3H-qK7r;$I%DI`GJOs; zp?~ldiJlngO+GRWZb$EiEVFK*pG-;42} zdYNkd@>F=)fY)Wc#^z>1s290m3C70kAB)=2!TNXU#X-%-IZ2{AavPwLV z)Z~quHpSSq^yMwgSOCq(9rl2ctSIHpul|M$zt2yVi8w;QiWTnLwSD{1gAO(}dmC18snv_Z0Lc&T z4+3`)B&*?}_M@Q1jK4%4gW$9tc{A!#1u_LgDPSOD)G)Of)WG$p`dC?Hj;lymj01p; z-T+KR@7~!$<|hkA1k+P?esB&D9}%SQ$Kl~d=g-p{7#MJUKC_14Xt+o?D9LAw853y) zWqMbz?PmaGL#%5yP=jF=i0|Lb$?1>2X9ZZ*ScAH&ds4wQ-60{Ka`@ z5H{ixi*t6A;=wh*pt9Ih1#$XZ(buK_T@6{;;ea8Ig4c^+lT|y+?RN#pB|~GUcmh=e z8?J-9((vL%KRoJFoM#FJF+{9|FP&&#UjL_Cn(|pQQ5Uud>(4o-;?s`{V9$!>1 zWz8WVAix!-RwLn_>Hx;eyW(wJM_QGH7o>a2)0h|rz~1;D>(7|}wJSZl>wz`lz`8Y- zzN-zN1qjK6rzGAhf@8u&ZYCuOoO*olS%@g*DR}>)VYGbX!mlzRfnM((K(MmSD(qk( zMnL#vaX{60rZ?iKg1@17-n@CU?CxV2TMg%?hAFovqnh(Q@;P3;7)j7dqXVgp?qogb zr0P@h5n=7!cA!k7+yJ4yCGcUSqiq^QC~cg%e)A?@I?19|;6#f$moHxqR+;EILz;$$ zPm5erb)==QH37rVi$QPj#*uqhR78_uQGes+P5NtIUKuSRLxAhVX{w!Rxg0XLDwt*R zcK-gSeZ;pIY}~SCcY>Nq}}T!n`CoN`y1ss2Ow*LS6;z!coW)`*DjNLQq$4`LPBoN zP7ZG0yZ29^gKUKL?edc(#COp!8g+Y)q;#Naes=n0%gp+5;MVNewTqS_G1W$k49=7O z5dkVUaByS*UZJnfxoV$KvkA*WDR${UU|uVi)L9;)<$L?Ka#Qzs>4EWm2M+93(~@;# zU|^8KNlA8}cP2({I9jBErU8EiK}INs3LP7mMq&ifAp<&($bOyZ{fHY5$`U#|e}p9A 
zAb=c>!l$wf_4C2NmKn&=Fg`1pMIhHOx^%16<;q^|n3pyEp|HEA*~jI-o%8{fNUm)i z9oK-}mr$CrteFw32gSsSK73$6R2`F&qLAJj-YDs90T=iKUVg;pT{lLr8(u=a1KKAN zk}*-Yku2afOBC)qBQdp$bKO!;EE!{I?*uKaCon&sc-7|$BtsNG8wZDyOa~tT92=^H?ABi#=m!w5Y*qZdk_UK(#j(JoYt3uDk#U*#@R0$%K zLV=O(p@{X9>b$b+-rhz4^x!#)Z*scdLLV-@gM|wz50W-SDOp3qt%L@G$&k8*kf{wR z#`K;YmtILoNDx8iuCFg~5LkYDO*E7DXx?C(^Z{9wYdV;>2PKVi{eB2|zG6sjdXWZr zV*4lYAR=roT`EGIU%h(u)AWk?e({)NcQ%mELw6Q4n$vY8_!E;9kD$D@z<(E~TXMv5 z_zt=z@Fj808ZOLDbI-UA7jN8jzBb5)*dYupA46EIs$cjtIRR)eK*9gF*}5km)wSUkuRxfv zRZ)s@DWK=w@xJ;vMf=FhI?=H^~5nFK}h8O)ho!Xf;hABBhOosGx^ zjFgK~MlLV}b%~@}?BbSxXxcC$uE5F(c)J+%PT;vQUv^Oz0IH=Fm?~(&B~ib0L%Lt< z2m40?JiYK*2s-pPa4o&*G1X3wP3-Msp$4JvheK#GBH_dZ9OQj z{p=i3>w#~UJ$3Y#HVwTw=I|G^zGa_!xlf%^%qivp7Snw8Yl7BVxY(9ZpfzoUwbis0 zlO(7z?dQN0Ak!HQwG|Q-(6gi9I5b;*`N^PTI9^}S=Ti#6Ua0*Z-^E154-gNQjEjRq zrj6yri?3|;nV6W`r@#|OY1^5G9lEjyO?!J=nM_EPB#sORL*ht%ck7TYXx@gq6hCw7J5RiV~bA{YmwBOK(r}V?8aoZ<{9QR1jrc#IgTOhb~^IWwqBw6AY%a<3Fg^69J z;*oTY@I}V`J(yMPlVERz6M?Y-tvEXaX4wyESy{w+;ysJRLaHJJA-Lg?9~hP{TV{kq zX{wz?W#*KS(bauiHi5+d3=MD5Jo=U8q340!xoKPq3k#*qj!svHkANz_-rWHZi=(Lg zwgmK1dqY!`wpD8!irdkKw^o??fS#q9UB%0pdQSa)=!b{hQ`N_fmz~n(NaD3SDPmz2DXTQG#kwvwHfp_I5n+~ zVWf2<98~lUh4;A?l%PhUiltRpIO>_0yttPVRRITKTX*-aZ`NNTmyux?6zZ+~{3`&X zF@qKSc@N(^vx^t?xAp8&L9B6k20nTeC!7p)c@K;hhVIl_w5(@FK!_5LJ1Jshyu@g< zjK%?#l2C;}o7^USA89;N-6;gn!+A-MrXc7|J_ltJYml4B?rE09ra8Y4i4l}8+qW9uzkbAKlE%pm>vo%|t-(~%D-TW$mbOI2b ztFUYL0s@wxzJopwo^?gEHl&&|f#&tX+8GsITL!|=Fe>;pL_i_9B4~khNy9tQ-$)_z zAq=yRS8__QLv59>B5Q(~0Ls&Yw90x;Uoh;$#OjH52oTXZ=l)2V(K|Yd6be%^RJABK zAt9mbofQ>|uJhydl-pjB==S()a*_tq>f<#8xD3hZ;Ogn>?_U9s2qz>9q8nA1g~k(B z1|VsL(4R^CBcViCWt%M@9F0-fZ~@;ETD4mMDlPhPT8vHzx=dK2Ut;uM%gTnzEa$Rq z`_^AOwtlSnR=+w_lQO8)>RBuH1 zu?ThAclGt@^fJ$t`fk3tjA?@u|Ay^6JaP>ER^Q*K_3SgN-VxND1r8dMJrqy86VKA; z`d0usyhm4@L=%WIdtjpc!-oQd7iC?DBEN-U5}GBP8EN4}YNcjXq}`L5a8Bm;wIC-Z*1e8aCpk^Y=O z{e_f+0%n8%zx_aCJ<|&5gR6yTqdw-f2L#IhvGKx)aE$?AWhk(dJ#L-h~U+HZ|fme50fGfh= zSj+Qt5abo6(p1qC2eb-S@vNe@0Ow{|x6)BwzkSO$7PM?8^7->MoSXtCt9zAisN_A4 
zR;BJ9Js4Go9dDn)W3tV1ie8fWl`Ul(s~L)Ukhm?~$bf9!_f139e~h+wc0Qkw2zd7Q zM5%qpz`$;HAUa*1wOTDW|HMhTglE-ieuKOm>7Qz*K!QcKN=mC?(%KWabBm;VPLrZ6 zEDP}8&{B|4Rq1%)m;hi}MJev$J$`r7v!t$&L-4f?@8mnqNC6z)z!4(wMQcN9fiC1J z7@ap$%E~RuW`GWiFsi`KA;2`9+n3Jk6Dx4?(f%Vq_!J7jtn%h>hddSzBLgm3CplPI zS;;tn0|6^NtrGi~>iT;m63}n%e|2)=u`1qeV89nHoaG|Zle@PaE&6MS_SXvt#soJ~ z+CWG=#9TsZYO3l4oHq?w)|{jQjeY~hm2NBm*l2^;Cg!iDKH|~?EPL|V{+KsHkELwY z5eIiD>*xzGv=H#*$*a%Z$ks!{!)vZ_Aq04%%um%X)K%tMt- z5j{07gQopxmMG{Rlp}M_mHKV*j(C<3;q6Tlq598ki@G@OyH2PYjGvTep}Be+Bl&>X zNRx>m;}qyTi&`9k`gyTG>t`_k!`SFc#;4#@R=A7R_hOcql8RS;3@K+B1wXge$!WdH zv3S_I4q`IF#sX*ByF%fG^|!y+YWqwwjhE-PS>u&%A^HxO>HV;ak$9gN$XOVR8iITK zeQ9YY;);-20H*d&>M{EXJFVab3|5*947B9xFl(qf|NQwA_OSiJ7r))YS}$)rkDVg5 zxnvX#fG;Y22Uh;pojXTyCy*vZ z)Ko!bYJ!DkpkOv@HAl@2kjfK-t^R0swhIZd5XqG- zr>{14E!+&(`SJE?#nJP;e)Fc|^RuHx#&Km9Z?d!7$>h~+ zP)T)^0vY{-2i^!hH21YCW{$&B&@b(u91q3tTTxjVBbp{DjF+L6dZ~0y^&FBTs@|Q| zx-Q}1DT4X6t^oit&K@tDxVLnx$;}CjIU5@eJ~%CbX{FV$W!UsL2(rUx^-LVDU#{}| zw-z}68*N518e8qWmciO`Eo_9jq}5(nSeSI75G#iWK?STLgQEr~3=JP29~AGxnwqr) z)04TC^?d3*ruz7M;@+W9J(6+}8qc9)Gy0oR-|;4aR6#TqpwroF-=l=A&j~4l%yobJ z{++Blp)tYA&gjoR~yzN?4bmOdoYM<_;yK*kp!sR8d?=1|cA!#IVp$Te`1 z!U|l7IUgj1n>f&qgspA^<+bCvK?-M;BoyMowjMsr4msBg!`BcsJ|`O;y}Matm7t&? 
z@&2h>m}3+c??|loM1zH&kxF}8=7rpj#1Gf8VkE znf^a_3j`dIQ&f}z!+{4=SSqN>dtcsYY=fK4Y^I*Tnl)d-68IM<0=}G2@Mitb#J?(6 zOXk-4W`|tVZ`FnWUtAvWr%?ajf0crDcAncTSd}9?Sa_}c{romG&CktI9c*RP9%G7X zXomQVoN{0|G$Mb5mu{bbW7AH9qz*aT1{_D3{;F7;j`z#aH6qD(pm7PxWzYR@dF5Q(?;nijPcWRyG+P-n^L!94&-}NLa%#aPr0Csk{!R$2u9ti}} zg;#89dK%n8dTPk?z@((4qCs~YM}?eB)ojP%$fn64^bI{-<2`+Se0O7iU%q(m932Ke z$tXJ+2>IQT8(-gT4)vRiaAG`z$SsLgqeDZhzg{pk^~UIcP*OW85pnGc+O$dSw{Bra z28dBlWFy=N@hLy|mncG}JEN+q`cstBh73_0`s0ddoFxUV1-mbS|bp)gzD8y zHCYboFRAA62$-Q1%yC!1=1(SPNglx(0OdssEdfRe)Ymhwjc5moF#O@7KuPkVjTJ1L1$CT>LHuxuKx2@JRPGt~I!xp1wc4^ottc(YcC;OJY^E8x_91 zg)avfjGY3v1?}n6Rb_hjHh_S%X)9QAm+gSOaVxRJ7TiwXwc!qNYruyImYPv=erzpJ z5xGgA@9+eok+{GqJZck{Fi{MQe0>(EABjmMS}{~PZW@yqw}~qpa8!dWUE$OzS-G(w~R7KxkqRmQ)nqoST1Z2{-% zkGY_206;Q2u)CsBv4IriKUyhj&t+``wL%Xq9$R=) zmeSL2#{@2F_A%5MjCy#%o%QU>V9O>U69!B#)cKJT1@js!0B#9X7Idt%;QQ}#h_ey$ z7cripe;N(L0t|L!MA>nopB2de&sZ@irNv2)MS4JTJ%Lct$l)vIn~$R6y-dSZj4CQr z7v8SQhDweM_vz=~N|K-hqmZg*9+#CZf}loe`}s3kScmXgjG=qge2~z;VlLzxOsfPU zVR@3RnuX!&BRG-_6~nDZ1I$9knZf@liT=dswRfr|pT;X0gi6`0P`Wxs1#!s+!KUct zF5(TYqkrI#biWHU*NzAWZh8l9hn!PGB;wBOoj+er)c?50i>V2biQj*3b5- zE=whC8BiHt!Y^2xFw$6podwmIjQM9wDj2sHd7CF4e*k$CBGO8>>mbt1-1H3%4Sk-D zL# z8CqFceF0Xo?{8Rxus#m&$r(+}TcEMvh-E;XL|chc$qGB0dV=PeXeSDp%#>D2f}OUf zK8cguXo4MuxuuiTT>vH+2w*tBjyFUXSolHC$+k>C*O}bYK*APO#HI$9wNcU0q#JL* z)gwNc0NNc8kH~aBrogYmXGGny9ZVN#iBpVk7ImifjDs5|bLaidS`c~0y&yLr!dpl-lmf)0EdVi#8{LXcPe+7>Be1ym&Ige8*1I(Cp_U4Pc+zuNZ1#eGZV|~2ptHr=~p^)8^W&r$B(6J z*Ehn4yC6yiBQf1bZa5(KSU~^4jD?rbr(G;Q1oOGh{Ce2~81FNIb4q?0NhpxBh}&{0 z9bMeYd(JK{ov?!wHVM(~%^~g_XlA9he6nG7QQTvJofrnLIg|TGLJZt}5U9og$y2b% z3SRlGi)y7iaF8^TN=bm~f1JZ^vkqj^gj_L0A&xd;A;yQka7e)1Axr+;2ovNRj9vMG z)#c*hVZaSByWn^eNN{<7YYmn5kH7g(@K|sAAIYx8oENPdQ;%`Q&{NQb5h$*-vI3b( zKtBh%9%K^N@j8t^M3gEWF><4e11BzsF2&KwNo|TvF5o|mIdK_hCRqyPVqi=)+tUXp z7$o?1lbNm`Kfu%_RCMCiS5a_SeUY#(xas3ABSKnHHc7f4jy@GmO*cy`H6uge{v5lP zj)N@q!O&pb%HAc#OX{%hoe*+WLcc_x0CB8FH*P}MVw^Zy9(PVAW@dpJT`61MO7J1n 
zmAxN{`?b}5HqpkQ#_pi=*aZS!h&107a>5{BQ^Qf)_pSgu#2*Lr)IK=q4-Ks&%sTHX zLK?r`7m1+|tAMJZHj_w8X~_4Ggvzq~=~S2!rn;tnhFRlk63#Z@326)$uNHMjjUW@1 zK^C|D{Ws=CowcyIgEQ5R`d5sIAWbW*nlf^;7zQl?rPBIgrsm&c4GaxiI9h+zXz_=E zqOMbW;Z`AG3HA+c#7OKdh}2?K(J>)? z>JRkX0skO9eauS#Wf_?m!!;E&6f(K~y?)^Gh|6l6BxA4+(x2TiUM)Xa_5j4*9^wb3 z5GYEDH|SFBPcS0hC8L%Cv$k=N{5H31h6v>)d4wJbf*XdR^pSfsUi3XO%@PWldT6z)37OEM7- zqcZFyGSD#gy3N?beIDNZz|@FR@VDA+QD#YVTcMNtzqD>|#fR_-GnB$3uYAbQbr!Ln zp>M5b2OB22utaK@!zE7Q%5J`Cp?IP{%;qe-j&{eg*j&C`S4ESt4ip2j5E%o<#YYk- zU*5opm{LEBIp?URmnU^n-9(Zw4FDM<%I(^JXHw)Wy8j>w4`Ai zFkNC0q7$i&o*nm}fruu|0ij4J#J&x1TkFSVAc57Cypfn_#j9bkfUbHMdp-trx+Eep zGBRu)hlIRL$?rcO4Dc{KExgp)&5+z+@-o7mhVm!-$rkm3bU)5>^Vg3b*D-CvGjQI> z=qg4V878k_sx`6-!!+L3Vhs%qG2e1D;jzK-WBFa4y6n%zM`RbM+EDayHwK(i%PALH z^TdM%4E#XEaUDFkiEzjhdQBi`iq4yuUdDw$?XPq8DF?D*rWD5OWfX1GQy=LUe|TM^ ztYe|TBU9&BgAOCAoAnX${Wh@BghY(6>s3I#@vfli(YT;N!reril&5gUNb{btET-p&>dK!63`z6U*LlowRGT&Xh=7Epl(qejG0<= zAe5LrmwC#6qP3Ai+dnRrFY!^xXbD5Tsy38?^?s%wg_|0(?RKI2TEdg}vy1lG2&-EX z2u-CA!r_>1{c5KXh)Xp#f-R#}(%qw}W+@Z#qjeHa|AZ6|lpJqkJ$Y~S^}TnV=PfhY ztV&aWi#^J8Ez}o6j!0x^gyN!y<>UDfpYDQs;7MNu1pZJ}MNc7@rC4bv&FS8l8(_zV z|M?QK{68VJw-y&iRagk{!B2-PXXrfU$2X#^U5Bh2)Oh&+nAGmf_gwo7aeBd1Ux@1s zpl060%uGu+xhw@N7V$&D8iq!U$K(VK1>+JLuyoL(sVo0aNbSkH+Ts0=Nxl;H>Mz*` z3fvR~rFCVJC8ecIU0iD1(W=W>d}O#bPkg`9y1VA!`O3#jlpgbbg||Mj>cO%B*dnEs zDsj0A_)UUTkZsY!Tty2Pcu*3POFVVIdU|5+xoLi>jF1!vjN6Tyd;oH7FcQOcf?VQ& znfJtdcXQz@U@FBxL0v;W{v-`9{D0mNQCu#4`zB1d4_@t%bHNQ7n4G=~=mRcs6>9qh z=-tFZ<1OM@AM|tc0rV5A7nJn!-E-fIILYq|&dtqT9dPQzi8ad8|9zh1VQ|M&E}`J& z6D77^j9A?6(ac=*^QTsPrLIk+vCDa~CSAy-kjFA@8k$=;0?@_NM< zQ!SAZZW>IS*H+@bgKZ`pCw8!)%LzR9F=!7j?};fVfFWkMWT{!%CRe|rlV_Rj$7mmr zSP?+H+VD{_mB zbQu4gDoh3-iPrYz%xnvihXp|zVjB(5D)nIQFn-iSOvo%z(oH=^NjM!uOQjB`~*!1`GX!d6G)gTaVVL;?8e1K$TmsEYOn0oX8q`)T?nyMk` zaxYc-r;?p8_SV(fC7(l`?s7u7955M$bO!ScJ zr_j<0o~zi@&YYZ6?fSd~Yrt~|cMd)u>fj&H^gJ79W@^duUu}Vmbrx_H4Ns-~Gz^`! 
z6Ny@B(L?RdxKv6hTRs#&`ad)s>;*ye@?|`4I4T??=cd8$Z#BB~JyM@Wz={XZqL7Yh z0;W=g4qJCda`I|F1Bl&m2Q)I#AYg213)oK0>`SKEl~ETX#2el4&0~G_r+HGQo%3;$ zDaD0Lw6-QjuGlQRf=~^Oh>tz&xW`d1$!-!`25FVDuH5!^@mb5(ye$+WrFyp(m)VT| zJc)5G#w^S#cEV>Q1$LYG3mE4?@asn3%|ZrT{!uiw`fbxa0sN|8dX2`ZiaKghs-~fl z6u3UrmcfBn3|GNtNG_Km_;`7jBSoyIu|v;X`(}kDTAU^8&Tp2$1vuZnYOZOSQGSdu z$f9AOeH#qg<6anAqWD2$fhldx+O=L-TxtFd#-RHNV_yB!RDRT#jY4jALx08dm~XLM zBRC~6JbtB>%2-##F6Tq6n>TBi$6Yxlf8xXu%zu~)c>MtgG4kb~J`s;ChIj#?GsKSP z-ED4`Ztj+C{jr#@9u^VcdW_f9jhn@%s3B=GK&?3L%<~)fo<*Jli{aU!_z=nnZ(t_h zC%dot9v-AGg4*?E=J2#dklYUEX;xoQ1g);4E5TEd*-)@teCd)d6N(KkpV93hV`v;o z!qow<4O;u=$EV-!jQA9p)GfCYP98D@N14OSD^jP*rp57W`|!6Ut>B_lGCK$Z^1Vi0 zrnd3l-#1fm?GiJ)xbr$J&)QV#9gHJS)WiMEo3eAdJ}k9cVIfaA>G@bR;Jaq~?wEsZU5eQ`GWk8L=Dqmc14nfNEnsEO8NT z6RZ;RV>LqJk|Sy(xN-Ga-6q)giA3(%v2_ktmdQV^2Nr~YK#Na&50f^OO@j_5#wa%d zfzw#6^?C2XQ`~`*wGDJs@oCBV>mVQ6(R&ds0bQrkp3!blEl|NWvP6+<5ddG!<{y8c zagiFXQmxAi?#2B~xI$?)90EaX`<(h{T}G}ZBTu{oUk}K|cvToW8=*kRgeYb!$)sC9 zUINO~+L@lw%?GS&vh4nBKK}SH^ilqM6t1(WcqkYprty7a-N3*A8B1cA#5E?igDv8g z-`}hUEGEtqGFXUfQsNZ*)VqNDaNC|nV@s;30-3dcEeXFOnG>XVq7w*V=*POLOnK^S zWpXI#p5({Ll(IFGKz_-v2wZmjYKL3?3QVltP>Qa@L|%MuioL11Idy42=}%3yn6HDf*x zZrvHc+|Noq**X`HyO(ht)^Le_U?W$3Ub4Tfz7>rhcx&EI#$gW=LMJw?+ia z-F#j)^XTDRzW*9rho~_2rHi&3a~IyjKm0Ldrm*xQ#SmO3sSBX}U{|>bSBVj%K?Xk5@DvMBWOYEg^Y z7_O!FaI)8@D(L^0rDbXw2|_kjqV21UJF}o#)fp|)suQvjZk%B(Cf}CH@zOEDtha9u zkvV-#^TN9K4BU|P-eOcx+%0t#qlRGf2nE{-<_=ZX6Kb&1$a)I?_NBe}NRo(YRmhy1 zmA?N-XDAFc0|KoYjTS08IMB$WUif5yK=l1E$XUanOv}^ab=o~IH6bcN(t0{u-$T4N zEJ21(X|9XL>MKvVCGG9CH&w$&(K#E!-hw-a*ONFW>{9!DCSIm!7+P=Kk5--I$B&bl zKtk|=cVw2Fdn)XenhtGZFK(LUdY6gKglkcSm0#_c1aDbW(;LOAD9{b)<&DhDv|x29|`k}bG8Bo8u-$;;eMm3pLgd(E~NWR#Lde3N|gSKWiel54WbcZ zswX(Ld$}4<6i=Q^!JRrFJ?MZbi!d3#mk0|GoxgRf6KeyZdN3VF20URaqNU*f;WAnL zU-@zk@wzFQDtAZXc@6FE0_TTC=wM^kf<07P!S0t zQ2|LxMkHfETNKoiARs}>IZF~yY(SD^kRYf?PD+&g=0?T-ec!wH{c-PjZ@h6Ct=%nB zr|O)&*IsL`Ip>O`)#)+5sjY=8$Ldt`&NXPQFS&PgK04tb&nS*maNM?iBnAsj%^70y z1)OVjpMj5;_p@gYNR6=L_=8ozZ~hLmq|)22fQ;q*Jo 
zU-a~Am_17yANMwdJeX{{08?g}j4S|(pgPMSqrU_asH71(7f;G~!FRi^vhEY-eRN}% z7BD0@`t@WJcANOxwQJF{rtgImEob^Ml8ajXh9Chj(LGn4d^{y)bNxDdwC(6lQ)A;* ze4CR0>bZ)f1OzqhSr;$*QK+--k(V58ZB~w0Gj@B2MXS3MiVHX|imkdC1Pp?Q7`Pqq zQDUkq#Zz1cD7~_16K9b!%ZDe6x+r0jFCU1Ei4CwVzL2E5ZA%f|q#LYUa~3~D+jsr+ z=W`zM6n5LClG7@4H_3TxgG~>^{qX4}}iZ}pQ&54hydd>mpb)9P~zxK(4V~(omX%HSe4zLC6Je9l8tm^M>nQ6bKE6e;3c4R`tX6PaNPSY zCFgC_j`g|ZhA29N`1B`Q_7lzA>{sU#K#|iwT2@kCgUMJ9QHPUgT;YBA`iV{S@Dolv z>UEr2`QV-x+NsGC0|_WuDD3Z};QRdWTb*x2=CBr^4h!QUg};I=eV;ue#*fi? zrI6U7-3V$j0`6^8A?J0$0^CAuXYo>>knj;NEJUL4YGxB;O6QYwSL5VaKu1RxT_d{= z1@d}Vo1u>9mFKYso(fq+#GO$GxMl)Eh3_R6#0G{0_6&>ekjn9hn<-+NXQq|;yl?8F z-Ir?7z0UEAP*Z~V9!LA^vz559vvBIFwQB?8?lyIQZU6Eo%L(H%@2*)6C~ZE45PykW zTS392a2m=Az!7h8KP~(k-~oh$l#W9D+i@OMTNNkJnvWRaM$>B#R=Iy(=$u%n)p|5o zcK3hdOC15fNZ#wsm)~Tmpd@o$D_zj>_k@a#)>Y7r*^QtjF^{2NeZ<~EtZ*FbO-#dC9N{>R^FrWnxcYAruQkT5hqaivnKk4U6k$=&iY-TI|^g0LNVNC z0uOxZ@8=acp!0r9u-$_B^KXyRjnxKcoFv>^z~IP4<7sj|Fv^n!$rboZ4ZpO zSc>+_8QJBjA(8wH#RxP6xEvGGT;j6jKJpRNE}0#T)z$JJ2@Z!<6g z#$=uPqJm#QfJhjLl}lr~#Zq*BC^D0u&GFs+eke_@5C;yBpiTf|fE?x_!Q7Ep6C(iN zir7@kDY@>i=XhPSO30^NElVhbCq{zE2>8DzD?#i*V0meCCDZ6 zU=(-T9<6-Ak{*riuBunwvbxj^RqKzV@E@K zY6veHARxv0NhI>@*aMF^__6wcBSu{D>R=$jXvek z(b2JHq6ts#^u+~DD1Vav5yXPK1$Cg~7x5glMd>@E$s&a#6j2&dUgKfOhxG6$B`kz> zEc7g-2}K4gI3+xvRR?aXXbhHGk6{!^Zi+-xJ^K~J0+Iz$GF*QD>>e%#g2Dp4BSrS| z5h0}z-x#7s19_d;6`<10#VwJy?^9?Z(HH0Yd`Fq_Q(~J>FaZnfY0hRr$LBblN=TP$ z-}Tz?bGtxvV86mBr$E?F$@=bv*T$lbcwyIWR?#328A3=w^NvT~o<>k^L#ZPpE3cYe zaX#IAH`FH&z=ZDsjqvEVF&v8|NW34E83S8<9Vzz8%4CRUQ$Qf)=+;gEE^vQYB(r51 zo|RR2+f%UQh{p{jmwJ5QhA0JWL}8r&cVXnR3?=GOplgH;5#Y`TY$Knr9I#|P^|=Q? 
zeU3-l?qar#Har;PAux5Cq7;#eB_|%KERkbXTXm_C&Mh+71r%a}vh8A#%-*=WUyP&( zy9C{>w>W5tUKy3*0+7~jLtO=o2u9A^wYp25?meP+pb7Oa$%RR^iITAcggtVfiRA=3 zcZAq_yTcSJ(r-ZNB2~_zTqnHT&(Oadf$0)4cET+TL>i5rNFF+iq=!k#-Dux6edG6Y zm6PAlP^UPuhU1?GQ{UJ`=y4HEzd_)miAD4S{nl?-GsNrxBTkU*93fQ5UVbZn22wa3^hx%U-dE0&sqSKDqQ!t6m-Kw*!5p!@ex*&|#X$}5m;6T%*V9EGbcE%3Ztf`0D(!V-tU z!9gfcIj?9$O2a@l9?q3(HeQc~TOtXt1PY5#IR~h8)s`(pU6riq?%s2(RDJCl5mySm zeE)=Q*#mCJJ9qq@iC--UdaKv0UF-B<(N+F=L2LdxYIJtbm#Z0S7_Xv=#5v5* z&+jQrk^%V^nnx2`ILSG7c|gNmWqg7=>V}K{lUQ8w_3Rf>aR9|Gm(sI%RGM@9G5I1M7G(ZXKbKwttV1 z)F}Ozg9AZpn&u;;6^0DV7=C8Y#2vB{JdbdYD`Feng7qrK$D|ahxvIs#6ZoBb;@^+g zfi!j-=Wn6fV!cmYUE_HQ+qM3i5y&4kIJwgQj~q|_I9AMl=KsZ+Q02-oSlE!K;2+6` zK(v;xf%N69y5%fmJ48OO#L#WFVCLPr_0R_LTl@Eu**q$#6u>Y*vwuF4!8}6Uf!PF_ z8n+77B;9}6RFb#%_y#-;SBd?o9agBGPuCSw0zeT$8KNac7Ei=8rKCGaG z@`+?RIA)*Y*DPPKI+6~uWuomtKgY;sO%s?`8gOl{J>>op9PD|hv@iA% zCF6(2*4;S#5KYi9KccIvtId02H&pgc%~QyGJ+EI4H^qF9|V!jLmrRT&3lp3VbexTCMf^!noJ}S>j+d-8UWEZP2eC+o39Wg3e*LB9|To$ z2$F<24s|2(0c?*S?QOUVuzkVn%|4l1(5{B1-O59`YMN-16BBdD{)lE*5T*!DC0BL- zFpXb0Z{wR!J3XKFs!)wO#ve~<>Qpj(i)?{NmpK~nKtSnONg)E*S5|{^BfVAOPb79q z%U?oaM|x+Ys4R%-I3|3MAx)&l0j&s#up=s|*KnCa=kNqN-=Isuu2UuB+Hy)9PI83( zm{+_<43H#lfL}BeEh_>9I}Vt&N&glkBWPPcf}z%{@Gtq!8W$K@G?QerrEWQ2=&J$-s;`{S)x@|qfF&~-qNrf<(QL=S&< zK8`);l{h&U1$^zEvWS>phx*_x@I=O%As8=#_}Bt^Ae<`*Z550~l8qg~(h=%8)Iys0j(cVh$GzWibNluPL7GUwH|`}p?)?A9vCP&ZFAw%x zKq2-S7?eP9Oi8iy%G)}OB!m!8MC>Ow5W&C#8;-bjb8bhg|M$3BXs@{5@dDijBptVq zDgtKv+}bMt*VQj0TqfJ8Oyl>&Is>6+iAOX(JW^(d==mN@LS($;~4cYlxYcF8DPpEAu5Un>WD&ttr8NAk7$mlYKaWP<7gT{0l zNCt-+H;&JvD19h|4+C6YUWoXP-A=d#?k?C=|HT5s{gF~IFX+9=-v$plOuwoSL;WGa zg4aTM$k{szFng&T9tmSTEP8IgM0fG?ckl&q6zscNFL#{_^5lzRb*IG4PI)22&;;v@ z7+s@NPh6L-U%$RpQ1E>J#F_*8RZGx?%(0dSD6@LLHpq6?s<*O;j+_8yfVuyZj077q zS#88`^=O9}@A1V87cS{d*xCh#AjvGrR3rfO*V}z9iJi3XO^??z6T({?)K@xIG|S6t z>r_@$kf1?~G+{4E#3G8Z%)r&@h@%P2?ISIalb#i<8-;Lo3^C-m997hC0z1BW=|0hOpi;S!3j@Bm-AV}8p*_$+Dfw@~Vsb>}I%gMP_ zh{cM>g!`i<0Fv}@;vw!u3l|2UJ$*jgUX=U{%3h43TMohk#=Xo3|NF)0LG1}cwKUU3 
zpu|Z|K)8IEYu(#-@4_TKLy}Tb*1NVAF?i;|xZ?IG7ihe8T#?cQtU|T119TizEnt=8 zBc~%%Sv25@O{__f6?53Cf~9tX;Ws37|#Lk*p#miiVwIu zY!9T^OdkFjgju2VlLlmrLK-;pduuBzf-Hd1MdCcP)AAA%^4{Q3cE)yNWSx0TZcbc^OKv9X-zh)!L-6$-$pA(12e2>fs{AbeOByP?NT z`TgK&V!M#0BlPBkF3L8}zO*NEI`Oe&u*AA^|MwOf40Ob-B{V6P4+uNB#4Abr21dd! z(6&Z)1eV$xe4jL7yO%cu0GIKQf)aeQad?M0N$ zn|=6P$Xt!jsrY@`m%I?9$(ukIqr;BJD56DJ6ujl_7#{RG>};U% zPkJp;F>Xq%lmjY5&!U!*n(AiNtE~;G2AF=I@LqF(9IRquistj#g$B<%F+H&O?tSvL z-Cd5M`nFc82V#tK_beGicTV8Jwy>AaUdu$G*<%|BOW(Py3kWULrO`;ae_k;i!XG?NmuC->1SE?J*zj-2I3)TGs1w zRA2KbFdjdjYJ9Pd_YhsZM@u7$Z9-n75Oq*FkNWm9$Q^s1?cBLjq79$@Fg}=`he{_b z&u;tIB7<|ge5>>R|IGa(mVZn`{g~&)eg8xAkBaWb1V>dgzE^A!3@F=K^b(B3T=<+0 z3&m!Eg-=Yk5Rssua0gVndsq<16oFGIuL}zPI)2=Jy+R!rdJ-~cLV#frfzdn)kh5TW zBko8jv7W#BhDR#3aMe5If7zS0jPl9V2D+VlU14MH78n^A~Y z803v-8&Z(i<=pKC&V{(BSV!FNaU7@qnvh9mYLH?hK=2!}aRo!Xvr}I(6#7`ATB2y6 zbmtfyf3j!_1rc!T^DjiP3j`0qw^T3|%>d&`v_xd1(-KO2g^8*N1243EMc*Z{%fRW22x-p#NaTOefF_W(z_d`Aj$Xs4->q+!e+#jU2S8nKUDhLj)| zpqu>5GrmFu_gP3GGo#+U;5>trcdaXuEHn2hY)eMT&xQV>4|;?sOKvEQJN(-*)SQe~ zqa1=mi7^m+mnrQxuRx)KwNrL+a4*>paMY60>?|BOVDjZCF+u~v3SJn6Oin4AhC4X8 zu=6QEZJdDJDdzD--X_XkVA2#K(uf|73M1QS`ZuAtAb8ejnc&FIVgcj##K~YiF{=tcV+wAZWjwouRt$@1@RkObZ zm+CY?ymPZo$pmpEO?>T5;%L|k#uUHygMV24HKk8u;4r_#$T&QLA{A@7&6q319`MygfUbC*Yf;N+ec68m*T?QW(ir1pp(JL(QeXeed~XWGWFk77H*X#h zfWluG3}DQI`N9{i?2etgormYNwhbgBWfAh6pQka^;&yRjKtRBn8=0VWZ?id`WgaR5 zj>+dV(#D6EMODmR&~5}#Wh*qKL=ZtT*jq~|BS z+itBZyAx5?Lf`n;s4{YMA~pogAL;m!3m3d4E!@wCk`VB~n*;6CRP9>a6KLC38LR@GtQ1(&uuBDR&e{pVgvn) z23qZkZrT2&akUvelPFkmg~!{*O*Xp~@(f>K{Ey`m4}%P7@yK)}Epg&nMY08SW8dJw zB~N787Qr}KDHS~1jlf@TpmG6FPV7G@uYvEcVqsZH*&t>si09<)*9diW_ z$VDYy#~`hdkRj<8Y?8s6VFM!n>FZZGiABmsE{0_|Ux`r%v=ro)U=dw}01dSzSC`7PgvPXm+$j(#kR<8 zy}zi42uj(vl4?r1WbE=!-bWIBPl3*l@S>B8xAw?~xd5>Ch&Th(A(=UYi%4QIRhP^a ztMQ=GwrhRSyWlf9IC-F-8X?)*_WhTS2~f8;-2j9gzldgO-FW1@2Sw7wGvPQQT7Z5r z9vJNg$V;XwP+pTkv;Zt{7|e1%NtFh?#8yDPtbmRD7?^~m;fKPJ_QV<`sI4q>JtK*vy>+MERJ`_x9Uwya6sXLJ{K5e8Q>m7 zFpVNl$T0!yzj`Foq~(tBBv=aJy%PuiV@+&Vry0R>Rl@7t02ypd8byb`)0Kg55c|kV 
z-V5^|jkln0kFC2jn2NGC_P~>J^b`R6Q5@0CbU$2(HUjaLIAmf~2z^wC!C32Q4tM=m z1%;32=Xwbxtx$zL%v zem@?mwV?(hMk!)f{NZ&5TjpqU5me+T{qYaNt@|)dG8i<%oM#D{oe}h76(LYlP$<~_ zYg+f?^RZ7Z)1CqnaQpRXC1RzqVGt16Vy|RkaztuN^f^dx4Sxeh)Qk}hGt)WW!Q|RM z;=ZVwc)OT4m!leKfzpIySO_9DXXJmFTny#{na=|zaU=KnxPU6nB7#H+y57HkuAvrO zEX*+g^MC&obcF65=TK$goTuMtrD!68%UyF{AZourScSCr(3}Aqhl`|Bpnnh_cg)c_3=1+s>VtnS87kFoRZ?6`Cd?9*twc=|YN=u4 zW5GHwQUx!B%wPkF!{d=%8%|1m&p^b1=vJZS0fy-cv@*%4nK|)t$<1HNVZ?2Ewl3mC zMRee=+Hjh$Q96c-JLfx6CB)O`Wm&)$olI}tc@5Urhh3-t2ZkzkmH$IS6(iNS;z!Fi z@+|~1LU2)t0dadH5{wXa2);t9NqBp8R0J=@QoDsiVJqedqx%G9@o{KGNhFB>C!`pS zP%=8iyrXR1e=lkgoo9UtpC7%f1RQ9|ymYv25Nj=nA>vm>nVXw4NN$Azf+o<0>yYvg z)i7H7geHK~?GEJG;CTOyaQy}8v&0Nos$sq}h`KqmN*`<2cj^O~B4jsU&t&GAN~Sd=+P^lxxO@Lt>&_mlP#GSqELrPu#IHp6E<-G~Sx&pw#IC`6!v(TPy*Ti*LL z2z^>v5Ou9@`}2XbnDR=V0suFtl@9+j0+AoV%UK^f!PoOgFS!L6U9GjW_MH_>^zc9+cLC@CaS9D9%R*-4#Ptaj3sRHX8&SlZ`o`cT0$wKu*o16Z-_ki z98K>+<3ciX16Lp~20{Y^p3_|j+6aV+)*zYcNRk=QK(joj^Pa3M3+Id-v)+0p>w%7v z1(q68k=KxKl!KC;e0W=gHz8V4h*zwxZdiR5h(uo)(PbBGhXM{{o_7ief0)%LD z^pB%`y#wy4I1ui_l$w~;qdNhV;xl}8h&RkdtsvzO`Jgy}Xp;+CB_RSK0O&Wy8UfbL zv9U(*ASWlBKFEv@V@u0?z#z^j!(ITj)6WOygsrkzMx{c|W8+EZ*|xJ3<8jtC^BfP? 
z*oBx52+n7Rp{l&w%KvT%!@##rZRMpfR_^&njmk%hn_qUbZMS}#j3@z|AP?dSGE=;7 z5D1|LDSsS6f_Mvwc4K>#G!}i%v<-4kh?Qm7Vo#p+n(iH(sf2^fOv2V*kMxD}_m_*( zntr}&fusyGc;UFCVU4-3Zp>2Wawje*BnPO;=Qya&(fbf~^!9*h13;DHm?LHulmT+t zC)C{(N1PFg1NspM{$&A+Seo<=cM*8xtd})*HK)x%qh(3~<^Ym9kQB7?ggEnu_bmGN z1vcKqog_n(6Fk4?0rCjzVcW3m@*fXhyoc2rDgXHg@_PZ6qb(TSqbni>2nLDaBS5)Uem*i za_=u%we;Gd_#1yS?3P@;YRR!q)u2)DL(8@~J(4)-V>RA!Uzn-nQfp)ZLV$qmy zN8{!}t^NfR3WZ_bLL2<+=hvathvMjF|I;z!GM^WH`a=9qBqx{n;HsW^vw!}{weCaX zB;@$k*7FqSQz#mCDfVq?!~8Obw|yWlN1?oCGBY=KV}+#$`~LmOj*3(ge`B)w{aI2q7vDMz9E={m^BBQ|$ifIOYrGD| z&J=3wVKBdkAGgfa;@d4M`q6eWgq8exetzg{hrGGZp=yH90h1jb)pV`t+}hr33O*qF z3RFE`TUyS2${1$7DM~&th0?eH?DR1VD?E5mi2&cHPkWT>a&;u&TYX5|UA8bWp=R58 z;za)Z=5WiUZxd6(_em|bi|p|3)j z^*$crU2ZKs&6qi;Qkhm(PHWkOLTe9McjqSk+83^VeKi-{ZHrp1 zemy@5g?<6<=OUi%0};#{f4GjYhyfx34`m%qBD>EoQjDQfhzS z;8^ebmdYk2*eN-*zgN`!!hOa5M{hJz?+Py>ivnK(8rZ0DeC{B=^QO&jZMriR{RgiIf|n7{99fh#TOs?9^a(i`Wz@;0~}U(Pb>%Nj1#aL0NkdHP^qp_y@9%AwX@ zuSEIEOYV8wKVDKoRy@+Qs!%B@AZuC&4PURlTGFd+V0pP~Wz(eRdnYvy*35&i9x`&w zeacU(d*-q4X&7bVPf^|<7SFbvZ2Hrp*y-#D_n|8dKB%6QDxE!?(6!e&L`h0-Sb-Y+8M8)$7@4$ zmJG9Ye)>MRX>xquu?OLti0Zfq)dR4%H< zNH}2Q(%Ri(+o*}2lh6MN+C1LcJ)`|}v;R{q!wf;;*6Y3v6Ki936_l5&(agE0(;plU zWYw!?ap*d3>KrkqSQcs|5_;g&0_{B;lB`BoXO2&Ek&jeV+g~#y<{entY~Li?jDz9l z*TAAT%#Jo|UK)5vdUAMmPQAanN%u#hYE?|La=LM+^Saok=fQ7;s}jx32A^&&nCbaq zX8dlJcY8_tNsxopbKb|sL*3V6qJawwF;*;+|wUxhK(J|+a#!5%|WZm;p z$@6X5s0|~`{474ZsWCk7+pb-zaFI4Mvb1q1STj?cVYi&x^&vv-rGa^Rv{C=_$~3tZ zzh30fXB>U6y9^iE6zmRn&e~*T-yHhtZkpY05uyD3hQl`L8QOv#mF4APZ&!#mZ*i?1 z;_lZrJaJsTNOSo22V+=)pc3_7V~^IDeMy8^+sM5{&jeq8_=oau`|Q=$C>t5kFRp3S zu==TAKSFHGCe=%dZPhA{`GgQ2+H z(+;Ogq^F|2s}7`;I;p+C=9@UfvO-eU$&|Uy_2&VPD6?n#hXoq)!Xr$z<>#K$4DXm8 zI50C=dfFvIETT{SFfFMgh>dn!v2ort;-|D~Bxq3Ywslhz2ITATQ-Pg+uC<)oiA{m?QDnHFE^ zTl2uFoAu+emQA(M9(vLr>!-VZ$cR!a+*lNz-ukD3N!;wL|N9^E6(#LMqTf#r_82Xj zy9w_!_`IYSsV=0CRW3c=qQ3U2YW>#WT~r!J@`fn$z_Vw}4MQ#u8-#sXpk{i_ROZ)) z>0DaFA@;fL1GBHUenE;Rzo-!}H@E7_;c5S$!L{qV0*THls?U-w`uAo|v~EYd@x)qt 
z$vrKz{AR>urJC>j>#LLU4V>?1$5a^dY#Nj}BihJyl9r@hde-ho$r{y6kF8X1af>9s zt{S1i?JwWc&e+_y_4)nWA9494uj<)$Vxi5uuj+5qPJVD#30>AzsaCl&Ypc$#e6EQd z+he;P-{{w!DK}FZ@iaH7X)IMgJ9_$**{k;Sem z7oQGc+Rq6q0|V)?*;aJzw;r=>zpMckBsoE;OZN-_#?ma zL}Z40q%LzG7=1MtR*;nL_}`KZQGr@NexvcOQZZQ4eIV&FZ1>^3!wC&0j-qhD+WO@$ zoX9von4Stp7(scy31b!_f38yXfCmXWIY`*u&0O!6o1a{n{Q0}liC-(T0wRB3m0^cR@r9l~ zE2m4B>k_mNj(D=4-JU-^5BBxIo@uOLFD_>il7m$|C>0gxp(epr&=)OvF!XBx0l`Y! zr}qLV2GxK+*{G06SVO&Boob?qkpQ|-)Lnov1PT~TwGklx+URO#Ci*PuyW?wiSW!b1 zO0C~-Lvj+*V@n+EO$Qg{Ux~CJ~^1x7i1lhkkbd%@zXK`G%WP}wwTq@>SxMgPAg=B$Sq>r6s z-R7lZ4vc&XO@1t;heI%Yy1c>j(IX8*;e!WTtxv({!c169tV!SC_kl(UxW4tnlPWk! zY?&YmI{@$<;JJFR+QA6V8vR^M&wk3%s#Klr-#==Ua&5@U#-{zm!9&lDvwNg}FJ}gB z5lvpg><9(cBSnKSA_O3?P6w}2LQt9=5L}hoj435M=v@?)|MNLB!T#DM%?|XuWpz6Mt=ovX0 zhcD0=)~unms?0qk29||3mRm!8O|X&Lh%czAN;{&OW7^P~zm{tJu;5 zn32TNgRVb|dc4G;@}qHaaW8t+ZKGjL52<)^g~ti%YA`EG*>lxA=$X7A^#?W=#tMhu8p`6Qvk4h{hn0 zc{`a7n*pypH!dYvC@aEPccYt)8916A242!TA#f|1Ob7kkDtp`91sHaZ+Qp6Lu2jH<6fhcvPoK}j z4rmIZpignrD+XDzBv}B{sDSwwd>5O$kc|08#QpGq`|KwX;A*jvP`}4PveNqL)1y{vh{A#(lV+t3eEU|ml=l@n@>@aJFo5u-K2hf( zYq;&$Ks|h;(RfUZ?gc{XPYe&JwEOq(V{ls((0k5p+YU>4aFt`EMgX33-IzmcjL0f+$yOufa}L0k>+y=mkoCI;8FF~4BLff%_&*T8zN{w zSYxMrh4xoI1*csfu)M-wbIC77=6d6(@CeLgBhii^ldFZUe+vL6{Q@mcbg{z`-69mC zy)n!zyxq^{G+Bj%>9Fe=2X_ukZ5}dEiHylxvP1@`^L`jOrlqESwTi|8s1J!=J**Ci z!z+&dg63v*@+{C5{o2`SC6MB`s|WEvUURM>c$KJCW7GuW~H3J zZw|5-q5mVWn*V>mQvqRnH!e+|-<`qIxBBNJNSn*dpXAmO5BEvB~s;u?i<-fCLt$Trja{=FRyW`v<08BXlPi{^5|MFWs1KkpU zBgh9JWE!2>zSRagshbK!`r2CJ}h=(kjX!;pNXP8KJ|``9)3WRAj` ze3ExAO{bWSz7?HKzGaS{-~ZwM-+z}O?k0i6)mxW%0u`HA-c#wFR2mW)B)CcFW10B_ z_9Ph_$?n2K{bbPxk5=NrZ8p|fcA%oQSN4MJVZ}>K*WS}C@8691nb=Q?M=rEMo4OiI znq*9#G<#~v`MT(|!W6DRBG|S5rfBFYRl%mL#t644IOZt_UcqaAdLPV2?abZxt0Zu8 zK6WdULuU+#%rU&f>={iG->wD1bI9N^! z;5}QppovP%9nmh5K@)Tz*kZ|78)|5+zjFl&#mVE?%}ks!-R_ENtp@>c*1TwYE3wRi`AOxHU@Im2im`GUD;0sDMF8;Y3 z@w<%fM(H%)$6j^WID#V+3{zI|fUWA7K!!=4uz5465&-9PGyeCM5y)Mn@YatBgmPT=Oh z6Bc_od-1JwC?Kx%SuOgw>|;-F?|!X!tH^>uo_-ek8?JR-R9a?cCZwXzVWfHrw#Q{? 
zOd*pAgEhi^%wqlh#U91Arf2zvJ%^wRuYv$I6ohQ0O9flxqc6x43sy2YQ^((~rmObP z&(M1x^NSaNDXJ8gSw1-Z^gZ>zUygFfKH(h@t+}t7V{@h9e`{c9Jl5$^l10zWerC$@ z*wLe;NJHJB06pr`bgsFV2sor9`hkCw%Q%v9?R}fH4GhR^lL7!M2m6w1)hbbuknC`o z-5HVZx5*TE=t-3_{$xJ^G6~jkDOQ7*T{iL`N3Q`!0L6S&PlBLP*TbQwG{TLErs&h7 z!}kO{r=M&8=hbD{m8Le@VCco36tLR;Tc4tXOVYx&vQl_#f+p}dJ3E`TK#NZghdqWr zcqbSAM)m+yvqjR`IG`>oSz7lY3~NZq)oZWj9V`VlD)NIGP&<;)`zDJSwCUg=pU+L$ zP0^}k>&;zW^aJq}U?qnf3@nvgmD0__(?l^X3EjJ=>acli+_L1(ZI~~X2P#CZ`~8Cy z_GeMdJ;#n8M*}5HCCyZ61nu!=A(Jn+S;MsoXpmK&2-%DKurFoh7C{Ynp+BK9r?~5l z8g?41c7`&sbfB>!Js=yTsY$HzX@liA9L%}&@_)*EC!gwDnaVeskpCp~G$VGUeMC*e zl@C#scSFieiMhA*$^O!?1{_OGQ8c0%?k2tST*x%oh-R>@s8NbBv0*>|;^<#R>;Ibj z2?kSzl_SWfcXeJCB>dWCb|m6}-@!%&1Rx7yc7jR^ly+^`u9bHES)R`PxbLFYLwiy{ z(uhEC;b3Ds2I5CGM6awLZL`!s9RwtrprVP)0$P|-j57ND)wdp(M6*12HjRg9r8>*b z<{(DsqDX4Zz5Xe1=Ah3#l|jMWj}PfwPPK&{<8sb>@p|6eL=)dA<)9Bk=+lCXOr#VDaZr8j)*-e2tvVXhw6_6e%RVqLlXh^wIXt2 zkGbXsl80+t>w2b?QQ!^tW%PG}=Zl#hHwhEiqUXa|(8Jmu@rsG5k(3<;S8H!?^j&7_ zlbE3t5AKd(G-*5j{Bqo0p#P?+mq0J5OgJHVf<*lS)El5eZ-qp`?42i&V}y8JRQmMG zmdU}_9}r0}S>B?eqT+>rZV-L6?O!8?iV0fO;hws4I1^=$9=#1*Duf)!AU6~mYqg_l z+XrDLI00(lgIITkhwg)%JvT3}&2p~uLOFD!XjG&B8bT%(PAy2p<{(WI6bhDHXAv8; zQB1EGBegDksgwW-DWU=&h95;YdIS~Z(fgq+*)uRUdZHtc3*0|I+EN<(irK9f=c@2VR!rRDLN zinp=+zLON=Y|@Jy$jQq)zEX7P5wms2Hao~Sgg_>S9e%hoBj-AEy}7@AD2sg!&=!PK zaB$BqQvGwbKQf|Mja$EIQv?(is_6ckT3RX+^KvNHZ}R*p%-LlSGO+Z>@a&B)$I+dJ z-Qiij4a<~VZF-Yyhgv<=$rC!#y!MR9bF$b#La*WZgP^dxAyi*&dZnb9_W<~&45Cxw zvSC&XI5okvCsPkQES|F%9g85$3bK=d$Dd0xNFJ!kdJTehaW6azbPmF^>!`o&z`cV* zi76?HQ0#q$dkq!wwjMJK5Ns~@n*J+H6?DC6!h^FKlsIsp3)z8(CX9`qqy0~n#n!RF zv+ixqRtJ%$oY3bxFQ*D%#Ta=0`||kYs9kqZ2S!Id6BErmG*Kn$8=e6$15~Z87Fr?! 
zx|>L1g{+6d7$g4#3Hn_K(cqfkE@AtscuYa6l7duY8QdeXQLCB2}Oo)11f z_pk!?BwELLy1B7Z>-uW20)1GK2800;z#xIreQhI5ws^I83HP}lIQ04QutvW^034{3 zxhBB5(Kk#1%viJ+`C8jZbik^)znSWPUi1T$K_r#{tz&yq;1(_}DEOffWN-@)gGOe0 z5B9S;1IPY#@;$@JhqHp3zVSU^!ktH&k>y%I%s`bz!Hf+37FR7%Dog@(d-#_BtV2_ z0fZDGMnBh?5l`kx5HYx-Y4LegAuX6>|JPzSCshuXSLJ$j+vhl#zPLgjC&2r5)7%?; zJ;mO9Gc%*Lys91lDT0#-V~HE{O^Cw4yBEza{Nd@)#!t?3!=Z&t@N zFxGc}&g(uJo(M^?VX=(<{v%ePg9ea80*nl{wY8N)N@L(@T#APfgfz!Xz0`ZZ6c%-{ zg&NQ^f{UU;+P!_d1cCy=I*_~X`-OF+-62>9}C`v{}4jOBkxY%h$#E_++vD zNLa`k@8aZEO_y6K_Cv#!0BkPIq8AY03*uV#!rYnMH_!xwu@>|&qe{QwWiM0pT#DxS z;%#32ddMD2Nsg-vZQe1CstnUsXBH_kyZh(5?zi&24?0M)E9pm9o{|@y<55;_5GnE5bMa={REa>d){@~MAK~@ccs_#@U~-arAwk6eKyh1(fH+(; zxE}Vxr0tc}dNN!D*=U!@uLq;SI17GEam3uf24d&K`67?j6;OHH$C@m+T<|YD@f_S( z$qtqWDmu8YAAHERmObz|8v<;8`QUpngQ2C{n~nO#pWo%)!-ut`a;9RPZo*(&4+CI? zq#rRkUSoK8ZX+`AViEe;4p6BT09=P*QKmd$x{jD(gSKCgpML^_bJYCWqg69%8v9V} zxN1U;d&LlBFP*MtEORKI*{Fn0dT$L|f@=4Ofp+qrn#NfShsJ`%%VZEfwE4UUw0~j3 zbo2_)WN$IMbXGkSaHbX(C4-F(537i0f~zKuYXh;SLC_4_b_n!e{;$muk=ozQW{TsB zHweqI?N@vByOx0gCoga8O`aXbbZnqB0F$b6^#sR666iGSzx z*^hBa+A`0h9D0L8K@q3cvd{R06}ad=3|ALc!zn<5C9Tzqbm%aHG@P{)g`_}#NXA+i zD7L-Cys4NLjDIibMXxL}dhX*oz9>a@!Hp%i_|~?{74Z(@$~>I-&oeE33LyFUA?+Y2 zF!tFdG27JY82JxhN9;0(OK}T?#S7=rJ599mO5ezFE!{42{hQT7dOETDhd#e)2M);K zMzUL{6PBCbEAgX>hg3t_VQSbD@|1m=dCNd~{1`}wlu6I98N=afze8ft_7b76x@Iom zA1c=w)+p88cVS}9=vN3Kys_n2I{^-zMatA?9Sw@FPXp5LG*?XVdv6*OT>Gyd^)y23 zP=WZBqFIJ;qXik?Xp;?Z5s?J&r6B(ZM~|3F1GhO0fV~tA)o=*1bV(Ze0*_iOU@QPf z&7D+J6O)1=>1@C)UJYr{Y*b`PO7AhhmF3HqSc7U7o8gZD z{j1mmB{%yzynpV{*Hb-ND@P2)cbdb{)5<#$)N&H~D@u0o8!fUS&7r-}Z&spGNmM0s zRev{#I4cwBW-=8{qo-K%#t-;hp$E55=mZ`Hi_Ho@*=$d-q6FUU915!&E+t=VIBdsf z7w%!pui(ioyRsQ!+b=i8CZ=m`vUdJ_GsfZ>{kKG=NZ&9m6GEyj`HSi3 zd>U}*y{~G9S>Hh+Ar7=i5p&N1k9g+G-yip5bhb@u>=1VTjnq{i(Xhpb<-aO7oog*h z=VaS|YTHG{eGX&mqU(~*teD*ZR0m{@ci&(&$gU&)*tk7Y}a=Y@RsW{j$%K zb+~`b*sqIk6C4OfII$QbqjYww-b=q`DX`?{8*FU9LZ9w<^8n-wUeX~d*{H-c{P+QT z4NhP?CX7uvI#S5dLNkZC7D)h$44M!n_%9dl$?gN(7_yg5%A<;CPpHoK?~};`Pg^fw 
zN1$WPpWO!^SHFHHij1uYxFy>fbo6UNc{{Tgq(>;{hv=)McwaaW5geM4QXZkyvQN8F zgDQJP<%O7U7S~%_8yn$83Zq5eBNL1cUTwBip?y^^SLWVaU3l(lbFp%u>cp9d+K{|Ch}0htG@JwA`}OnZZBP!^)oEv1eM6*%M7=u+vk3>|y`1V` zOHarlRn9 z4DPMR%*VyqQ!^9S~yW>a^_7(tOo${}YLm;Kq6oKX{_{B4F~sJD|K<)l_1DJseM zKp%-jJsf>On5#S{Q~R>*?J~dRECrMzYi^zd;5869#7SBqBwetq5NSvT4igId0UUQ? z6-W=NAeCMY9UJzlf`r6P46}WKjE$O}_SOgur!muqI3o(c$vN#hB6(6;)>N{KsnkEi z{7Kr!Y#n@n6C~Q57H%DjXoC6nuYQ{@ocyfRMZfh6m&D@AEBOSZ2jvN zsV<^?%|IYSvEgUcwh?u3*`!dcn*EqIH@uZlaU;N-d%kjnRS!}hR7n>w1sTlfVVG}g zZ8`^Noa|dL*AK?*2NsB-Lk8GKlG4&uI8^yvb}PS(TE}%h08jKn7Y@+evOK1{42)}c zQNj88?Amrm{a$&$@Cr$qcXjOZ2zTap7Vis}+a!VPC7F@dQ7Pb;F?e&fP#oO}f*o%N zSU-)S1Fh`uNtYdszP9!X04!8wZ`Ghh%yJmo(I8gA8$vnJ_EFEt7u{g;4uE)tLAWt6 zJnW7?Avj8u62BdC3epZ_w*B@-r52wKb#4MIkYn&-g#bEXAi$xTp43$v8w(X$7obg~ zoma&gH&Oz^s3)Z>z|Y!m(j{Tx@Y!E{-^PP+-AvK^4r7;tLz_ju1UWoBV3pqY&7@h9 z2D)o@p_O}DVE3=8_gyivL=<=0Vulr9O#o!5!ZB>nXL;?~dkYt6ehiRqJKwukj^sr? z@$#w2Y9C)iDzhQ` zXhqCH*JjHf53-(3`2fEj>Yd;%%-gBH;@1@@&Z5XWl|cw%Ex^oC%q8ejjW(Gu>wwuE zmn=%NT={In=;{!?N34W<4R%=NvtS3^w=ZvX@lR@=^53UYviYms=tN`i=8|3=n>O1a zGo0XeT&E;EJCB1wD?hv-vhm}opMi$*?w>9P36@QpH~?>=yY0??_Df6+J2*mVz93dJkc+YQ*doD62Fs(m4DE6O8-3vLhN(8!Zajrx}Tl7z4EpkWYjmDi64B# zb3)CX6)a1$x(4N`{57w;Mz)9q+w6&bJdFLz{Dpo!wRHr1!>|yQ?nT3shmfazuOb<@BW5;g&+UESuBayWKcI zBu}1{DWF+SOWjp2?R*jhUAvSA9p$whsJkVNE0*7vyML$d;i>e;QNae5lC=38;sGJU z247RJnMy{mc7iV~x__Wj#lULx{;{@^>RW? zZ^8-p&bGZ@bEQ7?zQc)d%^4G+1Ve9*>o!^!cI-T$*6U|4b@$BecQrM~&sZ8n=hq7; zu=ywOc?Auw-;s22Vjj1024}4`p3BHP4>?NLyzB3U!w~zeS9>F@KHN zx5mC2C}b)T2e4y5HLOS#u$&F$Wf!mFFy0$ z|87H+K*ZVrF;(TRZp*s`H3!=BFMD1yJF8)losqqJ_WjA|ew|dNEQOV#e@~BL+g+eZ zl|YpSLIgSjHoPi22ZNnv)rALVB;YQ#I@P_-or^kc4bGhnGEccetqf;ykv z(V-tU1Psi1>y5Ez_3UnRe36Gr3YhFZFmQk;fI1d|gVg(=SqPMqX3qe^3E|K>N|JU2 zJSfltp+&5&Z=Fw(VJ%Uc%%D9vm)I+Syo|r|1e4h6?NluB5RM6J9bqwuK5DLh3P0@Drjzck1C6w;&w&Ds(HhlmV+| zSie3Lx2OvgAw?KufH)`a5Ip(iK5rND2+{-q)WSwZng%a9l1c!Yp6#zVKl}b@Q5R`a0PHM! 
zC_9hBp70_zue-tj=5X(sH+8Omg-trP*-txn)r^#yM3dv`pitNuPY(~SwW+*ptUEg| zU3arrN>=r^n~5Bodb}Xx@&?OqKNb!0WCe8Dr`xth82LYg4VouB}{G? zht2{O@uFFFbhX`AHs8Pg@(gPPk>0)rslpw3(dwKs=Sc_55`q*(CPmjFpOxL=NL>#>l51 z79JEn7-p+vSsmFL+sEV+rqh1e)Hn_w3Nnzc*d1sPK1?c!-VJ->zgvHM{Mh+HmdC~7;UKN4bVE;ttCQ!%ZABzX zza~VEOy>SA9b2mYplq_$XZWPgVs8O$HymJ11<|vgXrNHbZJB}t$AvfPEPe5-3P$;D zQ6~g|$=d-qv4SANm{$!zm2>ajv!rX8X4+bX0h|Q5ZE6z1U_QrJVP~sP?2j1NMXzG? zl|o5G7N6?*i#HtE%uC1s#YYd_6tRC-kkm@wC`-F!mwP% zg=1;%T+cf`T)$dpC)Yfgbo_C7ZPMjRG0~C9jQZ41Qz}Os$m4pgy63}5&lD*=>LS&9 z79S2uHHol@i8Fq9dywR2U^_OKKZA@j3}~N11W7mIp4orA5P7N+K(=L_ugLn4gHkhP zcdpV^>-@;~7Sl`h76|jYc=}SsaulBgLgW38Jl0yDgzJ&-T)KH#)JKnS$-MguCuLIbi2fXRiOr~RW{1BIAhRAKa7UerRNdkE!_M* z9=EG1NzH;SG)TW=v-)ms?vIV_U0<#k)<(-KwZ1CU$~YaWAaEvo&nB1p(V5cRS8Ltv zGOM$WsZuNI9u~okbR?@`Xg+va6ce-VWAp<)puibR0W;x!J^tbvk@Ca-0o#jpk8tnuS8<*G*@^hSNP7>k zDzhwW7|T-INR?tj0mDT=KvYm9WA=gyk~1bivJxd@!ZLwg36dp=d4hYsv&o?vQ|NC_J(`CWEZ#ZY4z1LoA?Ik>#V8R&HAHvKaH5$gH1?Mu`oCYd%a9b;&!}Qu-|qcEm3iQF zK)$wG=-o4qWg>f)Efd1=31ME8jBNS-sKIQ-tPX=bEo#{tjoy}yEB*lvy=_@$%_k(1 z%#WSWl#SUaCL{5oDDkey;HgHVxyBX)d)j1bDIv(fb(1t(!4`nL+?RAk~#UKCyY#*udyX$K#Lu)5U|DW)5et z7zS)ut^Xk_Fj>vVmn4M~E`H@btPVugLNj8{7U=cexIgM#ezFfqCm@dWt1=dO?w&Cn zs_5sSRyoPZzJJ`SZ7P*I=njqvFq%@Kn4yAK?}KADc6WM+m9O*lj?fBH8HiCjyQlc< z{ge$cMQP4mbz}NwtX*D|FlvJyw1x(vIP}Qqo@Gr@pLFK&y zJc|$n;JX)te`Mda>jVul4%R8SZ9p0#WAFYiI`IIVy*2C>UsWZt7-dUCl}x*_0bnQhIQW0OR_YhMV3i*hCT7u zW|XZ>_E9KeWaDv>sjb>)m#uO}GxlJeAhUjxl9>HC&Bte}NeU;{I9L^%TJ&R)Ifa74 z7mg+nNj*@3;^hc-!rxxLDbvmz9=25eql6##p`?UI43*&Xyi+1}GVlK?An#>UfmOTc zzu1Fs>PVG_Ke9=jDrpv6w>J8(mr6N*@5-yqKMY(fLWA4Eyuap{h!FqNTes|-_3g&n zwwZUA9wSgu?}T%=NsD~SA)zpBd6wax`qS_HtmIgIm=ZiEd=452z4UePEe#uV;Ox@# zx!An6^=l8se}H-ZTvs^rThupox)pb!g;g2O)#w$tt}@~5bFQX!)A9FevPk1Uymph5 zUM(iln0e2yyWuRWdc5e;@mx*&YSWlAMdiZ>_gvRB^jgw-s7&(=JkSc@5P{}Xn zc7N%4sprP|QZre>IzUsWwywR~A&}&A>^BFdvKTQegrq?tL#0OrQ6R%?C=w41B&^we z2EMv{P`Hbp-0;Va?IK+9jYl8pcL%&r?7yz@ZrS3BZ`pJYD(D#G|9jk0X_PmC>w2gk zlgRPq_o!{7G^5}v?Un03ja4r%8gI3X8oQPbjdGG66M2*OlYSdBeY0Qd!xiLRM1({{ 
z4gHGYa#ZMvWDs^%2F{Y%#P`V{lzlpALu6)o$?2gVJj@j(^t_I|jlMEHz^#&?>%V83 zDa3gyw=*wQH~e$&lU^15_Vgf>ayoUIwaIEpb`H#fQK;8mY{}=?-P#q%KU1ele;_>M z$KRBxZ-d}$r^rJZ7-4j@k?4e7eSLdH8u)RAj$b3*6py`>4!BtrqZxzo61#WrmLL=~ zZ~}-xUYd0_hTgw-*Lw-a<%vSBiFx#KV}e3gh^03B!9C~v&uYHNZ(gdCrjhu8{-w;C zFY0GC{C;E@Q9{8DJG;f+>)C~qhIj9F<<~_#d@&h&gp}JotXbV-nN0m%g0l}w?7{Cg zIG-}QQ=nLavWW&v$j@%1c55! zRWn?@+;C>_HAS~WmMKAa*XXT%QwJ<-PaM~LRIS77zHZj}Adk>J!3h90puU`uP#=LJ z#rN`io*uPbO+KE^C@vwv?P*ZP+O@?J{qv z!;7a9n5x6C^`15*U-!9*EB=x(dW-H#bL^4`0ElHd-`J<(_0oWeD-KTVTxL;ptsH*A~yOFkSiF#{sKzmy-L1F@(;QDo;OcQ52 z$~PAbP2JsD5vpHs+SI82)VZO`>r@&fx?8KWGG=71%Q?gU6DUd+Xri-=M7rV?0E?KW z2^YO3a)lPB*NYj8d`w9JlDNHTnXsFmQAOC`1;`8|h@SuZaw<{Do0z<+Zf5ej{7bp4 z^gwd?NmHROUk5(v6kIwo#B(hu8SWmc2x- z^=9hfEF=82<6Jmzz$cYZW++Ot&J8ajf2YNSbs!b}JN4 zTan~X{IJA-IE6NvEqAO$l#heNHVgs$lilZF6~0LGlY{z8et6Yg@ZfctJbQM{0DE&d z&eK!|X18X+jHuXvxy0 zdO-2VPzFhN2r2dfw8?>&9swc)_<>~K znCv_-$L)M$_vQ+pUgswV=5>E6O%<~&-ghZm&@@fH`IF~(nx(--qnXz~M!Bld@jU*( zzHWDC+nTH{`I$cN^kpPCr_luRdtx7RFzRcOL z;5#Gl#;Ed#cld8*#QTN7Dj?zXCD=0{Q#9^- zP<LPTyUgcBtV!BQJkXq4O zLI3N+NJmjUT-W!%vnJn6*Fw2I@_W>$*NWsjU`BBNulw?+4Q~4QZk5Z7OzWx~P#kqW zgSW>kaHvMdt7+TI`)y}7hv2lh()5LQg-d$)%V&o|Lbu2Rsa#U|O_qqr)%r(v%Z6W$ z9v|wb^RQ$0 z)>>w~?=^+RPJhbm1{eRX@GFh$)63M)Uu&IevqhvK<57+t-gFetu2qROvp3Wy1{$O| zxTj!+n6;g5TXws>5~gNFhTzRgx-M`M=omP#6X4>;K^=#;R3h4&X)2e%Uu?4gV1gkv3p&Ycbqq3a2nHX6YW}sk*&Nf7k!MPh+!E+6HYz0yolgjOT>njc0HYQu z79udjw0ktV0kVN@6$qB+lisb6sxWodQdBvB3N>)3ZSSXg(425 zl1V@0*A804pFj%&PnGZ$TcIV=oGsYNZ2+(+PlrP*8cV=X12GOPgM_~vUq@^7BM zjyK)Yu_7|)bt9n?ItnRl`)Nerei_>GZ${@zwUi#^Uv|xYY%Enyena{cG$VBo2@5wa2CP}V=#Tb{~2 zG2H%zS8$7o@;xfQ+=s;5dP{zU7rcAK*AnJz!qe+@iQ7$Ol>)Jq&^KzJNPBOvyQ(;f z^>p6wcpLlKr}=X8tjW)7dIYXZOlgdB6=b3&JZ|uy5>VW5{!oa@K#|gc@O6(b6b3Z6 zR|Zzkz~aVL*B+^#VT0*u)zPix?Ys_^lVLI8fZP;P&^d5?8<;n5*u`$WTj85boi=^= zVd?c_?xS*J@qH51GqR^@))#4J9w#?JI-PMdhnST4&#X@uDKu?fODhb}h;SurrWGp@aJo=k+TP?MG-UVf%2C0`L@xHIvJv@B2NR1~T2Ds05AG`}4+jSPF` z{QwAI!$c9jm4VLt=ALal5VCSo(UdmJ-Z0zbpmP*XNK7m;FMKe?azISexV!Pv=5e5# 
zVRyy>CI`w3ellk9=J!Y379o@?K zxr&5>Z|5r?7^a*C1!!;I?M0` z`K>XO(L|Y~{x{I8oP)9~A z!n+hjJ>pKvk3R7*+eJ~s2et}JMkr4)b~Ob(5=_>;sz~s{Jf01!M}oaC)cdCUZvl%P zX6y3cd$w(cR+IcWUh%3nOZV>|oysF1(9d?!D`!~1X|6JwmR#oS>p?Nslb14zW;NM5 z(;p;A@c6L@f1>M`z^q#?KKSAx%|^F+Hrd-Iro>m{eL!EqsT$V;jS!@c(&x`V2S)7{ zY7PTR$mDmx=NUSW{7N^L7jxf#Kjl9KYB*=AbZQ0-gh)~!AL<9_($$eh6JMi?&c z&)_HQZW&Mn<9kR_hUM_RI!0ALQJ0ZIP4#n)MA^ru)5-EFw)gq?su`tnw$+M*OAidD zBkq+urxSDz#6T}gnb?Pvw{Q6ffbQ`p>MkE3E^mb)K;mxJtKl% zljV6(=#kmlM!PvvIA-+2-~EXf-!W79T|WDK8;})nE&<6mry$7He$4ve)~!aC@|LL86o2j_PZ!nESfph28J`q|2=r+jCcE0VV}eL`kMQWX1rDoZe3!Zd90yIZ z-QCI(^1H1g5Jjz1Rl9{W*3sSPomZGMu+JgrTr%8@+9F2i)LVb=HD;yCPo^i@_Z@S~ z|MFLnkk(!$1CHmVwWi-($1{eM-R=H77k9sAL&WO3HC<0TB@uC= z93jiCP|nuGUHQ7Cz59fTgwyEJcRSWIZU_U@p&+OB9FJI2(Z%ZD9RrJZWH-XqMILtg zl3en^(Q~kfqs>N(G5|f-0P>`ah%@Y<^HhY|%{9((hC+U2*IF z`t*vV+FN>m$Q{pY6_H2kC?iwOu-4-Epoj5CuQL;7-}~9kd&JJ3tU4pRr2k=>!kHHi zpAa!tg!hG3yy8P^*CaPFeJo$i?pq}fEf~IVd@rSJM3jPP67E;a+&c6>*>t>u?b!Ck z_k`#!pMP>b297gbb;&W{$QZ&A{M}l~wk|T&3PYTTz&Yf_i?)ITa}n^oGWh(G)qQ+f zYa%uFl|SQ7b9q@EOYi38wF}U!ikM|!j!RqGe*pCd{5GhwuwS9)Un!F};;B56Xdovk zWoaD5qa9Tt*lIA#v_3i!^#1ht$E_<~T~U(7CgFWJ20Dz$!5<>54X4hb zP>AvsIgA6Y{v4A&|4hghewp}Qmxq}zGA$rX0}?kq8ezbG$+Yeo^uB3)ow!1 z2Nj5jssJX2!!d>k$trL@fxR^2SJeao3&6$~OIB`R$7~UDpn;ei0p-WXwaIf)fo|m% zF{>LVvSw~si+wu6%~<8?aNHPpbf=1KAgmojYbB>kQOi*pgm0RD65_f1%;6wK&3D;p zY2;S5H8ma0af>e-8;9;>OK4(hY3>auo3ZX|{D12q+{s!e{_#903~VQDP{?!CN(i;y z#KSWU@L;7Huj#+!rjLmS+D~SCSQW&z1EfKA2=r)XaQ!DHG)Puypy!Z`7N-pKiqi7o za_121an#T*_+r)>*zcVn8Vh2G*GOMO1QKI(BHjzXADUSvUfB>- zTz?FRX0q54(!5V#S|>fWsAy_(ePvXy|CgyK8)uIF?emp(i-pm&S1X9ax3vepwaSM}LH zR}j8m@@j9*-G5=0h$sT7z}FV#X!ffWy8u5 zBi*Vu;r%LSk8(_)p|J^K^nC!r$w)z0fiYj6>#a3_Le++Lq&MbfKjm6$% zl(&;Z#d?Ci^P1w|vJoSX*1j~MO7t5J*zW!8SH0-|1Kk9N^1r^9C-fk7{HSBRCSIb{ zkdM*)tHCo$KHx>aA0-sRMJj)NE4M`dTX*>@lXn_Zla-tN5|+^=$6ZHR1e=s$bgsGy zWffc&VpKqsgV3fcs;Q|tn>N(f6W%M@BNE_ID_~)o*gDRtdU2JX8~=gazWOj;{WYfG z3ZDc`Vd{G*Q2du(4omFD-<#@{^GO4O^0s987|~x+2G_z@wo^t^{Xqi6lr&0)9_AVf 
zcvky1twJsB2~~);eLv7j;@=BH1`I5oH6$At7@)zCceh+@@mSH%&3-kjyE-cI6W?kY zMO{jwD&O>iSHcjzVU2~fos&Z(*hCRlx4brF8GRkz(l!Fr#mdI*V-gw}9w#{lC)7V3 zlzEz3Dc}C&30mMM%G)B#US>D6$TpUL9i|gMhrEM_6K`nX`*)7NB!8yWZ|sWjU;*EH zY4S&_o|&={jdaOW4xF*$Q#JIrmMqA$IqD)@|0b~7+{7&Jn|0wz5#HO@$=6fUIv;(> z7;d`OM5qXow=DHqOkI>;ZJ=WhzdpeJ^Jc_j=xvUi>6TYSql>_;m^#Ouni=uH)Tx?D zwX(p%Ahd3Lgjvw=qafXLcFxC4KQmpFQtjVI9uan+`U;n_zQt^nffu+RMD=!D;la@a zBFZl6K+%`$XPO2ypiulSe(leV4(bupH4$hs>Nh>%U6HmmDDRoR#96BR=49H6@GqfA zg+7{Q)P=QV{r24fb%9r!S&%l**wK?`?{uito4fI-o*-%BUyIb`Lls(utIfwg$QMmo zWhg{81r2)aAl+Myz7V7o>&4TvTT}vKC(x#I1ujh{F0RFf2W@`Kq!&zO1`Rpc!%U%$oU*_Wx8V3iHGE9%%r~_rk{4AwranfeeCpL8$aNc6<}C73D=%`#Joyc zmw#i9+JkDGUW8a}U~oNE^}@VmH>h}MO_U`H6~!!$P<-{{fVG9*3(G7e3RNY^LNO`3 zvFx)$*Ol5lliVA>-N}Aebdw?8PTSoMB>%GVhPsXi%ACVxLc9!HJYGtL@|3q+yT$uZ zHOoAUBuiB5uTR>>$0GK%eU3WkODBSn8eQgA%5m53F$!Er6-*3gvzaJ-Qu<}x#-XO~ zg3M-kM!%(oe{nQTwUcP>qYzi{Qfg?aOD;&$8*2Xv~X|A_`SuW_JM<< ziOEQo>wm$~Q%!eHO`NLy9spl24zb8mc#eX9y5R8{blCrTa7s=$-3nTiVP9@RX3fmG z!RXp38yBsvnVuQ3ofZ~J;{(0{o)+cu1KgqfPNVJ=xb+6WVOYz`_s`}~PV7f>J%y@5 zmy_C?ryIV4wX5F{D{Ar6^8C$?)4PIJHF6Wzzx|#M_iy^4YQ-S(Kf}ZhLq5UYC$F9b zOw3h1_T8h2H(m5pG#RtMBey@I)6&X!}tuiqC>q+hJ2-T5V7N00o zi9Xz_DZ)&1d2cr*CSv8M_2=Pv2bBkVHyJPpedi9*e8kZq6ZtmjVVZ=CO$?-6eJa3i zz)^?=o_wfo-y%1bQkaV7?|=L>xLw2BtGB*d^qJe2;U6A;Fpu`a=CvL7?h={Y2|8lu zBdIoR#!EUn5&6bqF`=o6#Us|@*OGp66=~s6@02Ewmg>UQok~+74$h3AC=r)p!tVlu zK&NX)tI~ov3gvxUmHDjZI-0Q_CpRmmcvxh#+uYECUF>WSd2>-l%HQ>;!_9jWcQN-r z@_4cn*sfH*&-gUmehtJyMX-=TZENXic@Ao?e2jZms-;1opRado!8IE&1v)!Yb;rNv zHody|&Pe^W*>PRnP9y_i7DX%-1-F(|I@4 zc^?X)o13B@B=No!y&}2qA94B+bQ(Uou%phePln!#Dg$D$3AgeQR4`Q}i_FXaWAK^Y z^m^i#P535TnFQ`9>|L+fBt4eqU>}oTq05+(u zW7m68G<|00d#RH65>Sx+GXzdWdHt?^9nIBODZR3dyt%`6A8bz1faHRwrS-2u7vIdW zhbTSxGx#nzW&XIP^|f?Vj=z1@_#}U$ati+FQjfY7WM};7^{3J;hre_$HXD+A z`AKxxy~?28DtPrla_*Hw4cgc~L4ztYz@i5Nu-1_aO^ zX0Id_AaH*W{Crrmc!M~s1zkNczy!W_4*iIgsk87kc}I@6!~e$HccMdPFb8|^fwr%!JQ7rgESTj%%*&9v@`8(IuV2q8d) z2=@pKit;LS_HKeKmH&*NurhSTT#jAyOM`B}AKvdNOuzxqh)dE}7(S5YfTT0EHTFF= 
z0X(5^uo~9|uu&7k$MyQNy@kG&l|K@wq8X*vI_S4EuW&x@@-iOe5}W1FBo zN>{zI;Ct}#AiQHBhkZyq&|??}uey>|(Fjm7Om~|zCF0^na-bc_} zqRDpfQOJCs#qn#=&ttVXrAbWs)ExCap(*b&AL9Y-G8aMH9mi;dXf8i8T)bC%=y4w{ z7(8*p=_q#!Te)%P5D0;%XuBueWnIpm;Z@ItBl<1tWDg2Jb@?n~v1hu4t(=O(RXSo+ z8UV?m`Wcs#*>N?zBg9RVjN_hRar?D_qX zZFIy3Yu75^nL~XZ50K{I!=HyW-vu+i`_PrIFYQ$_z?HglGOYW#CGoEw>Ug)h6^91N zkjttc5KNH9_a!8h6bIH#n_BMSwZVUYN(1~`5qP|K<}Pm2fqnPx59z9eGrIXmM%@{V z*G0#<6i5c9QEexS2wqk0ySMz@M)#R+WuJ&!*CaDu8R^o&*=11hI%4ScC^C)D(Kzw` z>tbDNrl&+h0mU^EUU8^St5c&Uc!%oh>U6taXIn?wH5u$FHN_8#z`b#CZmSa#d4W`yi%+s8jDmJRDkk<$L?*-Z4^uztV$Th#3rxa8aQ0P zB*r*!Aft3bxJyK}cf#ji)K`w-A?oIxdp4SFR=B{`?mMyV`QsISB8MXm_EnCExpbdC z^?FC(#r5mF;ZMGdXsfIfCo3~9ts{iMHETv{qnp8LB1|*_GLF#1%zj+{3sT1j+uHql zb(4o5ZT<1At0LS-Uj(t*jQz(-y8}9j?!Wh!danE;k@{qZam;?bhTx=Wq303M6vehA z(~Z8lnB9oBB3!syT3g+^Yv612tPbkIaCnnKfon4PP zZ)taT&t)YfGz8{fl-d9jxL^v$u3hcC-Y!f+F96yMc0-rnlA7e(AU1YMLgItE!n~pm zZ6S5Q1#l-OJsKFPLO!e|)K|672Q7!PqDu!&O+no>oltQ6^&9n4aEgLbND&GV0k~>l z5(Wq_-gQLqzhZ~v(a;Efx=Jt3FZ}lNf5j=qRs*RR+~)}noHX$^2mQGL3T6ClHDH`!i~e5@;TL&6Ux-$S*LVRNX zq1+2Reg0(Vv5oUhSgHB>`NT*%X$nKINFN?HEttss60XoM;XWr2O&a}RQbRui0~MHa zR)9z|L1$DUI8VS_lKdGPIOZQXX#df#b7zc9~Q+7#=q+~BHow)A3ws3o*f?e zIZ91+1DwuLL%dmz$5cxOR zVfsuyB)RbDkG&p+Jpsq4E6yc--jJ3P7Z>Bqu@kTsB?l0U)BJSn;k;Zl$LYuS%SS4G ztQWKWah2$|iQyUSvVg>fz*dp9K)(+zR>X0Q5(dIIjDsbe`Wpz0yL{cQ4p?lQgV-BU zG#nnnP)ycWjxGcSCD_rpX2V<7VZ>n{-lBVy$^fv9Yls&$Q0h&HNN!lQ9&~|lVz30eIPyxcT-h8PW>z)tI{oatk;7mSKaL#L>hZ3Km?1O( zIDwA;ba-LdwCP!=7eqCDELmej?Y;Rg+^`0GJwM_d5x-Vq>_l9tyQ6-BTy%7t`~NS6 z;p-SB?ix-b7=*6>8ezm%>ar2B8f>~5b0|yb~99` zbun(0D)Y;Uj-5GjZ4WCeD=~QK0xJjh*f;~1yFo^V<4_h7hT-m*ES2VXg-d`ls#J|T z2U@-FS)-l8%k<6)MpdRl$vTT5#ZrVN0WBDrfTv4+x_2VtHzW^iRI-b4?3<*_pY@3E z$2W-GH-&u+={I4J0wsT|3JIOCgcY&CK)=$=7V~u1>{xSxP6_ zg|O^|$SoX>LK3r^4Q4W@2Z`|9i0~Qf0~igT~f z6zn1I?w{rjcn%EQXrF_m7eX`)aI-8c(s|R+SEUbJ8_Tvk;4X6+Oqy6B|C4mTCIy?P zD{EMd+hP8NG)IlBeWHN~6Px$l9~OjRKMC0-86q`l-ih&E-WzA8bzFT_)7D>`!F(k* z4-dt|9#42+_~GW~rvY}U^4ElaF)-UQ-+0?>KPEh&eC&c{mhA>sQ5W_};wZD&|G3hv 
z9OtimjE}mSva;aSTX^9K@^s?w9qMN-Aq8tzG&s0B{r7gl1&WbZ`(cPgY&cj;FCkN{ z?Q@bmb!u<2uaA$3dI`|PdS@4xH)BF6H`-JEc9X2NBUT`WQY}9SStgL zE`vZ;dy^*Y8cipvc*jj3Bf@m2)KnEi?u8w|K#z0-og*95!&|o4;gk_@Hbw1BsW zyO5ZRVKYmlTHWq|<=Orr_-}rycfNoBgLy_rM+aBCiuHf`C{n8K-6V%lDa=$?1CIHRPs{@PyuM|7T9K*-cK z0@;kI8XNKH%e340nVYb+{@XzWN-+NHLvvHR*awtc$lF#;Aq0ji#`sY>VS$BkcgDS& zm_7X$HZGlQM(?qi_7#xXFT_sKz9HGTDZOFAM2had&xB}=_kOMW*sE^M&)|tfEaT9g z@u0v7wmYBAzmU%eR@ZGK;MOrPGI|AIUYOpGZ(rI*ZHBwtzSI=eWlW9Kwe1wdVIKh= zjus;?46O+@oCZ_Lo#vfotho~l%R>C4Te2zdgE+jgS^FY*f)bBdDDADRt;t*vJl%1G zM#BBf`TOrvsa6#oq}-#YcR8&OP4*e=y-JKR!MKP(gspBEn@LMcKZeoB0iEKl#4wyD zmuYK7{5oo@u%<ln$RtBLNqU~JLd}f!dU;8GIAVpk==@@J|&73byPE8FrST==`VBqX2Vv~f-aRX+G z;>aS)-Shp=**k8?Oq{DxAkbhHMp)t4JbI{hFf9V)OL*`}BkG%~{i0VM_m;Ql0gW8q zSmqTizJ+nGloPadk-m@#0`QZ24igg&OsfQ40OJPWQ$p-LhmaXZU_L#3ck`}^U|=NS zZNqu26s7)ioMD~V_i!4M$MXgd5rw22ur6!$CtNfcKqP86P6ERJbw+`1nDMWNvw{QJ zpCx1QdbwwoX%G>SIxQ&UN5INcE6;G0?O|H)>U>3dvH02exGN_;2+*B`ql8Eqg(S&@BXWF|G@Gb7cT&__^qvU3$v=w`lM4cX!wGb zC`ocvE%kSUqwPMt8b3Dor-svenGM)=vN&Q48=@%r7vtCw#w5pomeBaA9?zm%BJcJZ zG_cP965ZqWxLoFifBH;kAbKeVugQ99xZ1Gc*Q!d_K#ox zhoL3ivR|DdJdJ-&5dCkT#=+GitKP+GSuT7k-L%+7RdZsANF{@sqGqSW&i&hOGqqeb zh|RC$760+$H$;BfYW{z{ivR69`42bznO4bv9Z#;PnfU)|eQ?e1QvJ0)Q_i~2>Zv_) zEsn2?{u5N5@ zOT?-#ouQ+9VN5-CSdRSUX17*dRm2XhO-}HkeNvhH>$^>&xadd7s4?madirX_$%Ju> zg%1~xc5ThpK)PR%WT?8?DFD0}PUMRb#y^Q__`w#z+W7AcFkg-$F&5yUUg`523Xw|N zBPzX^l7Fo*lO-OJnc4XFus~Ezu0r_`Mm#c6b&}#3CX~^j9fzOKzG-42q5A$14EiFu z7HTfKSV?8(o<}GZHay6&HHcHXyvYftJW}d#v(c0D9VCDq>FfnC7~j8Yr+P^!<%0Mx zfTD#J9{`b!_VBNz>!`+qM+YOH$jl`RxBR$Up!J`Dh1vNZ(=`6mj?5bn8Idt*FJ@vA zG&=?NNh12I)T>1K^;oub;S=dBxK(e7L{x^uUZ2$HHX}XfV4fN{n7_mZb}RRhhO$zL z`yd{GJ9px*S2IGXyP1K35W}72CrRB(+3fVBBR%ILbwm(Kbyze>Zg_AUUT0+(+7+L! 
zu<(U%8ZQ?x+W-QsB&N)y^4q&=v(t8rU8@X}J$d{-19kWlGO7>xo%WunP1P;tWI(gO zU*l`Ae8hUB2N3L@yuGX3Fr|nLN+3rfnTiHgL70YW`t@rEFg?iW0vSkjA1;Ei6b+LfDsb1UhQIXo8Zv8QE&wh;8Af>8nB-wYYwvGJBXaVi4wK*SB3TF}`c(PKn|hJE#&;gFtwW5b4tsh%GS6 zgBI+9zaamf7%0~zMUkl;;BbXW2OI(LQ;yt_Kqm-EntB+~MHJ-tyx_1*AVvBer2#bJ zGJEIl-MYSuYG1LaEC3$~DCj=1c>~tPCI}<-b;ErPs$^N> zMhKrr|MQz0;Yx8A11<0-tjAhChB1$dZF$dcaAPgxE$JYV1CYkoV+gquZ^`_HIC8nQ+Vo%!T>jpNr+e$nzkNPDcxb21&?12gs6Ak%t4fpZ zzX1$I9~vcEF#0G_YAO4v-pnG_S0IQ9D}GB-ED3im4ysDr#XFG--ADC>;Mhi7YEfiD zva6$OB{2U|w%ygTidBqI;HDBfIvGW`YB&xl=iqDLR|msZ#x6t8gx^KVSI{j$R83vI z=-Qv_*AJK4u}3M)TA$+U@h~J@ga=MJ^(~}s@rnrfkUs8r$#r4z@9WMQYuiGi2zE3w zcC6Qg3+Cd*#1qg()~i;h+g`iGFDk=kG#rJsH8u{ftT;~NJfq%5r^q?DB^Zf zk&C$xw3qk?*v#(<#~u$H0We1%Mt+ZoKrKTIAO3rt)D>vI5%46+rt!eZL`EcqK}7Qc zR0vy(NW}?STG{$rZ5t3GFAN8ng<6oO6ibVdnIHstpkg&kCH5kMnGLp`A;ft+g}lP`s7mLh z_WTz=HjTG^-?-Q~V;BxX$~!Q4aI@2eX{9IyWv&g>GF?CLXY6mjO3qc)&WQ8f?;TAHB^Zj*F1X?$?r#C8_E<+iY{eQ1W+ zeiTx=5CqGWLsl;6#A;M2ki$MEKeC31CQ{6m6(Xl1CTXX^tco;1b3DVlnstQc&ez|2}`S+j1`#$oMSB zcsyMcVnl6)n^dz|`VvyNq z@3d-jAa=ea4PKdn@Ic>@lSIaozzzvfjSrPq%InJ$zmDvKkAob(@W=%JMcO-0S?Oxq zP&Sc(g)7*lsHlhof$!+dR2B+HG8usq26q)QIReZ~S%aC6Ct33K62noD66^#z7I*~G zN2(-BfGY<~q8tW@-hJcqw<%tAzH80n)fn6njB=m@a5Lz@Wh8gOqkWO)PkNm!w%=}0 zI+6O8BO9a#8Mz(tkiLM zZhr-L`Jj&TDEiKRZi>kx2=2W6T;(vtrjo*^2 z{+xgPZ;UQm#U`7!pB?<+spCgYD#DKtQ7~@>3I#d9N`&W28aRL_WRajWTD3EAYi0^Z zNG>&iXZ8BF?|8$+id?RaAAX>yvZzC9@TrC|{7!1B&1iJ0Melg&gb_Ha#Dm>3W8sT4 zafDdno2Z0I&1}{3rtF^^x9qO>DTww z4x&-F-vt6DcLVR58l}OSi*YrvKl#)6b|?5kZ0 z+&De7{+VEZK)P3oakqA862z-Sd5Jk>l1Kpbi;N5*7;+t^hi%IdO~Pyq=4xP6lHI>b zg`RV`Gti_zZZS>|z@+57i*Z+gF@8!h)d@U^1i4g_C&RU87=Kj`wf&9ag(KJ70mqJMvpL80CzlXxYQ)CIurv80xM?6MZ^vl|{{3Ry_;UwY)ThgI^&`q-DU ztkpih#?=ZxI8PrN;2q~=^uEn(z4E*A1C~qO=Wp=Picp_lU@G04ceMuZ8<9frnr9$W zz0bfgD_6s`RP3;W;S1kQch&abECj^!Am?I4UrTp)golPT6*E^9zkkg!tm2vfe7etn z^%!xSeYY1SA+bfmqz7?iIpx^frvDhTw(VpsUyaQlAGF%q{js?!GHfjEMjI@XxE2C2 z-BoscW2A!q3wvznpuhh5S8qd_A{L(P?2rZ|W=%LJ8DcVE%{w8UA*{jzhc??Vg2B1j0FF;U#WxQ#Y&CvGa`RoLEdHy 
zmL8JCksBC$4cVuF(l6-gF>jA(%@*Vn=mH^gBMHRV_;{Ncb^a>p#FG;0&i4~m2LM5W zTZ@kR&rGr{nW<@?buqt+{3^W zNMirT3qAB}2|djuTPPvR(UFe4_k-OtWEuQJVo;s`l+7IRAJ}p@8MhAl*Z63E3>jXK zQbNW%FIhr{W_(0ygXZ*h_=_o%$VJO>P-AlEe)OdDG3%W`N+>BnB%2($ z4V^k#D^Dn7pFp`PgOq{Pd}y@lVj2=Y!xgKWpav1AJ@<1|jiA*@dZM)8-j4!`cWUNI;%fBxxlD!K{8MK2;c-VLV!N{6BMTZ~^~ z>I$`(o{{eqid;n;^5oM9tPiY14t>_AuWJq>aaLj`!6na9dc++jLmP zdb#L5@s1ThlQDQX`i9i_yLz)G}6vRlRj{WL>qRnf~ALcAh6{#Rje`%uOk#Z24nY|6aMox*;(J2#a@aKtKSs*tj9JvVJrMpvmlljk!H4 z5&bK2x2*ZyM%VRf{%Md0=LUL$CQ;9P^8B6gcGx+brm)`3Cx_X<} zEoO_HKcj|JKu2iSMIk0*M1^(`W zzA&@N#nJ9O*AC9Tm-@Gp5(IEWo?fB^t=vHdHp^`_k|zudB4KK#OxP~8ir5QBr|==WrJsx6uV&bzRwL~E z@S$yo2vSU1Ph!$9xPvZbadx9vJ&p-{@7$~%1Qo&%2sm;_CLgeKR z6g@A2TIr@{5|Y|?L`_-4EsTtM%p^(99|K>DT{F{`MsO9@Pe=Rx*EVvdWG=j|$3lPf zP?1tvBUrCryLJcFy5-49P}JU=71>9qrNjgLqENwH+v_nf847S$uQA=Ktr<&#qT38C z0$Hd8&8Yr$z~|4OFHh`Wbrj9){VbzBwY){c3yN(;WWWc#X+WP6q8Cphw#LN zwkRCM`Ed72-xb0$>kx;B;wX7`0$BFXGG%(lJEe*e6epO z^tE>|@fCVf^ai5IV0eTh0z(iwQDLT}D|-OyKuXRuhB*<#nb{#vC^;DUjUvHScnO3W zgCX1r%8Kw#O%;1*jPz^s6dOz}vf(pX*FToG(}iNyhi@R$`I6?B)|>CGW9R!35)Qf$ zY!^hJ;LcRQHf$4jYXPo&Ko~^knTqjWjSe$Y6hku15=Rg@til?b!OD<^aW$DSh%E{y z^_|o$$gu4I)rI3(dUY+OI|UMeJW|slo>Rfc5k;V7oRX{CSbP3yUoq(Ssiz&j-D z!S(FYp9AcGR-u1>1_NVD@rE!v`XK7CbEHVX*-a3d?MIz(b_;FM*i3gQ2`550a0t@E z0o;R0J};0)hmMVd;}n66Ff6GSK2_M%G0bT-n|QhwCR{2*lf; z8p=j0?L$8vQ)hn0pT`r2X8STNOOfSOA||s^(MCH5tluPQ+nck1s`atW8HFq&fpl+! 
z1qSXM|3S}4v|$SLfGgNwK+Ak3JoZGvv?{;4S`iTlm;J0NTHU*A7|hCX3`?oN1P|nD z1(lWZu=2ssx>8VUkY0qX%9NSg+;s(mydx-r??Bdk7{5ge&a|7r>S(!o+|iBzx5}Di z%#dk`NGJ+#HfvGLze8_`j07P*`+%xgZKiOi;Q&Ru@nG-yWr10>2C)i+sxAOl)nO^SNMG z!${iQp2Pv*U;V;I=7Y%r7kPDn)>=R8R8 zxCKe>k^5wH?XzH!*%csub~}_q+tz`(7(F|5hmeI?Sp2s}|0}P7yke3kmQ8Iorw40C zl}sLo3GU@h?~hI(KHs;R1l|}@L*~QHwsUn}U7lN4-gBSIT!;aa8#S(PU&gs;z-igx zR0@X5$O6=nM;$liyF0k!Cc$ ztDnWUWUhE4di($1_UJKJh>$0h4b1Gxj2qN&ALe!lf}693P45B7L3TWi*+6c^WZ4LL zKDSoAD+T+8Ld8BJ&Iu;3cWfJ-zz6)4j_K#DWRojYc)9qs1;bY7{WGvE6A`uvdjTD`$uDr!av zs0=6o+Ve8BdAEU^>?>>~LmUon-nvByO!)6{9F#K1YiL6UzzZg{XJA=X1Up0la;zO> ziC_|F!T&~|M0xs?qmNvYfijs%4@#!wXd-S#7XNQO?7Yx+Ag4QahAZB;QB!6V4q{yd zI*#JN47i;IPe`^{pkF}7I$^3H0WdLsaCY_&w%{-->61%VFqGny@CmD)^{R0uUZ9Sv ztoN!)UEF+jTiUUcC%tpY*2kEN5Ja9v+h{<4@YSPXCN~;45qJzmp1dbfDr(a*NjZw+ zMuuRYBZL@Gh6qN0OTrOui&>JVOA-ZCcNh@67#%teF0Ru!B&8HtkToe_wjeypLXb-7 zqf(%1|8@|QGRVwBC|Ba`ghy&FF=FH+$zWR45ZH-A=Kz3K{Av|hcxZ}To-_H&TUm37 z+{Isj18_)EsQ3;=5YSDr#*9xL;!Q#da_>mSZJgA0ZFs6vWD31HDuWE z>uWH4e*X3AS3WFl>S)u-si~=i9*u>Dq}CkW+a_1W2kFlRl)Qpg1Hog5_U|tf7R7;t zHH*QXetx898a`!5An9uoyiPJ}F&7%xzo*L#xMIi5cgdmSkC_gl>QjqTls)%r(L~Nh zMSX3&D;z;^Ps9b>(4sK2Mjr*oP_lQ-uG4SV1=vJHI@HuL?+7mb~? 
zCsUcF4%CowtmJ7?%Ls)8SyX6*JD|2J*gyDDI9y;(*LhJu@&eh+kHxWZikg4?_#s!f zOSu2*fJ4>@Gp6Svw$T{=_~Wr2%kt5x2>>YN@|=(BKoHCojz(=kJWXJVas>i`Hgp-k zpTja9LX%NjFTZ4=se1h*x!BL2mF};jb2J0a@e%b2CBz|#*|Cs%D6J0%zC>d0t=4f& zT#bh~tyA(5W)`;?|ZSJQR0M#bPzaZ~g^M)3R@sk6;ItobD8l zCbFNwADH;Qi&xn%3qHbTl8C?Q_w4BWE8%BHE^)8vC+CGE|)?@ZB&J0gqBf)J0iUj zDh)j=^oO((s_9ymqS@T`9W`a*`Rv~ae-k<369g_$UZXLf3xbPNG^DZG&S9^{v6KlX znmVe}N=@kttcB!VJd z=)>$+`|5X0c@0NmeQ;b6`{FilJAv%J4pd-wWDDj3p(+LMj_x7 z>NUf_+GMig9&emgDQv^gEip2^`SeIVHbJ6EtHfYjR+dA2wjbUVLL^~YTtS2+ttUB*6qD0&UY*`=Tu7hFA z^CYn}jhJ(Db3;EJ{9?7y!&S_QS;*ov+RLQ~+iu|d5I`1zs=zkR(RXlgu%L4-&Ns{y zM^Ba}?0^%Vw4(9k%gBTwVqFAyS3r6@9`8@eVJ9Lm@8FNQkVC++h9-{~@I3O95E^9B zfn(Ub8C)zyWEET=MI6%ufp>?iB!=S<@CfL}blEU~dD*XmzB_Zc+FUF4VSmhF;llS;59zSQI7#*M+rNCuxoU5+9|zf z{Y9f@wIe9mx!6L3g^sADn$3~|vBO$02YhC#-+}bs6taGQWAqT_ve?Gb$SKGK4`d^N z)kTlm;bnwF1+_VKg!Jk$kgXD3J4!o-S_ObP5ROcXTpxiWKv)h=?KyyJGjRFI|J*!) zjVD0^87PQMx6LQ5RH6kYCjqLc&lE$#f50h?96=UI8qrpu^X!*+XR(DgYE|ojd^S^C zDk?b3U0b?Dq_G_U7yTdh-aM@5wOt?nCgT#BH6TNV2AU(9D3nYMWGXbN6e-c1u?$U8 znxhoWR2nsk=0r(JN~Bq7*8HAV3+vf?AA7&YZ~ycD-s5;2dq2lo7T-SK`*YvdeO>2u zp69jOow6JJgO0iZe%jCRiEY7PSiV6iMncM-bC_LYX@g`|e)yyn8V!gEibKs6kzRDz zX<6~vgo!AEQf2~UY4;LzIp$w$^aifY?#UJ7) z3oK%E5y(f%E%g5HK06Ig1}FzQgL&4yXmA)$wRfki0~5J6R-`gBGta;7!iZBxG_drq z>3)7`6fFz7mCw3To6&&#C#1 zdhV}RG{iJ&yr?WiWRC zjxU+xqx%RG1IcisM!MAjbQunSJA;J{!){(ZqmQBEj7%;J(DY{0h8qwCKxQX21${{5 zJ#eq{KpFXHs1C*lmd9`)i|@^>@{Bq%djRo~Ls&;4HD(YIjM@(x$tEk__JDHm29|?# zj}m=sXr!kUKT5yIl!2oDl8DRILmA_(d^5xeLnaTcfotCcQhEHid5}_x6|>sGEXd zVm{EJk^b|`ZIJbXsejx8%81M5U-Z|kY@jK``rIeg=Yd*gEhwm;P~W1gegn_DpjA6v zn$f}~b<#T6KVdaETP52u*Q7)6pqA@eaF;l1Q92G$4x1~}DJ#Nq*o$+5f75|Ts0 zst7?qb|I?&=7FQ8i2az2$h)@M^ylQs)@*|m7Av4cQcfIJ#Qe8-7LfyM8CmE4uIX{` z82SFdFIrH{poNqkiet=+`k&ATSS*--&SE0jiaUZ}xVon0>QAFcy7Q=C7BU>~2s!4- z@*yT`L9nx=1{>~*A9XLsMSywhm8HTWd-;x1Ah?S>9?xzv_K@ZIjAwO&G-H&pVjoHs9KiiQ%@McwJv4(0QXl zcbJNPn?M{K0!(K$uBJQnj8SB1A%v^u}pD~+$2!d)vhWZeatcJh*5&Tsp z-({> z%p)3n%XJYm)<7fB3b`Bm8)~t^+ugF+obpr9b#xMrfL0L=fHhpF!2PhshR;9HXn8=R 
z?aD?k2}mhbVp8S=k%hYj*ATV(?i!PyhvKfD-LhHA-x>nFW{|lc0_fhZG{|1`ikQo; zO>DKpje1c*ekr{(Uk1jch&-LlXwbxMz0pf|!aszenJK#}95*XcJaF%e-c->B27J z;1gq_|65vMnT11-Zq_>Gm9H)k4Hm320CnG5TO*M5ha+ZvP+s}@2dbKIF!GfqI!G)6 z8e9Hf(1w=3zF$t`eQ2#YssBcR3$6wHGKKtk6QYQZtLk5^)RjUKK&T`U0PlpkN)foAI@^=bk(HGtAGFHN zi#qJMCEU=<1^^#DaSZR8+%Lk7;cXmf@nwbxccxWeghb;<06$;iQt$^VL1^lIy`{4m z*fli?(Je6=iD2aEMM??WUceK4>b~tkQ0(^8`-Eu^_GKHEi3&ELIem5GW9pDH6P#}} zIPa3q-#TvDWhNr;-;`AO$E-YUk=pn=Yo0>n*ywN(YB)T284eP`K=`)#KsL#(1FA{t zbRx*@iHW)PS@s~%raRo=y&=PE5Blzf4iiqJqB~aO_)7|NEOOsW@tqWYh6$~ZdA>n3CbuY3uTY@E`ih~8 z3}z!@rA^PCg-|aj88R)o6OF*&I!(m*a{C`e+VWCAumF*dd$*@hd`IQgs11Sw|IB_= zF`)!1pruM^CYS?Pu3T|9-lFw#dO9c}hgGIsdUn&*wGyq-)rutbuQzmC{Pd5AuIdh8 z{jG=>T%_{dC1gBts>9V?K7iCK0x@JG&&Z6&i;)F_WF#?W)v^Q@+c zaNwo7D_mFo+FjUT7jiWF)7pXB0Bek*$_f2HqL-qkr(qw|`vt5ZI?!v~SD{`3l#@T4 zS>eX7q!53Le0aXxZvkE0<$oz#r0~co82sHxW5CNY-WMQ8i>-~3Qep!N1*UPp_bQ3uP zrKkU2Dl-3kE1)0$lRov|KkD}u`Ckfr|8H49MVO{gUW3Mg7{VUagD~3wLW2K8wP}+& z%!2D0w2p6`odTdhqEQ$D2XU$SW|1X5DWDlKwFy=DQLEas)+98x;0H1mJMe zyA=JIokU8+H4m*FzbznV0$BI7{=;^()yfU*D6LWo13pGItHThAe^I$~JYP$6Qf=AB zI-7@>@Dl-AiF*jjZ=%{#A2Grz%eBqKMQFzS9k@s=w@IrIpPCY>?yb#pIN?Cy}QKJlZ z&e+)SSrT24&dL3I8)_L}kO{fLxKRS6t0bCp7WfiX{&3@Y0D#E=wQDsJa0nBvj!XaE zhft7<^mCIP!_eoetP__dV6~i4?wZ?Y0gK&)-3OHbKm|&gkbOlUS>yg#3EpWtFWGp;Ut*@oFXo}}u`Pc-yPY#T(AggziN|KgBd}uxuXfyb$jY9e=JIv*LLnbg zf@~uJ0z^e>ltK)WsWVJlNJ#72;LmKMt$#kML3*;acpSuRJ;|V9g(qS~(k++3TcTkE znG?#uRpfo}jwok=&fLdT;$gcMo*0crH8PCVii9wwH0bRZg6{ZE zW@aWg4+t8dMYKe4{`sjuJxX_=zyFI-n21W4?nG|#QlQ+#n96fRy$7v_ug8eOmcaPM5#XiVX?>1TsKb#aYd-XxD=@02;mc zyt#9O(3HYIvV)zc!FJolxy$>x;IgHhUxfxv@NX8+)#`X*ao)%&ujYDJb28iQl$qK> z!-CqSI{;EA61KsioTOXtK8PThiU|1OWcZdGM+is%B)UPH~o){ z#`rtvmk>Y1{rs1gGoa5}^Wg%u!y%9@&O`K3D9c5b3Hg-=H(4Yh8!zaGqzdVnu5<@- zm>os`1e1VORq;Up%6;;P(2f5%KOpHyG;xduZNToFxBj`$o9CSwr#HMEBp)Ymx|PcG zAg-a~$sLZw3vlz@J(_<$kkRgn?BWC~A?8pzb6agV>6U0vz6Lav)tl?k9XE5RWd?+D z5CIpxoM3<*gl?jP_7sOA9IvH0@<~P+a3OBWN8do5>8`bhXiumJ$A;_1X;LmAllP#H 
z5SNpvfv7CYL7?OQbiaNhSfIN{|2Y*|8FSl-j_HJK+yBcT67|!7V{QgU0(EjfvCkbz z^ExmC5uw|#rdC8zeunxt_@2P=7!nsBshq}T^8h_9(BtILxSObp#ZhVvK@f~`HbCv4 z9dt<>6*=c`rW?uTP(!Gr_3VK#1%ejQ$A-|CCpSxQ>e~J=-N104Al3zKn*t{gVyzL6 zkWB87m7?A__~A$4pL6bZ*E@DCG0=rsavs?xo~_fwmr7DR4q1i@x@|^fe^NWflKI5? z`u*$Q`O@!|VEiXfg#W>%x4Tat;PHA4bTaz3E04%w-Ihlmcj7us+h zIm9IR``2E9lQ_G*X6M>Ob;fmC`2XjQl_&A_yqQASJ z@A|SG!>Cv88Hhc^@5A?xw3$`+wt~y=v5ok^OUw$k9BigeNQhCnu9;AIOG-(p8r#G5 zrzaWYcY_Sp`QmpDvg%#zu48eF|D9G}?V4USEqCSbw3<{urF(%pPAFt{Cm4h71S-V} zMKW4L`4|`X$4eiYPOKRrp*@@U`S~jnVaK747S@EiquuXMa_Ba5E*9ip5@s?XY&ZY2 z3i(pg``aP7{4Z(oy^23C@RGl_P*uX)4M-@dkq6UTH_(Luh4%qFUt_QsfH<{?QT0GE zuo)&HWKqPta8;7_L-j0a!C(5KXT;DXJ-^GY0Weg(b-s_PF%QUAz| zKioV~$^|aH^`P=?Ol3-HU@((D-ko-l2 znfQjEe{*vt{_X$dLd}<>vB)?K5{q&0jco$ zjy6>S3X$SWwyeD$4E;yT&t{H3Ldwj8LI)!KLMTWm0ul&~{%k6G1b4qApjr~_%mm)x zSlb5v#FG!T=!AfWTe!bOcCQeN98&9dFz4v2F)O8^%;-lup{PeogB3m|M~J}`bvXV! zSQcu-sVE!4!3`x>B=q!QZ1@C#GI(7pkb&<(=-byb%%MFu`7C<5)P}>i3IfZI7fvGJ zbrO}AOoC41g|~E=0l;XqP%Ii=|8;JPbF|&Ggm1&O%pjP)`{hKFkER&e8}|09p&JVV z-bIk+*5PYbNVCP8aQFZ85ey< z9EDLsYpbEA2*`w@6*F}1Mdy8i0c|Kbt3na7|IW(o=pt^gx3`z-DHb`Bdfvcy=myvU z;I+Y3MxT%orao0oYWN&J{* zX%;bH4V6(zB2Xj|>deJhFAbZ`W-#9?dw#<}erBwnx&qdIDNc zDdqAnxhZA0Mih2@N1}5r}-48a?#g_fV|t*WJ*YLpcmb;I+IZzVfS;B3N`C}({43fR;qS3vACFC5Uiz~s zN$S?3^#mHW;GK(S63vLbDZxxta))7Z0sta-o8>hQOlrTmKJNosj=d-jiNEI_LX;Rh zHzpE0Aee=Og{`AG5X1a)zcT?i$n z64>F?=_e9C8Gn(hH+4P0(Iwj=Sbizv_ME3-Kh$eLg|xdGT$~dO%6AaY1^XZPX95i3 zNfAa~Ay7L|-hP`&?8Mqj`wAEHQ`SDx%A!pPHPN$*bg}~PzvsFwCh{nK6E4!BBCzY3IDQk zral}_D1i`nG1|~!Y!+Mm)5ZNmK0gTd1fvy^n@b?QcS6}kFd~FUr9YhnKOhtQ7wNUN8sZ?@?bAuSuO$4V|@8WYrH}g9!yg_2l_R^`lR4^> zssxTYi{uD8H3eF~PV4xdR40yJy^0Zh`aUcum1^gFjTB=C7IeOCIXc?`1~K!WZUhdV ziyxzEJb&@xQ!MPlFGo5vfGsOlry(ob0eOfM0v$X4VnM$dj9(ri&(xEFHeyd#pwsq- z0|z53^Z{G0C@{HjL7cL&RR~wF`oq&7kCuliKV(td0VTDUFYZ)*J@o>i)lZ}kccT+>%hd)VSTFhb>+YQK4tEj{ zZ^HwKeN0YCVF$3XZ}j@4!&!^8yC86D+`SQ;BW>(*#p;*2xg5BsHx35~p4=8^OQL`s zU|~2zAd#Dlx%Y9#=wqLiM0Z{9IBxcBw-SUF^@v}QTJ9Qe{`4Z~sXeS4-JP&$*}+^% 
z6j)-ioSpp?PKVGFOB^#1^YM3MbGl29%VN}|6Q1@|tKFrCH{f%IOIdGhz#b5od=su) z-$!=Xp1S?YxTqI=M5I!O3Z*H?{#oe_#NrBCH*R)-RZ6|n3W_INJMC;()H1sC-Mvn0 zQq_L^IXD=~(YUs~6PAB_5@%^Ia@p`$X8whivRCbVp{sv_r2H0uSNtdRUGEwXJQZ9~ zzt|YpsuOYry@(50_sX^`LZYr4KZ?~+HM$6+fE;jk-Eu@A=s~ zHAB}MVPCV))m3^(fw(Ei?+MYpE6wgITOIPvwiLf4$B+nu>89E)P_Ma@ZUD2TPKO)x z5=V9?V=M!hxC-uIhTm(odxI!0;)>Fz+{P7jDye}QGqMumEJj5)k!;X`64xaM_e z3_cz3x^`U6@<*+QG9X=1*Yci0_gL?XU^?VK1Ag0lb8X<838ODGNGc)WhFIk7k{A|~ z-INrcWOENema)s7AXP;0FhKgI7cY{t9GP-~Eb!9E?;u)URFoy?7Qv|w!&D>irJ*@D zO|)UZq#?p0M?*Lyl%60CLsPd69WPc^!q&M9pl(7KGDvlay^;1gd)Mr;(E(BTmLjZ^ z3=3@y{46iDOz7&xOAie%1qWUqoMm_LHP?hg3GEYew`+Qc#*x|q(l4VpQAizM{RYbN zHN#ulaD(+Ct5b5vHTM`40EsUADXpril0R;x*tSdG=~db*bPXAwt4N!P zjo*x;ur=FX3B)D4V0ygd-#e37xgRyrPQYz#@D{X%gM44 zwS#+PM}i4#aqsNG3r1o8{A!ZEC) z{6j(yc3~uBUVG#VQ$9wpwLQk|wbrs0_OBGw;KAZs&nih%dq~R(E!t{!DXa6oG1~kq zpD6qc6bA!yDO6s>&O@IALaztlq8p$iGz9l}bL1AV7*vamjVyHm6J$Idwo2FOtkAX2D~E;2Ple%eQNthlJ$LqIhn zB9K8wKNF&6ynV)Fe2AhWE7yBRF1s)&D%N3qHW>xtY*AIGSVq(yH#awKTM**2N96L* zcruLFz0QVss?%Wm1S)WH0HuccgmE27N=f63M=m>-$Bejo5OLK4_iRtqr%$6n1ACJv zl_LbI8vB_@60dYQQ_5@fx=X%7u67#vh+;LliC}}MxQCvGruR;aHB5&o z+6fh$sj~CywS_t%t#;#K?cZ|)79Ti{;DBF-THD0tV5Hbv`anPJ^+AX(EmW*t?mSkB-FUvaY031gXnNkJNh1RL2#*w=4z)I%L{$gBGyEP^>1I9GzH%RkHSp$5GG9 z(QU950O*1ZZ~$@%qO8-L0GhRf%=A2@{MQ{ECL}K4O=AB?;41JAI%&ujpFcCMTO?@x z>Zpn7A)^g<=V!2dd0~a!Ue1|2JE99|3a0-+@Z^>j7V*d4-X`2cHzZ-lAUJ(S8 zu%gB+6{4WOX3yT+lYpq|c!E_HL&L6-gOdB$S>_)54NoR5x1DF#Rt4GtMDV*~jnTg= zKC4CW8me}sgib6gy$$z{vr#p;n|c=ST%H7N39-@1tD{tUcsDs6q9;!tj3|`WyiHo7 zs-1A80t&3rA33b2@$|G}BEZ_Km3(h%onh!#2EM`!~{VEZ;<3@G+Dtwn<_Nn+P zLU}b*Vx~ta8)DP73U_BL=HhySmC;9GOn?Uxh$y7&kuXW2aV4m+D`GUZkj4e;?8$9h zldO!zw$_)zW}Je}*xb@$uuP`v;$|`QK@}5S8l09$sEiMm+H;TANhhP74Wk>z&TgOz z-t~bMh80lRiJr(}(9A}1jfIIjD;*jPDy%PL@a+W$pop*FH3oq)q=TWzivD}%s2B`3q=|(}AIuBnvT2f@Fq(?AImCUM_{Q2E*S)0NTA5Y_UcMk#e zT^~^(Jdq0k7H%*Bk|bB6qAj-FQ+AtDs5?8Cy~V$C>yAhn4ebUVtLan*{Hzwz%TEg@-Z*AW-nlLz=TU!5GdML=&Q_oqV}q8bHb)wI?TwS0=^d zPQS*KeN~_&1X?^1EUYigIuAh|$?Wo(5=&T(-T5*I{>~@K 
zy=}E^BqstNfv?4mE!l2#NXP5b9==?yT2Lyp)&S)ueh%uPy@a*`r0IKNKdOY)D}2W} z&u#JQ5xE^luB2WwF6$23=MH$i7lm^nq%t^dg#HCuwek>o6sxq`{_Ul!yOPu1K=-O@ zJlttNev~{dxx-O|({~3r;TJxPIQj9z7pW3KmY$Sp+CNUxam59ol$v?GAm@tI&93Pq zs(Q5p8#L4Omo4U1YYzH8e{8nj`_NeS`1}&1K_9{TIvqO{t7^kHZ`=^PDg|=Q`RH9j z#?_*z2TJ@URIxO$paZK(1(c=onrWZ&CtN)rFJeezc$;WCE@<}-*N^zd5lD@FVy?z| zZ^V7e@aqDwh-44hEO*P8z_oEeH1lk=r)1^es2103X$FJ>yVzs5jNxAk+_*Te<#0^) zt5@48tpj(8Tags*ux9sZ_EKJ+v;_)$OaJO+Q_rS#IlwPyg(B%HL0#pAk*6|ike#&wCGp@Uu_H`t4j(yc%I2D7>tkD|#MX#gs3Yh*s2``7EyhUPk^$Nl zao83(-4oVS9i{sFl0kZbBgFd&LQHB~Anssflp9*c;;KnF06|~bC)UsUxK`->?Mju2 z9LWx};h`ZlDMotO=hy_kqNUwRGeJTEF*fTFv6bP6D24du9)cQ@6fp&jB@C5D;Hpq5 z94Q+;tNEzEs1n;TY=ZPd!MDS_N<%{<0dsUnx>YtR<8Z$9T|v$KDfuNhYbh*`xy8=Z zZ2EA4Y7^<9=2eOV8-?qzH^53ZDnEkiHX2E8iGb}W%(}^7^_KC~D_4rs6Od3Cj%|%c zTS=JZYmf{AxX3WB7yrQ##JutYB46iP2CT0gnD=@Pl+WiPjNvmA1i1iq70Y2{^5y0N zx%{T9j@qs1#Y@J;wOSFE@qQB%lekgj;ZWN6xB>_A396O>NJA3QvQgO}ICl{HPWav8 z&+8~O^`uAPrXzjUE#syVzf&Dt8V_Wds5WivEJ_55`3V8O>KNKpn2HoJQNR(5M(`UJ zDW3wo5WcE{^MT*E6mvSj*lCvX=U(oe#n@L|;xfshYuqvbVD48&HA= ze!aV)AIi*Dupu{QemVws0gwv$6rZCeGl1&Yy+eSU$95o;RP=90V5;G(53~SGFEakq z`wj9HTo{9KVduZNEJ ze4VS%?V9|1_h$fU(*VM!SoUA7#5y`z-$Q`RQYj7kW%@{W(S#d6&9gugwZz0;b}u(ii0NG`3D3e}Lz; z)};0d*j)8ipMEkyXb0C6jg1vBqrz+i3{KgGSy`39x#)IR9PsJ_NMPq4nhBSc(`R%x zkhyIo88m_srq+qgU#{lWw|kcM+!SR*eX{zdq?txc0zn~XA4pA_@%m^wcg9dFpIKrp z66E>o>wQ##7*2q}K*NxPh7lOl(>`e|5_J-|;Z5I}mN2q4;OBbQQ<2hUyo zMq?`>y^{j%T_PuO?b4JdtJbi^S={q7f@ars$X#Ubnf0b;u14kZE?VO=P59}|jz0=;pOyr!n+5Om8b zBmBv@d`OJiIQEua3fU$An3t}oRK|~I>HPlsQ1L5jra&+@OR;5W#$SK!Dsx2=Z8?U6 zA70yo#C->;aQe~F6Ghn?C?+uHDKqs_;LY&C#N2#O-w=K_CwOQ){cq7miT{p|QjH$R zZ|*?aRoqYT49XVIZ2MWuPo3|mktq541SN7VD$u*%*9r;>e)=kra4z?dyyhYDU-r~IOo%A91BT;ecr98c!29S@T=jC$4$k7tJ zpj3VIbLdyE>Wv^vj$r}^t-GqNK!0DK7$s?jGBd`H-#PM$wL=1LA)dUG;lprC?Fp07 ztApC-;OxhmzG3h#XxJwk<6zsv7KYxtC)D7dw}oh4>Jw-34mc^Nv4kXkM_-5+*fsuQ7;Z>N zE1%;|dj{p8Uv&Em@9k%_BTKJgYm$r}umaTe41MY9V4=b2b`u(eMHRZYp^v!GC>Z1d%b*D;)>K3avGNz z_{X07(x*Ry9khP`_ifZXY 
zr|6db)iVP+N2dPZ{1UIV^+Wt)s&;`jw;=9GXb8LUj@`fNaSSR*gb)vI%1HU-jSUQ)LdZ6royeY~NYx%D zkvfWMBmrV0`UrJ)g7*XB-$%|O@X8>z4X$XvrrBK5Q2+o95nMnCP866d@j{S=t|4j> z&%bVkCZ#%l8X9kWKq2H&-`}~b`b9>}AMfmW(UeO|E`nL~e0m>Uv*d227p9KNol1R0 zOimu>Cv^($%v1Kj0Y4n!V=Ol&Qmky?`T7@AsJ<2w5_0Fpj^E=gxRn?VNS=r)#3Mq=+2$ zEoCMMwPd~Z9u*_GP{n@-C0y%u^(F0LB&+jFEHFAHf_e+MEJ#^GxG)JXQ30w#g-AVf zFxk9Yn3k6U%Atk}8IqIS2F1*d#7|qmu0sYUCes7;=V`=TD_AS2rZv)hZQw1d74d*^ zw@v0T+{m(uk$;!6XS3YP@KQ!UoQxqB3TR3y9!2^Eqc*vTn+bhtq+n!f@d0BKbPiz+ z*bC8ylSO#8YZimK?ExAf0`Cf7oQzmhBSd)vbZ+O~k8+zNB?QI5aU#pc4i7;s8)MDs zrt%0y!M1tuZXk;iVsZc#QYxzj+n=&M^0YvqKwjz}4y|WYIntnCYUZe^sRi5ilR9|) z0PF18@5$&hdAzbVAn$$NkF@-{X50h!*dbXI7RywsqSArtR`>-)gPx?0=lF6Y+XpoL*ZL+}ZRs z`>POhQ$|y950Qn(MYx;?H`m)`Whwz*TjN}1`;LZ+*?$G7w0^*P%(cpxmzRep z8~Y}?5BEpgf}JekYPZnq)@56ju9&j?|%~GfQFo0N{s{9xFF@DjH|r z;>H|!XdYgdI9vCkZ^+UZD8pS)^=G|%x4^~41*?{gd-O&`1g~=JAgYyX$Xg$vsx&!w z?s9lI5C8i0^(`$okz1ZX<#xqPApn$?li*}De|o+V!sIEcs;Z)AYnR~ky#hpk1OL?^ zAaf}ANzdU5%ZuR8?GD(y5D)V7*)w?@LfSfzq)*0oJMyg$sOX_|D>01dn=x(LG(0tp zfn}4ilLWsdJ2cXDGmV7x-g0pR%wwi=a&jIzdX$C}(eDr&ro!%4f_C2QGZ;pGhok(y zgxOTEBWi{KEepbt;_XMA?i}RSDcb6qnl4W=hIZ*%jh$Oy#)7}Q{-;wg7D0C9ofF8Y zYC7Rs;Bn6{&iK8J=E^&t5&jqv*6T-*fT3f{N?#MZJ^A@_ZFqNWLMN&5YVgXKHLcx; zRo8YqZW+ayQ0sV%*Q^rtn8or_npxBA3>cyGcGhuW^qioG$kl*=rSKXPgpJ$?@|7cb zk{3IcU|QZJaM-;4{2l^dL@GE9&iu|JBQ^}*(r}KU4RQzE-QAz+zg;eOJ=P5P)b zk-EMCSQ>RGc2Y`83Ws)B_2Vm7Cc){Es1uG4AFjH%VW0n|xv;mJ>nS7cv)ch^^;ZyU zt~_}n0Gt04T3VB^m(I7npM?@W4-t7ot5q>u4z>>gRuHu?_4oJF14l;k!2{a~mIEd| z*Tkj{ZNBdra1lKSa1dS~K-agn{)IRH%;>{BNZ6`U%pSk*@dHGX>5U5sCD1e_B_*iD zAE4a+35S6@0RbhUk!_m`<_uVtJ_n;?D)z#c*GxD8f?>Zg3WGuUf@i zP*5`PQ%3#c5aLtjx)|8UW&RWODVg+gQIDV~fD76yJACMk^_N-NudTb53DPZ+`W* z0U1#gn*SUF;959RCqb_`!elE3ZHLDD0x}9Spxh zvU=0=-aU?Id%xIAZ`$N7lv{Jv}`mV6Q8cw)+F1GwE`6a;n0&5Vh08XtgV) zFlK0d_8quccE-X53)bz)lbL$>@L`Uxdb+wZq#wd4Q6n-=wPwvYj3&F&>T z&~IE^oN&V6K_@hTG2Ke0x2=-3v9U3s3|=Q~&z4$STVoA7*1t#2|Al5O@!aFZM4v1~waMvT zIRBqpT4v0jKVL^rZ{^yx(-C}RajMqYXOe6U!5$M^H$Y7ud#PU8xP-R7-p*Cuyd#2ot{eSOyvwcv4e$Jh5(Vc{-? 
zP!*7Qr$WQWg%Kvv=Q7)_u6TVa139%8E^|gi8#kDxm{7AEaRT;12-BZYw=IC7{$-R- zg@=$PyntCmvSmHv0lptTpFERkkt8jCFmN#Bb#+6_QbQdw+CuPkzW^R*+AmeA0f+#! zvCBxbzMx#k$0j;+$t(*D4CH9anj6`;c@hJtl|yuNban560|(&9QvV)SolXR$KKqVr z*)$dwjdM9Ui%LFfD}d^lY(Gx!uQ!pHFfqu$BhKkvz%(@UvJvRJK4&vCD$Ei2IbmIX zCou4}h7#OfZ|`(nR*x!1P)O))&oZ7$U;y4eJ}WkETx?)qz=+6{i^#CnYCWinKt8z* zc-%|ilo6`@x=>RJB86SD`TULH0Qr@dU0qpGpWVc{4G9emdiHFAlaT!O?JM~BW+IC0 zYWf-31zt&|FK6M$kDN&9=uy@4z$KhaUU?tKnt8Wu;e;I~S~Hsmw%`S0_UK&HGC3du zyWP7t6}JjL8or-g1D$nwXi;r{1?~)4uC4Igby7!XGMbE4A3v@V5MV`QK9-7}8wWCJ zBH~u$evH$dNk${H`lhLsmV9e#n~D3#!^7h<(5)INK70M++rgvDf)CH+RmxQ#K4jw= zaXRt~3bN6ZVN$bS3yka*ZZ++nKk9?~(>JiU+Qi)P@^S377M7OT6%`9H2s0bo zHF=p4qS=ibH?oL^E*=DREJoj)3JB>OPFtLfYUUat+DAKUQbcgx z>zkXyRMTKHgq33o%V4FsioNdp5EZQUKPkC&ZO*p*ncKXUS-*gzI%h0 zSqsh`sDHL=EI7>oTJe7Ew$4L-6j|x@`H|Vic_({j%4s-yj?4eW_BscBg;ltrV zF&Wm@T?2b|cg`}VR>f0iJo7;lr~9v<=ba*DilI=Y^XAR-c+~bTe7+aWu82xbK4ee~ zZbTgL5EBEiRdP`?Wk*CrB#UAJP68EW7ZDcLns@Ws7RO6NEwM>RowJ3g%zqB1$8u*i zUR`i{v~c4eO9iPT!l}Ui?sL6yq+y6kJsKccVkiK3nHp*7{yI-pfMpi*&CV%6NBies4Cp} zlg$quII!>|2L4pu2JuEzKH#POs=C_RKt=9(dfM7K09A@{{-zkPF5UbxDmpr)7wK_u z;)(G2_Z8^>EB<`*A5wP7fuNMl>KXPu-}KSEF@P?zMDn!`F9k_TxiB zxVNT)^%jrX{jcsgSlTL|>-+Fg>2T+M+g*x^a~pC>N}NMOLYmi0XJ&|ti|gTj(!)_$ z9Ks_Ya0(Ud;@(R{!z=Vg+@8~r0yvsqN=n165nDXUetayUb&`3LM^NF@;NWd1Lo<4m#Aw8Xv`eZvy zU2r%4lIf-*>U*5STt*Roq@Nnr`36d5oaQ|ot>jA?n)`o9C!f+Mumkn`*iyR25v~+c+mX#PTZm$$~nuB z%&AN}Z&1aYg{H`p3;Nr}WS+Ht>E<$;bOH&@Nz{AlK6ViPU+F~wQNflJE7nf4JfAxI zeoe;+DgfP2dIYtRX~45*hRf~L6^oR1?eYffo;&RYzO%H~BS_=gf#E2#o1uhD=#P%! 
z><4zgLEmW*AV5X#ZY3pcT+_#fz;#@==kcbOswg-sW*>iL|2d{3y9@&H+Xh|xyGl1pEyX#1(e=0R{@AqVOKin z{BqHvMY~+LXgk>gc{sKiW>NiksS!>--rnmhZf4IhcmHyA3yO#f09-saTtM`ZdLS&6+j*Yu4;A9zrT1j3?@_ zj%lCb)ZJ~`*wDbesZ(v%-6P7%+1QopZK>h6f(lI#u#R!Sp!Z%tK(zQ8`{Z<-P(E7; z#V+gM@Y%_T84rRKE7Ik*Z=YgjZvL$d*tdSMfcEZh=?gPs&p<7cplS0tG9p3?m&dUo zybyz$7&fgZQ+(OJ*yu%F2oOcT&$BKCrRE}#;bJ@P7U*0_hATijXk@%Sg_(=R)B<$E zh9z?kMW?lW-C1ePZ!6x8GA9=I>tA2M(rDAoZ@}NyGc+uGb>QOh?buvr(yMIW0eNtn zdIqn;&@s$M#$GzfKOn#v>8SEK+>%ac8hhGvp*MTtMl>`*H*bCPP6M-&4O7xSUj$;6 zhroObfMd@4_lq!0gi2p z-+j>Mzkf{W1O1*A75+hXGtjGGiN475xZEI`waGybFq+L3Um;>o9hk$Yey<%J2W%^L zCMQ3CI^v9@kB?8G=t!r6mR9hIbp;$qF0ZTzLx=t{2^_xm`-vJ%nmP0Kk<4*`ypxN3 z9aNoe-1zKZZPVL47vwpyGtXnNzy}*qh3)`NLT$_zt&5udH+6M&lQFm?T2$D_iSzaR zuw|3t5Pf*H`(Uz?0=0F(N-HD*M0|^!r!%4^=yegk09X(0hF`S@DQV@B{;Vm8lPgte>9uGLd)nh!^N?+&^x2OM?lee60%w{dyNi1sL`|oQ1sI`nxyO@Dn2<3}VpLUEuiU(u zyxMG>oHGkg1GBCX?4ETQ+2{7e#6-OG_`)_^^0A+X9z`s1UAo1gz1`@{DE7`3 zz*vRfV~!;Jh2)r{=dr+0YN>VYw}yu49ri#M=fW$->ef_qCuip=b>G2Bmo(drG-7D$ zENf&D3d}^Hj6DalkNL>OYBt;3Eie}?@d@@`vuc&x`S#owRaI=`1NP&NNP&561y_0% zAzyUCch^E64ei8N$adZQ{iDP?5c8A2A?FRZ-E@G8Y3nLPRE=yoiw9P(1;^4vwHlch zsOCE06JlW+edzG+N8%`FU9xEo8U=>sy{#3u`RWiQ&e%z2z<%dJNXTT_ zJ~?hX=7)&@B-{c6XFsso>yuP#rSz`p=kORoyJ@zzA} zCp60aclnBfkEYw6`v8bnGkgTAMbXz|!zjWeJJ1s4PDQ@Pn2&nol)O!!Y`Ys~;Fb~q zR(H$Ks|#cXu8o<#!l*PiU1(>!)HJ^V(W3oMKQdGq3FQwBs7cFuC6zF{J0PMz(Cx64L-SnLq zh5NO+@(BvQ)6b(8W^zxS99&|xflarmzM8%nh{Z4X? zIwOHTty+#F@5mtlF7rPzzpSgffAsx3i(~36=F)sHzLLkyC6;?0m&!d)&#&l%CF`Or zaF$wqxz%eGXxnWKTWJi@+c;e%=N8fLj}s{$YP)dZ0*4Ix11kI$0;Z|Ir-p2 zp7B)brz=}7Md*_rZWmEDxbK34?QzTZ`ME7egV=&KQnR4jh!AC;rP(g+zBO?Fk>Z*50dye5=VWna_07Vco zj92XFxMrS$^}XwmtF zc}M&h(D0k)EHfysTkz|@ld55TkKd5xs@J+q) zXWoi6Yo-F~Uf3sL$9hPpa<)*P+oR3v)~?;T|A}Og`CTE0XU{~>j3Oyqz~US|d%QL* z=-9?t=cM@_8cR&_ed4Eeyf(7PYoxU;^+Od()p~3z3j~OixzcbXnvQA)C|{G$*Pc$! 
z1)V&2U+=gKt5)GebF7PHeKYv7_qC~s$(*IG_wUIpXy_@ftXzzDRpW1h`@bsFUIx&@ z_2S~)8$O{KErj-Hc8#i7j()`)5&sb3TRHc-IMtu6&m7XP`pV^qzQYnI4IXc5bvVo1 zJFv7~hRyhG@b&jXgFlA@@7`5yV@hK4?MIr>YI88;iD^Tn@~v5_t&(kmceU6f_HLBl z{VF#JXojR}&a&}iR`)jbZ8u7f^zPW6e8v~}0|?}TOEh-un2IXP39e11=`vh0c1|cu zD}ATA*q_0i6Y284;Kt`s7B%MUGgDv6%%-dgq{{@C6^+$F#!?|IzB430n#ZGH{n3r- z7)|G0%I8qd@<{rLpSgL-WRBP7@j}hj349>TxPbqEPN2LvV9R1;YYW9hQ#K2<3a!58 z=HSvev3#0>)gyO{xuo};(@$J{Jvw?lvI7USXJ$I^p!R)%HtJ-9zP?3~$2KgE>Tykv z`KIp0V!njsy1mW)KJOOoI?h67kKBVUsVd@mjXf{1c28e-E{(YL#eavF!14E%!%jkH zY87>3IhkYoVzfrPf;BhV+S+zOP+AX$&(pZLFLw5D?8=@geCcE>`>U@prw1!m+b(e& zO4gfHdZ8vTsjavtQKnF`v$8lh3WyG{qPg#$1Ic}XGkBk8!+6HluD~m}`A376Q3jAd z^HE5M{Nrb+B&|M3kC(LP;+f_l+Ff;YoHmg44V&nS=&iFtb&wtO3T|utL0{?dKHow& z`z25dZD-{L$taGBGroMRbCjx>7Rv}WB2dVNZ+zDeD$mUGBEFJ^_RziN0`*mNW) zhB9>3WLqKGeBGl@BMJl;G|}jA)g{}_E!WRI>c#u0JkLBnR#W71t79?n=@(!&q|^mS znC*CzTr^(maQ%9V?ri>vFWz$7TeyMC5uv{xdM{xG&EBoCr+{0zz5%f*K#NE76hRiR zUsWB#+ZN|Odh{q}X)w>R|728Lxm&I{r@5iw8g$X=c@+mhXE}$v7z5)TP9RokJ+HR_ z2+Jmtoa$~nAHG^u+=byXe}}%b6VKRv?~LoE#|7Q{c{W@9z=4~$K6pE>!E|v)v^pw> z8_;OGlDS1fV#a&SGU+Ru^TL%cafD6#iiCrjkUV>L&(WT&uoYju-p0RP{h_K~V<|3g zhcXDG&m8v4ZE&dlF%pF$;-r?=3)@UHGqbrUSo#62ZursL`w~|>gMrd82iTW+YdB?7 z*_-Y2cSzeUd-d+`MeOA5-$AP=bXd2J!D79(l_5f&tM=P|iYi!ZpK;Qhqeo1& zGDx}N9Gq3Gi%(Uc6TU>+h7FCeS9rB6yo$PD@|OMbq37O}cT9?A>FRHjO)kad za3xb>^X9*-R$^Ljv#uXn`}Ox!RaE4q6x7;RKfJY~&}}@?7T7MCmGGXit#2B@wk;K% zyh!F=jM)^i1rD3<&k6;+SksNZ?>UUCbG~0tSU9hOFKkKqHn*{RQlCPQCp6}B>FWTsxnec5i;G!g`+mfLVlWj1Z}sRe0H<|Moib}9X8&nv zRGrN(6a8+t#k&Dd5tmKq6H?g}T_82kp*=jvF?Q5XYuPb?MbpP5)Es3TuX;-awO%>4 zp~>ngLl#^L!_a!xo*erJqx_h~QM6vhZiEd~3=Wwb0U2`^We1$sEA}IOh+eBjMLmvY zSPyJKE*EST1-kbORFAUeUoGUg9F*n*zi}voG2hmG^MrC9z%)ql{D{M$wKqv4cjmB> z+*^b$;M>{JWRu=D_z->#D=f<4dNIgb-rcrR$ZUwfvEc3puBOAtSmmW2)Wx4-O)7qp zG|16y$MFKed`jD`$`WM3XY_ZXdyO=qRrfj4xv%$daohlMpU|Q!%#KocEpH;W!|_UN znlEVI26w9*|5;fQ78Z8Ocb|J^C%XT^J6ZK_AoJrabXy^p{LRB-0r#3U5qpj+D_;i1 z?e?utbK1Cby=(1XW-L+Tf>-%9%v>@_cf!ZRTJ#d$%uN?6O=HG96u5UC@8w3aEc16i 
z-;&7|)5zBe2&fMZeEFhD=dNA1u!9}JJj_1Y4@jciGG;YoeNLPk_s&t;U0UE!IHxH- z2sNg5p$AXR5WvkJa0>|Eec!|58jj&Dm$Uhs3*P*6UIM?*IeqMC2id^Wzs3r>a~fkD zdpTss`SYSA-a8|;ZryZDc;zkcX$uDQ5aDbKvgli>>3zy)R8J1=4rhS>vFK;&C(CEYkNzW>0LeZ*$PbRWX)pKbU9Fk zN0~(d2s)R(+|1PUDO$AY4YATlL&QC(6X_obDc=6!c{%h(G9R33-hA`M{AJ5rKtc0t zTd#58z;!UbK7*bxr_XXsN)DB}VaaVppgv>!Cm_Y{NqCe+fLEUM&Z~}3$X~+DLwb%|fsXgav&+!o;@#{zvT++Y_dj=^y~+n_gty?yNodhO zwd~PT?x|ZMX)yV?VCGH-z$D&X+uyxJdaOd$K^>Q|7RlkB&9S%6ir69W>*%OIb@(tlVFH|-kzHnBu)V;1vmXE` zZ&$ly{Agk=DlUGBEV4q{3D{T_+&RlM7s6HJGsg=!6WoG2ce<4oZw^}ya<4ADN;LgC z*!$%(C(aVAWj?pJvph(30@2?A{ecrgVOg|!X@50T@;_F-!s(0d3VjI3J5)dwY5a`FlQ4B&C_VB+&e`0^ zz<>u5B+m?;1NT|ZNps$UTlD8CPWZ}c;FKq5=34#u%!D4vbsFQ@D^~u-eBRu%AVotS z$f~}~)6)~}V~;S2Y5M{;zgq8j{W?4EZKQnX9}LOx5w$_m3+p~nlA^rXDJ#n$)DJD< ztsA^n@$k&x+<0bzs6Ia=lRqw`2D3{4yku=*k75+0aCXF}>wwmf@dCEhmEW;rB^uG_ zqFe(g9_DZxDs7V-EvUU&fGRzvW3uUol#^%VH5u%c;$0?`#Bx4Cfk*nZ-~a!@s0 za0gJ2wWXy+K|w**#zvA#ty9bD(9xvLs*UD6;}R&4Q&50fGzDDz+|=X(m50IqW9sO9m=qg7%=I=^obw+{kgy%yC4L3h;$X*d9)O_YH6*+t$hP+ zte1ItmA+?R0)?YyJIps$+%0hP@R*Elb6S7(|I^;Nf9062ar`YcW6)$w5;Z0@V&1Hd zLnkHPgbre+k`cQkyoD*%j2i5aSEi$Bp;A)h5F-hr5NfJHQ&W@V*l~)=AtV#AKezVU zYxY|EPuRb(w4S{0b3gZUU*GHd{a)7<$Zj4^<~s;JL}Sz_eS~*tEkQ2zUTCRWeNY{= za!1E-X=z#6+6(93Z4~knW-@PQw&A|4EM?B)Wa}^~d0{lZT5Ly+V8@A*CaFkb$J&m; zqkykQS+eG=)W0KZu9*e_1HNb|suf?9CM6@U-i zoWM!t?%~n&n?-|0&%S!HQ9`_-hl|MvFeui;Nhr0_nk4& zEeh|9Z$+|cP)LZ(*LR%o4*=?qPcOk@hM=xlVO%Q4?f~sWWKzCU`D`brVrmpPu3||$ zb?xlNB9qUQ%a7deKb4nvS?GvruPno#E)$^-#I2Eb1Ku)> zL8DsqNEenl%5kyRWkPhK)s=FL1e4$$9~bj>?GT*Cok0wcV!~c#PR<<$@ zJzlI|tE$RT2~Us#@U=vYH%ZHpa@f;-%NIB2I%1*3{YD9=aVbLN@DU@F;qj7FQ=75A z30DZj1=M3%+-LJ{*k%ci2J2@?$*YY_&*AvD;5kKOK)+mB`5`-qYF6=Di5!^|wF%=? 
z?np74_H0m-@`7?k#=a-^F1!3A4UH%?VSA(9`+1oW;o(T=Up3*$^9Ns9T3J2V&>*@_ zDslS0MLm>Mr8zy0&62Pvk^3qZ?Vhm zC|o!y>`TAQuCXrNs-OGIq2niXWLug##_MdSoq|iUiIQ2$Ftlzh)1@i`u$~M~Gc`Ed zAegZwPdWG;+_X>xXACpa)7z}%BT1%}_~B5Y6_2#R()7xOafLNs@b8uy`f;4VgT8#t zn#%sU*Q>a>6c>X(lDCN(vsnKXE6PbQPu-g38mzCU=kSUCOe8{KkoMcVv!7)?$+nQ+ zapaKDCP3G9gNBZnEnm1WW#4&^emn~Kt*y)Ew>zrIcKR+~ZZLH4;NSa5I-0gQwaD|` zCPO?HRBXf){Hheu<;q|s8V?)Cj>Nz{SR~*U5+MV8+L~zf8;flLW zJB;NdwRP5Spp8AsggG()%$YhZKQz5Aew0=9xoqb+zgR}ECV?I08*~UD0i_t(MqF^4 zOyyO<^NvR#z%H6T9K*?*_3W_GUfcyjWs10%o|fkDaJNC>-iDkPVwI-!nq7S{=Wh6f zq$~9}26JD$DaQ%+us&~p-CwU;uY9~SIP6H$unidtJ`J_at-Wgz-QVzKHN2$i+d&Jx zyiQzv)8{O|g@MRuF=xQ(JAv#cg`+HZ1b-Xv;~TUf3Q(VRuI3_}Nw`*{Tz&17b0)RBNz|o3=jA@5Vtb*AHqH)-S*_Do5C_fjVX2Xk{tb#(GVYLX$@x6 zVkf#>QZfib;&d06VdOn(Ag$@Sdaklqh`^ko6hbt1#EB}W$-VpY@=S~RUo1Ls=uitm zAqp$l)l;m9aKvIx-@+4!Yxus1jK7)WQu=+wnU&SbsP{7&6Yg1VYhp4S6ybGp*#`_O z^7#hQ-)o*fP)b1O%hlEMA2jeXZvW*M2MXA{{U)K+0nr^z9Hbe*PEj2q#JuYa{@pW$ z{!AkT8%n8oCQjLa?=u60r?r^$A40ib9qa3;u-8PWCjz5j&ja?g8R6=DEjw z5*^Y9$voTMn?j_%n0@GwzLnKzqTJT(?D1GJ641Rvl8y!LC-$L|PB1e!pYP$J%F*%s z(cMKY!&mjhnJR#xjd*&62MbCGg2{LFYBkKUNJkiEXRVec7^r`0_X`4L;+{P*Sy_4l zMIuBBriG_cYd3Mh!DGjU;p-Z%(@A9fAU+I@w)c`D+q$+yPqy3F(^Heh*pZQuKo-a3 z24y^>EHw}(vQFJz?E%r!azo!2v^Cly{`=Hap*B6~?cD|>@KhVRwm=6j>TG9c+4Sj3 z_<(rU;uc=^^%c`@$z4A}mihQuK@lYZ!jpXVtd@OGT69l$(D~y60V8`4lDwu4#d_Ld z(k>BAS%%=ZZ#GbJA?*F2Y6tSK=U={IM+c zdh{rsd@-mTUg`Gc!1VfA;}w%+5H|pf79OVfPvES$lG3O^jNj9bDxMVzpQJFT3tYn& zMamz9LmmHlfbFt2TqHqsa1u%kql%TNxKxJFT^A8i+|clk2kYb1Qi`hD`<`M$Xl6$3 z`w|sZRa{t^dqW(CWdeaIm@4LE5$_?w+@-p@Ge!A0IHCqhMiNIw7AkC?zOKpkO-LMd z1!huIrc170=)roR?W#U?F%y~vBH<+E!Iz7sS2oUSyK>(B(rNKabJnf1v}o@D?00is z)rQ1Jk6U%Oa3s>=pn0~BPfrP%ONcn49oK%^N910-A=UO>B`^6hJ~W-X(+AW1m$yZS zQY5xaG5lL%9}pm=K|zPrEH3WYQ{UN!ws6+bR`R^KKI8{eg)lZsd{(cHBQx8_ZGvk? 
z4=6k$wX+bz+M^VTr7+u0?K=#Ojc2*LYtR!q9^|QXc&~b=0B|j7bqVwZS0TbnBH#7O z3z0!^6p5ZyqRs($qr3ZBV81P%OtST+h#)0lj;yUTcNvol>C;ReA&CUC2Bu9&6{5oL zYs!NDeS7jtT7LA0m3e#Xh=;|Kw6w7)ffyEVE9TdE_{_5{%*={G9ZhX*o(ToVZP1pz z3PV2^i3-uXiDz9-n^n(-n|iQpZn&9a0eHt2{rn+*rZLZn! z^z`&R`p40o=+yC(H+0E;B^1LSe!PmuC6_=X3Rmla1BYFR*Y+4WN~){#K3wbAkxI1U z<(c1bQi6(jZ ztpbOL&Gnx@I$S^g4btkrel~bqPuxJM)ZZ=~ELRzzG5>yTqGRO$FC19C|GhkoZu`LA&*pytaGE(| literal 0 HcmV?d00001 diff --git a/pyproject.toml b/pyproject.toml index 4012628..13aa491 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,16 @@ classifiers = [ ] dependencies = [ - "torch", + # Pinned to the exact ABI the v0.1.4 wheel was built against. ``torch`` must + # come from PyTorch's cu130 index (``--index-url + # https://download.pytorch.org/whl/cu130``); installing from PyPI yields a + # CUDA build that does not match the cu130-linked tilert binary. + "torch==2.11.0", + "transformers==4.46.3", + "tokenizers==0.20.3", + "numpy", + "scipy", + "einops", ] [project.optional-dependencies] @@ -55,18 +64,10 @@ dev = [ Homepage = "https://github.com/tile-ai/TileRT" Issues = "https://github.com/tile-ai/TileRT/issues" -[build-system] -requires = [ - "cmake>=3.25.0", # required by c++ 20 - "packaging", - "setuptools>=64.0.0", - "setuptools-scm>=8.0", - "wheel", - "pytest-runner", - "pytest", - "torch" -] -build-backend = "setuptools.build_meta" +# Note: this repository ships the public sources that match the v0.1.4 wheel. +# The wheel itself is built in the development repo (TileRT-dev/TileRT) with +# scikit-build-core; no [build-system] block is declared here on purpose so +# nobody accidentally runs ``pip wheel .`` against this presentation copy. 
[tool.black] line-length = 100 diff --git a/python/__init__.py b/python/__init__.py deleted file mode 100644 index dbda493..0000000 --- a/python/__init__.py +++ /dev/null @@ -1,62 +0,0 @@ -"""TileRT Python package.""" - -import ctypes -import logging -from pathlib import Path -from typing import Any - -import torch - -if not hasattr(torch, "ops"): - raise RuntimeError("PyTorch is required but torch.ops is not available") - -from .__version__ import __version__ - - -def init_logging() -> logging.Logger: - """Initialize logging configuration.""" - logging.basicConfig( - level=logging.DEBUG, - format="%(filename)s:%(lineno)d [%(levelname)s]: %(message)s", - ) - return logging.getLogger(__name__) - - -logger = init_logging() - - -def _load_library(filename: str) -> Any: - """Load the C++ library. - - Args: - filename: Name of the library file. - - Returns: - Any: The loaded library. - - Raises: - RuntimeError: If the library cannot be loaded. - """ - lib_path = Path(__file__).parent / filename - - try: - torch.ops.load_library(str(lib_path)) - return lib_path - except Exception as e: - raise RuntimeError(f"Failed to load library from {lib_path}") from e - - -_load_library("libtilert.so") - - -from . 
import models # noqa: E402 -from .models import deepseek_v3_2 # noqa: E402 -from .tilert_init import tilert_init # noqa: E402 - -__all__ = [ - "logger", - "tilert_init", - "models", - "deepseek_v3_2", - "__version__", -] diff --git a/python/__version__.py b/python/__version__.py deleted file mode 100644 index 08768ce..0000000 --- a/python/__version__.py +++ /dev/null @@ -1,20 +0,0 @@ -from __future__ import annotations - -from importlib.metadata import PackageNotFoundError -from importlib.metadata import version as pkg_version -from pathlib import Path - -try: - __version__ = pkg_version("tilert") -except PackageNotFoundError: - try: - from setuptools_scm import get_version - - __version__ = get_version( - root=str(Path(__file__).resolve().parents[1]), - relative_to=__file__, - ) - except Exception: - __version__ = "0.0.0" - -__all__ = ["__version__"] diff --git a/python/benchmark/long_prompt.py b/python/benchmark/long_prompt.py deleted file mode 100644 index f6d4d0e..0000000 --- a/python/benchmark/long_prompt.py +++ /dev/null @@ -1,46 +0,0 @@ -"""Long-prompt benchmark: single generation, measures long-form throughput.""" - -from typing import cast - -import numpy as np -from benchmark import BenchMode, BenchStats, CellStats, Generator, apply_mode - -PROMPT = "Hi, can you tell me a very long story, with roughly 3000 words?" - - -def run(generator: Generator, modes: list[BenchMode]) -> BenchStats: - """Run the long-prompt benchmark for each mode. - - Returns stats with column: Long. 
- """ - stats: BenchStats = {} - - for mode in modes: - apply_mode(generator, mode) - print(f"\n--- Long-prompt benchmark ({mode.label}) ---") - print(f"Prompt: {PROMPT}") - print("Completion:") - - _, time_list, accepted_counts = cast( - tuple[str, list[float], list[int]], - generator.generate(PROMPT, True, with_mtp=mode.with_mtp), - ) - - mode_stats: dict[str, CellStats] = {} - - if mode.with_mtp and accepted_counts: - total_tokens = sum(accepted_counts) - total_time = sum(time_list) - speed = total_tokens / total_time if total_time > 0 else 0 - avg_ms = total_time / len(time_list) * 1000 - avg_a = total_tokens / len(accepted_counts) - acc_rate = f"{avg_a:.2f}/{min(accepted_counts)}/{max(accepted_counts)}" - mode_stats["Long"] = CellStats(tok_s=speed, ms=avg_ms, acc_rate=acc_rate) - elif time_list: - mean_time = float(np.mean(time_list)) - speed = 1 / mean_time - mode_stats["Long"] = CellStats(tok_s=speed, ms=mean_time * 1000) - - stats[mode.label] = mode_stats - - return stats diff --git a/python/generate.py b/python/generate.py deleted file mode 100644 index 5724e8e..0000000 --- a/python/generate.py +++ /dev/null @@ -1,192 +0,0 @@ -"""Text generation script for TileRT.""" - -from argparse import ArgumentParser - -from benchmark import BenchMode -from benchmark import coding_prompt as coding_bench -from benchmark import long_prompt as long_bench -from benchmark import merge_stats, print_summary_table -from benchmark import short_prompt as short_bench - -from tilert.models.deepseek_v3_2.generator import DSAv32Generator -from tilert.models.deepseek_v3_2.model_args import ModelArgs as DSAv32ModelArgs -from tilert.models.glm_5.generator import GLM5Generator -from tilert.models.glm_5.model_args import ModelArgsGLM5 - - -def get_generator( - model_type: str, - max_new_tokens: int, - temperature: float, - model_weights_dir: str, - with_mtp: bool, - top_p: float = 0.9, - top_k: int = 256, - enable_thinking: bool = False, - sampling_seed: int = 42, -) -> DSAv32Generator 
| GLM5Generator: - """Get the appropriate generator based on model type.""" - assert model_type in ["deepseek_v3_2", "glm5"] - if model_type == "deepseek_v3_2": - model_args = DSAv32ModelArgs() - return DSAv32Generator( - model_args=model_args, - max_new_tokens=max_new_tokens, - temperature=temperature, - model_weights_dir=model_weights_dir, - with_mtp=with_mtp, - top_p=top_p, - top_k=top_k, - use_topp=top_p < 1.0, - sampling_seed=sampling_seed, - ) - model_args = ModelArgsGLM5() - return GLM5Generator( - model_args=model_args, - max_new_tokens=max_new_tokens, - temperature=temperature, - model_weights_dir=model_weights_dir, - with_mtp=with_mtp, - top_p=top_p, - top_k=top_k, - use_topp=top_p < 1.0, - enable_thinking=enable_thinking, - sampling_seed=sampling_seed, - ) - - -def parse_args(): # type: ignore - parser = ArgumentParser(description="Command-line interface for text generation.") - parser.add_argument( - "--model-weights-dir", - type=str, - required=True, - help="Path to model weights directory", - ) - parser.add_argument( - "--model", - type=str, - default="deepseek_v3_2", - choices=["deepseek_v3_2", "glm5"], - help="Model type to use (default: deepseek_v3_2)", - ) - parser.add_argument("--max-new-tokens", type=int, default=4000, help="Max tokens to generate") - parser.add_argument("--temperature", type=float, default=1.0, help="Sampling temperature") - parser.add_argument( - "--top-p", - type=float, - default=1.0, - help="Top-p (nucleus) sampling threshold. Use < 1.0 to enable top-p sampling (e.g. 
0.9)", - ) - parser.add_argument("--top-k", type=int, default=256, help="Top-k sampling threshold") - parser.add_argument("--interactive", action="store_true") - parser.add_argument( - "--with-mtp", - action="store_true", - help="Enable MTP (Multi-Token Prediction) for speculative decoding", - ) - parser.add_argument( - "--use-random-weights", - action="store_true", - help="Use random weights instead of pretrained (for testing MTP without real weights)", - ) - parser.add_argument( - "--enable-thinking", - action="store_true", - help="Enable thinking mode in chat template", - ) - parser.add_argument( - "--sampling-seed", - type=int, - default=42, - help="Sampling seed for top-p sampling (fixed per request, default: 42)", - ) - return parser.parse_args() - - -if __name__ == "__main__": - """ - usage: - execute below command under tilert root directory: - - # DeepSeek V3.2 - Standard generation with pretrained weights: - python python/generate.py --model-weights-dir "xxxx" 2>&1 | tee test.log - - # DeepSeek V3.2 - MTP generation with random weights (for testing): - python python/generate.py --model-weights-dir "xxxx" --with-mtp \ - --use-random-weights 2>&1 | tee test.log - - # DeepSeek V3.2 - MTP generation with pretrained weights (when available): - python python/generate.py --model-weights-dir "xxxx" --with-mtp 2>&1 | tee test.log - - # GLM5 - Standard generation with random weights (for testing): - python python/generate.py --model glm5 --model-weights-dir "xxxx" \ - --use-random-weights 2>&1 | tee test.log - - # GLM5 - Standard generation with pretrained weights: - python python/generate.py --model glm5 --model-weights-dir "xxxx" 2>&1 | tee test.log - - # GLM5 - MTP generation with random weights (for testing): - python python/generate.py --model glm5 --model-weights-dir "xxxx" --with-mtp \ - --use-random-weights 2>&1 | tee test.log - - # GLM5 - MTP generation with pretrained weights: - python python/generate.py --model glm5 --model-weights-dir "xxxx" --with-mtp 
\ - 2>&1 | tee test.log - """ - args = parse_args() - - generator = get_generator( - model_type=args.model, - max_new_tokens=args.max_new_tokens, - temperature=args.temperature, - model_weights_dir=args.model_weights_dir, - with_mtp=args.with_mtp if args.interactive else True, - top_p=args.top_p, - top_k=args.top_k, - enable_thinking=args.enable_thinking, - sampling_seed=args.sampling_seed, - ) - - print("Loading pretrained weights...") - generator.from_pretrained() - - # simple memoryless interactive mode - if args.interactive: - print("Welcome to the TileRT interactive mode! Type '/exit' to exit.") - while True: - prompt = input(">>> ") - if prompt == "/exit": - break - _ = generator.generate(prompt) # type: ignore[has-type] - else: - - # 3 modes: top-k1 w/o MTP, top-k1 w/ MTP, top-p0.95 w/ MTP - modes = [ - BenchMode(with_mtp=False, label="top-k1 w/o MTP"), - BenchMode(with_mtp=True, label="top-k1 w/ MTP"), - BenchMode( - with_mtp=True, - label="top-p0.95 w/ MTP", - use_topp=True, - top_p=0.95, - top_k=args.top_k, - temperature=args.temperature, - ), - ] - - # Run all benchmarks and collect stats - all_bench_stats = [ - short_bench.run(generator, modes), - coding_bench.run(generator, modes), - long_bench.run(generator, modes), - ] - - # Print unified summary table - print_summary_table( - merge_stats(all_bench_stats), - model_name=args.model.upper(), - ) - - print("Cleaning up...") - generator.cleanup() diff --git a/python/models/common.py b/python/models/common.py deleted file mode 100644 index b213793..0000000 --- a/python/models/common.py +++ /dev/null @@ -1,361 +0,0 @@ -from typing import cast - -import torch -import torch.distributed as dist -import torch.nn as nn -import torch.nn.functional as F - -__all__ = [ - "init_func", - "linear", - "Linear", - "RMSNorm", - "LayerNorm", - "ColumnParallelLinear", - "RowParallelLinear", - "ParallelEmbedding", -] - -from tilert.models.deepseek_config import ( - block_size, - gemm_impl, - get_rank, - get_world_size, - 
is_distributed, -) -from tilert.models.deepseek_v3_2.refs.kernel import act_quant, fp8_gemm, weight_dequant - - -def _get_scale_tensor(tensor: torch.Tensor) -> torch.Tensor: - """Return the dynamically attached ``scale`` tensor.""" - scale = getattr(tensor, "scale", None) - if scale is None: - raise AttributeError("Expected quantized tensor to carry a 'scale' attribute.") - return cast(torch.Tensor, scale) - - -def init_func(x_in: torch.Tensor) -> torch.Tensor: - x_dtype = x_in.dtype - x_fp32 = x_in.to(torch.float32) - if x_fp32.dim() >= 2: - initial_tensor = nn.init.kaiming_uniform_(x_fp32) - else: - initial_tensor = nn.init.uniform_(x_fp32) - return initial_tensor.to(x_dtype) - - -def linear( - x_in: torch.Tensor, - weight: torch.Tensor, - bias: torch.Tensor | None = None, - scale_fmt: str | None = None, -) -> torch.Tensor: - """ - Applies a linear transformation to the incoming data: y = xA^T + b. - - This function supports specialized implementations based on quantization - and tensor formats. - - Args: - x_in (torch.Tensor): The input tensor. - weight (torch.Tensor): The weight tensor. It may be quantized and - requires dequantization for certain cases. - bias (Optional[torch.Tensor]): The bias tensor to be added. Default is None. - - Returns: - torch.Tensor: The result of the linear transformation, which may involve - quantization-aware computations depending on the input parameters. - - Notes: - - If `weight` is quantized (e.g., `element_size() == 1`), a dequantized version is used - for computation. - - If `gemm_impl == "bf16"`, dequantization and a `bf16` GEMM operation are applied. - - For other cases, the function applies quantization to `x` and uses `fp8_gemm` - for computation. 
- """ - if weight.element_size() > 1: - return F.linear(x_in, weight, bias) - if gemm_impl == "bf16": - weight = weight_dequant(weight, _get_scale_tensor(weight)) - return F.linear(x_in, weight, bias) - - x_quant: torch.Tensor - scale: torch.Tensor - x_quant, scale = act_quant(x_in, block_size, scale_fmt) - y_out: torch.Tensor = fp8_gemm(x_quant, scale, weight, _get_scale_tensor(weight)) - if bias is not None: - y_out += bias - return y_out - - -class Linear(nn.Module): - """ - Custom linear layer with support for quantized weights and optional bias. - - Args: - in_features (int): Number of input features. - out_features (int): Number of output features. - bias (bool): Whether to include a bias term. Defaults to False. - dtype (optional): Data type for the layer. Defaults to `torch.bfloat16`. - """ - - dtype = torch.bfloat16 - scale_fmt: str | None = None - - def __init__( - self, - in_features: int, - out_features: int, - bias: bool = False, - dtype: torch.dtype | None = None, - weight: torch.Tensor | None = None, - ): - super().__init__() - self.in_features = in_features - self.out_features = out_features - - if weight is not None: - self.weight = torch.nn.Parameter(weight) - else: - self.weight = nn.Parameter( - init_func(torch.empty(out_features, in_features, dtype=dtype or Linear.dtype)) - ) - - if self.weight.element_size() == 1: - scale_out_features = (out_features + block_size - 1) // block_size - scale_in_features = (in_features + block_size - 1) // block_size - scale_param = nn.Parameter( - init_func( - torch.empty( - scale_out_features, - scale_in_features, - dtype=torch.float32, - ) - ) - ) - self.scale = scale_param - self.weight.scale = scale_param # type: ignore[attr-defined] - else: - self.register_parameter("scale", None) - - if bias: - self.bias = nn.Parameter(init_func(torch.empty(out_features))) - else: - self.register_parameter("bias", None) - - def forward(self, x_in: torch.Tensor) -> torch.Tensor: - """ - Forward pass for the custom linear 
layer. - - Args: - x (torch.Tensor): Input tensor. - - Returns: - torch.Tensor: Transformed tensor after linear computation. - """ - return linear(x_in, self.weight, self.bias, self.scale_fmt) - - -class RMSNorm(nn.Module): - """ - Root Mean Square Layer Normalization (RMSNorm). - - Args: - dim (int): Dimension of the input tensor. - eps (float): Epsilon value for numerical stability. Defaults to 1e-6. - """ - - def __init__(self, dim: int, eps: float = 1e-6, weight: torch.Tensor | None = None): - super().__init__() - self.dim = dim - self.eps = eps - - if weight is None: - self.weight = nn.Parameter(init_func(torch.empty(dim, dtype=torch.float32))) - else: - self.weight = torch.nn.Parameter(weight) - - def forward( - self, x: torch.Tensor, residual: torch.Tensor | None = None - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - """ - Forward pass for RMSNorm. - - Args: - x (torch.Tensor): Input tensor. - - Returns: - torch.Tensor: Normalized tensor with the same shape as input. - """ - dtype = torch.bfloat16 # x.dtype - if residual is None: - x = x.float() - var_s = x.pow(2).mean(-1, keepdim=True) - x = x * torch.rsqrt(var_s + self.eps) - return (self.weight * x).to(dtype) - - x = residual = x.float() + residual.float() - var_s = x.pow(2).mean(-1, keepdim=True) - x = x * torch.rsqrt(var_s + self.eps) - return (self.weight * x).to(dtype), residual.to(dtype) - - -class LayerNorm(nn.Module): - """Layer Normalization.""" - - def __init__(self, dim: int, eps: float = 1e-6): - super().__init__() - self.dim = dim - self.eps = eps - self.weight = nn.Parameter(torch.ones(dim, dtype=torch.float32)) - self.bias = nn.Parameter(torch.zeros(dim, dtype=torch.float32)) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - return F.layer_norm(x.float(), (self.dim,), self.weight, self.bias, self.eps).type_as(x) - - -class ColumnParallelLinear(Linear): - """ - Column parallel linear layer. 
- - Linear layer with column parallelism, splitting output features across - distributed processes. - - Args: - in_features (int): Number of input features. - out_features (int): Total number of output features. - bias (bool): Whether to include a bias term. Defaults to False. - dtype (optional): Data type for the layer. Defaults to `torch.bfloat16`. - """ - - def __init__( - self, - in_features: int, - out_features: int, - bias: bool = False, - dtype: torch.dtype | None = None, - ): - world_size = get_world_size() - assert ( - out_features % world_size == 0 - ), f"Output features must be divisible by world size {world_size}" - self.part_out_features = out_features // world_size - super().__init__(in_features, self.part_out_features, bias, dtype) - - def forward(self, x_in: torch.Tensor) -> torch.Tensor: - """ - Forward pass for column parallel linear layer. - - Args: - x (torch.Tensor): Input tensor. - - Returns: - torch.Tensor: Transformed tensor with column-parallel computation. - """ - return linear(x_in, self.weight, self.bias) - - -class RowParallelLinear(Linear): - """ - Linear layer with row parallelism, splitting input features across distributed processes. - - Args: - in_features (int): Total number of input features. - out_features (int): Number of output features. - bias (bool): Whether to include a bias term. Defaults to False. - dtype (optional): Data type for the layer. Defaults to `torch.bfloat16`. 
- """ - - def __init__( - self, - in_features: int, - out_features: int, - bias: bool = False, - reduce_output: bool = True, - dtype: torch.dtype | None = None, - ): - - self.world_size = get_world_size() - - if in_features % self.world_size != 0: - raise ValueError( - f"Input features must be divisible by world size (world_size={self.world_size})" - ) - - self.part_in_features = in_features // self.world_size - self.reduce_output = reduce_output - - super().__init__(self.part_in_features, out_features, bias, dtype) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """ - Forward pass for row parallel linear layer. - - Args: - x (torch.Tensor): Input tensor. - - Returns: - torch.Tensor: Transformed tensor with row-parallel computation. - """ - y = linear(x, self.weight, None, self.scale_fmt) - if self.reduce_output and is_distributed() > 1: - y = y.float() - dist.all_reduce(y) - if self.bias is not None: - y += self.bias - return y.type_as(x) - - -class ParallelEmbedding(nn.Module): - """ - Parallel embedding layer. - - Embedding layer with parallelism support across distributed processes. - - Args: - vocab_size (int): Vocabulary size. - dim (int): Embedding dimension. - """ - - def __init__(self, vocab_size: int, dim: int): - super().__init__() - self.vocab_size = vocab_size - self.dim = dim - - self.world_size = get_world_size() - self.rank = get_rank() - - assert ( - vocab_size % self.world_size == 0 - ), f"Vocabulary size must be divisible by world size {self.world_size}" - - self.part_vocab_size = vocab_size // self.world_size - self.vocab_start_idx = self.rank * self.part_vocab_size - self.vocab_end_idx = self.vocab_start_idx + self.part_vocab_size - - self.weight = nn.Parameter(init_func(torch.empty(self.part_vocab_size, self.dim))) - - def forward(self, x_in: torch.Tensor) -> torch.Tensor: - """ - Forward pass for parallel embedding layer. - - Args: - x (torch.Tensor): Input tensor containing token indices. 
- - Returns: - torch.Tensor: Embedded representations. - - Raises: - ValueError: If `world_size` is not defined. - """ - if self.world_size > 1: - mask = (x_in < self.vocab_start_idx) | (x_in >= self.vocab_end_idx) - x_in = x_in - self.vocab_start_idx - x_in[mask] = 0 - - y_out = F.embedding(x_in, self.weight) - - if is_distributed(): - y_out[mask] = 0 - dist.all_reduce(y_out) - return y_out diff --git a/python/models/deepseek_config.py b/python/models/deepseek_config.py deleted file mode 100644 index 6c7f5da..0000000 --- a/python/models/deepseek_config.py +++ /dev/null @@ -1,66 +0,0 @@ -"""Global configuration for DeepSeek models.""" - -import os -from typing import Literal - -import torch -import torch.distributed as dist - -__all__ = [ - "get_world_size", - "get_rank", - "block_size", - "gemm_impl", - "attn_impl", -] - - -def is_distributed() -> bool: - return bool(dist.is_initialized()) - - -def get_world_size() -> int: - # NOTE: default world size is 8, since tilert kernels implemented for 8 GPUs. - # DO NOT modify this value unless you know how much it affects the tilert kernels. 
- return dist.get_world_size() if dist.is_initialized() else 8 - - -def get_rank() -> int: - return dist.get_rank() if dist.is_initialized() else 0 - - -def init_distributed_training() -> tuple[int, int, bool]: - """Initialize distributed training.""" - if "LOCAL_RANK" in os.environ: - local_rank = int(os.environ["LOCAL_RANK"]) - world_rank = int(os.environ["RANK"]) - world_size = int(os.environ["WORLD_SIZE"]) - is_distributed = True - else: - local_rank = 0 - world_rank = 0 - world_size = 1 - is_distributed = False - - torch.cuda.set_device(local_rank) - torch.set_default_device(f"cuda:{local_rank}") - torch.set_default_dtype(torch.bfloat16) - - if world_size > 1: - dist.init_process_group( - backend="nccl", - world_size=world_size, - rank=world_rank, - init_method="env://", - device_id=local_rank, - ) - - rank = get_rank() - world_size = get_world_size() - - return rank, world_size, is_distributed - - -block_size = 128 -gemm_impl: Literal["bf16", "fp8"] = "bf16" -attn_impl: Literal["naive", "absorb"] = "absorb" diff --git a/python/models/deepseek_v3_2/modules/mla.py b/python/models/deepseek_v3_2/modules/mla.py deleted file mode 100644 index 6f1d138..0000000 --- a/python/models/deepseek_v3_2/modules/mla.py +++ /dev/null @@ -1,107 +0,0 @@ -import torch - -from tilert.models.base import SerializableTileRTModule -from tilert.models.deepseek_v3_2.model_args import ModelArgs -from tilert.models.deepseek_v3_2.ops.layernorm_rope_rotate import LayerNormRoPERotate -from tilert.models.deepseek_v3_2.ops.projo_wkvb import ProjoWKVb -from tilert.models.deepseek_v3_2.ops.projq_wqb import ProjqWqb -from tilert.models.deepseek_v3_2.ops.projx_wis import ProjxWis -from tilert.models.deepseek_v3_2.ops.rmsnorm_kv import KVRMSNorm -from tilert.models.deepseek_v3_2.ops.rmsnorm_projq_wqib import ( - RmsnormProjqWqib, - RmsnormProjqWqibAlgorithm, -) -from tilert.models.deepseek_v3_2.ops.rmsnorm_projx_wqkvia import ( - RMSNormProjxWqkvia, - RMSNormProjxWqkviaAlgorithm, -) -from 
tilert.models.deepseek_v3_2.ops.unproj_o_allreduce import ( - UnProjOAllReduce, - UnProjOAllReduceAlgorithm, -) - - -class Mla(SerializableTileRTModule): - """Implement the MLA operations.""" - - def __init__(self, model_args: ModelArgs, device_id: int, num_devices: int): - super().__init__(model_args=model_args, device_id=device_id, num_devices=num_devices) - - self.rmsnorm_projx_wqkvia = RMSNormProjxWqkvia( - model_args=model_args, device_id=device_id, num_devices=num_devices - ) - if model_args.arch_name == "glm_5": - self.rmsnorm_projx_wqkvia.algorithm = RMSNormProjxWqkviaAlgorithm.DECOUPLED - else: - self.rmsnorm_projx_wqkvia.algorithm = RMSNormProjxWqkviaAlgorithm.GENERAL - self.register_op(self.rmsnorm_projx_wqkvia) - - self.layernorm_rope_rotate = LayerNormRoPERotate( - model_args=model_args, device_id=device_id, num_devices=num_devices - ) - self.register_op(self.layernorm_rope_rotate) - - self.rmsnorm_projq_wqib = RmsnormProjqWqib( - model_args=model_args, device_id=device_id, num_devices=num_devices - ) - if model_args.arch_name == "glm_5": - self.rmsnorm_projq_wqib.algorithm = RmsnormProjqWqibAlgorithm.FP16MMA - else: - self.rmsnorm_projq_wqib.algorithm = RmsnormProjqWqibAlgorithm.BF16 - self.register_op(self.rmsnorm_projq_wqib) - - self.projx_wis = ProjxWis( - model_args=model_args, device_id=device_id, num_devices=num_devices - ) - self.register_op(self.projx_wis) - - self.projq_wqb = ProjqWqb( - model_args=model_args, device_id=device_id, num_devices=num_devices - ) - self.register_op(self.projq_wqb) - - self.rmsnorm_kv = KVRMSNorm( - model_args=model_args, device_id=device_id, num_devices=num_devices - ) - self.register_op(self.rmsnorm_kv) - - self.projo_wkvb = ProjoWKVb( - model_args=model_args, device_id=device_id, num_devices=num_devices - ) - self.register_op(self.projo_wkvb) - - self.unproj_o_allreduce = UnProjOAllReduce( - model_args=model_args, - device_id=device_id, - num_devices=num_devices, - algorithm=UnProjOAllReduceAlgorithm.FP8MMA, - ) 
- - if model_args.arch_name == "glm_5": - self.unproj_o_allreduce.algorithm = UnProjOAllReduceAlgorithm.FP16MMA - - self.register_op(self.unproj_o_allreduce) - - self.kv_cache: torch.Tensor | None = None - self.pe_cache: torch.Tensor | None = None - self.ki_cache: torch.Tensor | None = None - - def get_cache_vars(self) -> list[torch.Tensor]: - cache_seq_len = self.model_args.max_seq_len + self.model_args.kv_cache_pad - bs_args = (self.model_args.max_batch_size, cache_seq_len) - if self.kv_cache is None: - kv_dim = self.model_args.kv_lora_rank - self.kv_cache = torch.zeros( - *bs_args, kv_dim, dtype=torch.bfloat16, device=f"cuda:{self.device_id}" - ) - if self.pe_cache is None: - pe_dim = self.model_args.qk_rope_head_dim - self.pe_cache = torch.zeros( - *bs_args, pe_dim, dtype=torch.bfloat16, device=f"cuda:{self.device_id}" - ) - if self.ki_cache is None: - ki_dim = self.model_args.index_head_dim - self.ki_cache = torch.zeros( - *bs_args, ki_dim, dtype=torch.bfloat16, device=f"cuda:{self.device_id}" - ) - return [*super().get_cache_vars(), self.ki_cache, self.kv_cache, self.pe_cache] diff --git a/python/models/deepseek_v3_2/ops/__init__.py b/python/models/deepseek_v3_2/ops/__init__.py deleted file mode 100644 index e832905..0000000 --- a/python/models/deepseek_v3_2/ops/__init__.py +++ /dev/null @@ -1,109 +0,0 @@ -"""Core operations for deepseek v3.2.""" - -from tilert.models.deepseek_v3_2.ops.down_allreduce import ( - DownAllReduce, - down_allreduce, - down_allreduce_glm5, -) -from tilert.models.deepseek_v3_2.ops.eh_proj_allreduce import EHProjAllReduce, eh_proj_allreduce -from tilert.models.deepseek_v3_2.ops.expert_down_allreduce import ( - ExpertDownAllReduce, - expert_down_allreduce, -) -from tilert.models.deepseek_v3_2.ops.expert_sel_up_gate_silu import ( - ExpertSelectUpGateSiLU, - ExpertSelectUpGateSiLUAlgorithm, -) -from tilert.models.deepseek_v3_2.ops.expert_select import expert_select -from tilert.models.deepseek_v3_2.ops.flash_sparse_mla import 
flash_sparse_mla -from tilert.models.deepseek_v3_2.ops.layernorm_rope_rotate import layernorm_rope_rotate -from tilert.models.deepseek_v3_2.ops.projo_wkvb import projo_wkvb -from tilert.models.deepseek_v3_2.ops.projq_wqb import projq_wqb -from tilert.models.deepseek_v3_2.ops.projx_wis import projx_wis -from tilert.models.deepseek_v3_2.ops.qkv_rope import ( - QKVRoPE, - QKVRoPERefWeightsAlias, - QKVRoPETilertWeightsAlias, - qkv_rope, -) -from tilert.models.deepseek_v3_2.ops.rmsnorm_expert_proj import RMSNormExpertProj -from tilert.models.deepseek_v3_2.ops.rmsnorm_head_proj import RMSNormHeadProj -from tilert.models.deepseek_v3_2.ops.rmsnorm_kv import rmsnorm_kv -from tilert.models.deepseek_v3_2.ops.rmsnorm_projq_wqib import ( - RmsnormProjqWqib, - RmsnormProjqWqibAlgorithm, - RmsnormProjqWqibWeightsConverter, -) -from tilert.models.deepseek_v3_2.ops.rmsnorm_projx_wqkvia import ( - RMSNormProjxWqkvia, - RMSNormProjxWqkviaAlgorithm, - projx_wqkvia, - rmsnorm_projx_wqkvia, -) -from tilert.models.deepseek_v3_2.ops.rmsnorm_quant import rmsnorm_quant -from tilert.models.deepseek_v3_2.ops.rmsnorm_up_gate_silu import ( - RMSNormUpGateSiLU, - RMSNormUpGateSiLUAlgorithm, -) -from tilert.models.deepseek_v3_2.ops.rotate import ( - Rotate, - RotateRefWeightsAlias, - RotateTilertWeightsAlias, - rotate, - rotate_activation, -) -from tilert.models.deepseek_v3_2.ops.sparse_index import sparse_index, sparse_index_topk -from tilert.models.deepseek_v3_2.ops.topk import TopK, topk_accurate, topk_approximate -from tilert.models.deepseek_v3_2.ops.unproj_o_allreduce import ( - UnProjOAllReduce, - UnProjOAllReduceAlgorithm, - unproj_o_allreduce, -) -from tilert.models.deepseek_v3_2.ops.up_gate_silu import up_gate_silu - -__all__ = [ - "down_allreduce", - "down_allreduce_glm5", - "DownAllReduce", - "expert_down_allreduce", - "ExpertDownAllReduce", - "expert_select", - "up_gate_silu", - "rmsnorm_projx_wqkvia", - "projx_wqkvia", - "rmsnorm_kv", - "unproj_o_allreduce", - "projo_wkvb", - 
"projq_wqb", - "rotate", - "rotate_activation", - "Rotate", - "RotateRefWeightsAlias", - "RotateTilertWeightsAlias", - "layernorm_rope_rotate", - "TopK", - "topk_approximate", - "topk_accurate", - "sparse_index", - "sparse_index_topk", - "flash_sparse_mla", - "projx_wis", - "qkv_rope", - "QKVRoPE", - "QKVRoPERefWeightsAlias", - "QKVRoPETilertWeightsAlias", - "eh_proj_allreduce", - "rmsnorm_quant", - "RmsnormProjqWqib", - "RmsnormProjqWqibAlgorithm", - "RmsnormProjqWqibWeightsConverter", - "RMSNormExpertProj", - "RMSNormProjxWqkvia", - "RMSNormProjxWqkviaAlgorithm", - "RMSNormUpGateSiLU", - "UnProjOAllReduce", - "UnProjOAllReduceAlgorithm", - "RMSNormHeadProj", - "ExpertSelectUpGateSiLU", - "ExpertSelectUpGateSiLUAlgorithm", -] diff --git a/python/models/deepseek_v3_2/ops/expert_select.py b/python/models/deepseek_v3_2/ops/expert_select.py deleted file mode 100644 index 6a16d76..0000000 --- a/python/models/deepseek_v3_2/ops/expert_select.py +++ /dev/null @@ -1,49 +0,0 @@ -"""ExpertSelect operation module.""" - -import torch - -__all__ = [ - "expert_select", - "expert_select_one_stage", -] - - -def expert_select( - scores_in: torch.Tensor, - bias_in: torch.Tensor, - expert_probs_out: torch.Tensor, - expert_indices_out: torch.Tensor, - profile_logs: torch.Tensor, -) -> None: - """ - Expert Select operation. - - Original two-stage expert select operation used in DeepSeek V3.2. - """ - torch.ops.tilert.expert_select_op( - scores_in, - bias_in, - expert_probs_out, - expert_indices_out, - profile_logs, - ) - - -def expert_select_one_stage( - scores_in: torch.Tensor, - bias_in: torch.Tensor, - expert_probs_out: torch.Tensor, - expert_indices_out: torch.Tensor, - profile_logs: torch.Tensor, -) -> None: - """Expert Select operation. - - Modified one-stage expert select operation used in Kimi and GLM. 
- """ - torch.ops.tilert.expert_select_glm5_op( - scores_in, - bias_in, - expert_probs_out, - expert_indices_out, - profile_logs, - ) diff --git a/python/models/deepseek_v3_2/ops/head_proj.py b/python/models/deepseek_v3_2/ops/head_proj.py deleted file mode 100644 index 5ab8b77..0000000 --- a/python/models/deepseek_v3_2/ops/head_proj.py +++ /dev/null @@ -1,22 +0,0 @@ -"""HeadProj operation module.""" - -import torch - -__all__ = [ - "head_proj", -] - - -def head_proj( - hidden_in: torch.Tensor, - weight_in: torch.Tensor, - logits_out: torch.Tensor, - profile_logs: torch.Tensor, -) -> None: - """Head Projection operation.""" - torch.ops.tilert.head_proj_op( - hidden_in, - weight_in, - logits_out, - profile_logs, - ) diff --git a/python/models/deepseek_v3_2/ops/projo_wkvb.py b/python/models/deepseek_v3_2/ops/projo_wkvb.py deleted file mode 100644 index 618e5ee..0000000 --- a/python/models/deepseek_v3_2/ops/projo_wkvb.py +++ /dev/null @@ -1,283 +0,0 @@ -"""UnprojOB operation module.""" - -from dataclasses import dataclass -from enum import Enum - -import torch - -from tilert.models.base import TileRTModule, TilertWeightsConverter -from tilert.models.common import init_func, weight_dequant -from tilert.models.deepseek_v3_2.model_args import ModelArgs -from tilert.profiler.utils import parse_profile_log_tensor -from tilert.utils import get_profile_log_tensor - -__all__ = [ - "projo_wkvb", - "ProjoWKVb", - "ProjoWKVbAlgorithm", - "ProjoWKVbWeightsConverter", - "ProjoWKVbRefWeightsAlias", - "ProjoWKVbTilertWeightsAlias", -] - - -def projo_wkvb( - o_in: torch.Tensor, - wkv_b_b: torch.Tensor, - wkv_b_scales: torch.Tensor, - output: torch.Tensor, - profile_logs: torch.Tensor, -) -> None: - """ - Define the UnprojOB operation. - - Args: - o_in: Input tensor. - wkv_b_b: Weight tensor. - wkv_b_scales: Scale tensor. - output: Output tensor. - profile_logs: Profile logs tensor. 
- """ - # Choose operation based on v_head_dim (128 for deepseek_v3_2, 256 for glm5) - if output.shape[-1] == 128: - torch.ops.tilert.projo_wkvb_op(o_in, wkv_b_b, wkv_b_scales, output, profile_logs) - elif output.shape[-1] == 256: - torch.ops.tilert.proj_ob_glm5_op(o_in, wkv_b_b, wkv_b_scales, output, profile_logs) - else: - raise ValueError(f"Unsupported v_head_dim: {output.shape[-1]}") - - -class ProjoWKVbAlgorithm(Enum): - """ProjoWKVb algorithm""" - - GENERAL = "general" - - -class ProjoWKVbWeightsConverter(TilertWeightsConverter): - def __init__(self, model_args: ModelArgs, num_devices: int): - super().__init__(model_args, num_devices) - - def convert_to_general(self, weights: list[torch.Tensor]) -> tuple[torch.Tensor, torch.Tensor]: - with torch.inference_mode(): - tilert_wkv_b_weights, tilert_wkv_b_scales = weights - - # Input weights are already in the correct shape from device_sharding: - # wkv_b_weights: (n_local_heads, v_head_dim, kv_lora_rank) - # wkv_b_scales: (n_local_heads, v_head_dim // block_size, kv_lora_rank // block_size) - wkv_b_b = tilert_wkv_b_weights.contiguous() - wkv_b_b_scales = tilert_wkv_b_scales.contiguous() - if self.model_args.arch_name == "glm_5": - if wkv_b_b_scales.dtype != torch.float32: - print( - "Warning: ProjoWKVbWeightsConverter: " - + f"wkv_b_b_scales.dtype: {wkv_b_b_scales.dtype} " - + "is not float32, convert to float32." 
- ) - wkv_b_b_scales = wkv_b_b_scales.to(torch.float32) - else: # DS v3.2, use bfloat16 for wkv_b_b_scales - wkv_b_b_scales = wkv_b_b_scales.to(torch.bfloat16) - - wkv_b_b = wkv_b_b.detach() - wkv_b_b_scales = wkv_b_b_scales.detach() - - return wkv_b_b, wkv_b_b_scales - - -@dataclass -class ProjoWKVbRefWeightsAlias: - """Reference weights alias for ProjoWKVb.""" - - wkv_b_weights = "self_attn.kv_b_proj.weight" - wkv_b_scales = "self_attn.kv_b_proj.weight_scale_inv" - - @property - def ref_tensor_alias(self) -> list[str]: - return [self.wkv_b_weights, self.wkv_b_scales] - - def __call__(self) -> list[str]: - return self.ref_tensor_alias - - -@dataclass -class ProjoWKVbTilertWeightsAlias: - """TileRT weights alias for ProjoWKVb.""" - - wkv_b_weights = "wkv_b2_weights" - wkv_b_scales = "wkv_b2_scales" - - @property - def tilert_tensor_alias(self) -> list[str]: - return [self.wkv_b_weights, self.wkv_b_scales] - - def __call__(self) -> list[str]: - return self.tilert_tensor_alias - - -class ProjoWKVb(TileRTModule): - """ProjoWKVb module: O projection (wkv_b) for output.""" - - def __init__( - self, - model_args: ModelArgs, - num_devices: int, - device_id: int = 0, - ref_weights_alias: ProjoWKVbRefWeightsAlias | None = None, - ): - super().__init__( - self.__class__.__name__, - model_args=model_args, - num_devices=num_devices, - device_id=device_id, - ) - - self.tilert_weights_alias = ProjoWKVbTilertWeightsAlias() - self.ref_weights_alias = ( - ref_weights_alias if ref_weights_alias is not None else ProjoWKVbRefWeightsAlias() - ) - - self.ref_wkv_b: torch.Tensor | None = None - self.tilert_wkv_b_b: torch.Tensor | None = None - self.tilert_wkv_b_b_scales: torch.Tensor | None = None - self.output: torch.Tensor | None = None - self.profile_logs: torch.Tensor | None = None - - self.num_local_heads = self.model_args.n_heads // self.num_devices - - # lora dim and quant block size - self.wkvb_lora_rank = self.model_args.kv_lora_rank - self.wkvb_lora_rank_qsize = 
self.wkvb_lora_rank // self.model_args.block_size - - self.wkvb_head_dim = self.model_args.qk_nope_head_dim + self.model_args.v_head_dim - self.wkvb_v_head_dim = self.model_args.v_head_dim - left_head_dim = self.wkvb_head_dim % self.model_args.block_size - if left_head_dim != 0: - assert self.model_args.block_size % left_head_dim == 0 - self.head_dim_block_size = left_head_dim - self.head_dim_scale_repeat = self.model_args.block_size // self.head_dim_block_size - else: - self.head_dim_scale_repeat = 1 - self.head_dim_block_size = self.model_args.block_size - self.wkvb_head_qsize = self.wkvb_head_dim // self.head_dim_block_size - self.wkvb_v_head_qsize = self.wkvb_v_head_dim // self.head_dim_block_size - - def get_weights_list(self) -> list[torch.Tensor]: - return [self.tilert_wkv_b_b, self.tilert_wkv_b_b_scales] - - def device_sharding(self, weights_map: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]: - """ - Device sharding: split weights and scales per device. - - Args: - weights_map: Map from ref weight alias to tensor. - - Returns: - Map from tilert weight alias to (num_devices, ...) tensors. 
- """ - kv_b_proj_weight = weights_map[self.ref_weights_alias.wkv_b_weights] - kv_b_proj_weight_scale = weights_map[self.ref_weights_alias.wkv_b_scales] - - dev_heads = (self.num_devices, self.num_local_heads) - wkvb = kv_b_proj_weight.view(*dev_heads, self.wkvb_head_dim, self.wkvb_lora_rank)[ - :, :, -self.wkvb_v_head_dim : - ] - wkvb_scales = ( - kv_b_proj_weight_scale.view( - self.num_devices, - self.num_local_heads * self.wkvb_head_dim // self.model_args.block_size, - 1, - self.wkvb_lora_rank_qsize, - ) - .contiguous() - .repeat(1, 1, self.head_dim_scale_repeat, 1) - .view( - self.num_devices, - self.num_local_heads, - self.wkvb_head_qsize, - self.wkvb_lora_rank_qsize, - ) - .contiguous()[:, :, -self.wkvb_v_head_qsize :] - ) - return { - self.tilert_weights_alias.wkv_b_weights: wkvb.contiguous(), - self.tilert_weights_alias.wkv_b_scales: wkvb_scales.contiguous(), - } - - def init_reference_weights(self, state_dict: dict[str, torch.Tensor]) -> None: - sharding_size = self.num_local_heads * self.wkvb_head_dim - sharding_start = self.device_id * sharding_size - sharding_end = sharding_start + sharding_size - wkv_b = weight_dequant( - state_dict[self.ref_weights_alias.wkv_b_weights], - state_dict[self.ref_weights_alias.wkv_b_scales], - ) - wkv_b = wkv_b[sharding_start:sharding_end, :] - wkv_b = wkv_b.view(self.num_local_heads, self.wkvb_head_dim, self.wkvb_lora_rank) - self.ref_wkv_b = wkv_b[:, -self.wkvb_v_head_dim :] - - def init_tilert_weights(self, state_dict: dict[str, torch.Tensor]) -> None: - self.tilert_wkv_b_b, self.tilert_wkv_b_b_scales = ProjoWKVbWeightsConverter( - self.model_args, self.num_devices - ).dispatch( - ProjoWKVbAlgorithm.GENERAL, - [ - state_dict[self.tilert_weights_alias.wkv_b_weights], - state_dict[self.tilert_weights_alias.wkv_b_scales], - ], - ) - - def init_random_weights(self) -> None: - wkv_b = init_func( - torch.empty( - self.model_args.n_heads * self.wkvb_head_dim, - self.wkvb_lora_rank, - dtype=torch.float8_e4m3fn, - ) - ) - 
wkv_b_scales = init_func( - torch.empty( - # Block quant should be applied to the original weight dimension (including head - # dimension) - self.model_args.n_heads * self.wkvb_head_dim // self.model_args.block_size, - self.wkvb_lora_rank_qsize, - dtype=torch.float32, - ) - ) - ref_state_dict = dict( - zip( - self.ref_weights_alias(), - [wkv_b, wkv_b_scales], - ) - ) - self.init_reference_weights(ref_state_dict) - sharded = self.device_sharding(ref_state_dict) - self.init_tilert_weights({k: v[self.device_id] for k, v in sharded.items()}) - - def init_tilert_vars(self, batch_size: int, seq_len: int) -> None: - self.output = torch.zeros( - (batch_size, seq_len, self.num_local_heads, self.wkvb_v_head_dim), - dtype=torch.bfloat16, - ) - self.profile_logs = get_profile_log_tensor() - self.is_var_init = True - - def golden_forward(self, x_out: torch.Tensor) -> torch.Tensor: - assert self.ref_wkv_b is not None - return torch.einsum("bshc,hdc->bshd", x_out, self.ref_wkv_b) - - def tilert_forward(self, x_out: torch.Tensor) -> torch.Tensor: - assert self.tilert_wkv_b_b is not None - assert self.tilert_wkv_b_b_scales is not None - assert self.output is not None - assert self.profile_logs is not None - projo_wkvb( - x_out, - self.tilert_wkv_b_b, - self.tilert_wkv_b_b_scales, - self.output, - self.profile_logs, - ) - if self.flag_enable_profiling_log: - parse_profile_log_tensor( - self.profile_logs, self.get_profile_log_path(), [(self.op_name, 0.0)] - ) - return self.output diff --git a/python/models/deepseek_v3_2/ops/rmsnorm_proj_top1.py b/python/models/deepseek_v3_2/ops/rmsnorm_proj_top1.py deleted file mode 100644 index c6ec1a5..0000000 --- a/python/models/deepseek_v3_2/ops/rmsnorm_proj_top1.py +++ /dev/null @@ -1,29 +0,0 @@ -"""RMSNorm + head projection + top1 operation""" - -import torch - -__all__ = [ - "rmsnorm_proj_top1", -] - - -def rmsnorm_proj_top1( - hidden_in: torch.Tensor, - rmsnorm_gamma_in: torch.Tensor, - head_projection_weights_in: torch.Tensor, - token_id: 
torch.Tensor, - profile_logs: torch.Tensor, -) -> None: - """ - Define the RMSNormProjTop1 operation. - - Args: - hidden_in: Input tensor. - rmsnorm_gamma_in: Weight tensor. - head_projection_weights_in: Weight tensor. - token_id: Output tensor. - profile_logs: Profile logs tensor. - """ - torch.ops.tilert.rmsnorm_proj_top1_op( - hidden_in, rmsnorm_gamma_in, head_projection_weights_in, token_id, profile_logs - ) diff --git a/python/models/deepseek_v3_2/ops/rmsnorm_projq_wqib.py b/python/models/deepseek_v3_2/ops/rmsnorm_projq_wqib.py deleted file mode 100644 index 7adcad6..0000000 --- a/python/models/deepseek_v3_2/ops/rmsnorm_projq_wqib.py +++ /dev/null @@ -1,689 +0,0 @@ -"""RmsnormProjqWqib operation module.""" - -from dataclasses import dataclass -from enum import Enum - -import torch -from einops import rearrange - -from tilert.models.base import TileRTModule, TilertWeightsConverter -from tilert.models.common import weight_dequant -from tilert.models.deepseek_v3_2.model_args import ModelArgs -from tilert.models.deepseek_v3_2.ops.expert_sel_up_gate_silu import ( - ExpertSelectUpGateSiLUWeightsConverter as WeightsConverter, -) -from tilert.profiler.utils import parse_profile_log_tensor -from tilert.utils import get_profile_log_tensor - -__all__ = [ - "RmsnormProjqWqib", - "RmsnormProjqWqibAlgorithm", - "RmsnormProjqWqibWeightsConverter", -] - - -def rmsnorm_projq_wqib_op( - q: torch.Tensor, - wq_b_full: torch.Tensor, - wq_b_full_scales: torch.Tensor, - q_norm_weight: torch.Tensor, - q_nope: torch.Tensor, - q_pe: torch.Tensor, - iq: torch.Tensor, - profile_logs: torch.Tensor, - algorithm: str, -) -> None: - dim = q.shape[-1] - if dim == 1536: - impl_func = torch.ops.tilert.rmsnorm_proj_qb_iq_op - elif dim == 2048: - impl_func = torch.ops.tilert.rmsnorm_proj_qb_iq_glm5_op - else: - raise ValueError(f"Invalid dimension: {dim}") - impl_func( - q, - wq_b_full, - wq_b_full_scales, - q_norm_weight, - q_nope, - q_pe, - iq, - profile_logs, - algorithm, - ) - - -class 
RmsnormProjqWqibAlgorithm(Enum): - """RmsnormProjqWqib algorithm.""" - - BF16 = "bf16" - FP8 = "fp8" - FP16MMA = "fp16mma" - - -class RmsnormProjqWqibWeightsConverter(TilertWeightsConverter): - """Weights converter: common format to TileRT format.""" - - def __init__(self, model_args: ModelArgs, num_devices: int): - super().__init__(model_args=model_args, num_devices=num_devices) - - self.proc_groups = 8 - self.repeat = 16 - - self.block_size = self.model_args.block_size - self.n_local_heads = self.model_args.n_heads // self.num_devices - - self.q_lora_dim = self.model_args.q_lora_rank - self.q_lora_qdim = self.q_lora_dim // self.block_size - - self.qk_nope_head_dim = self.model_args.qk_nope_head_dim - self.qk_rope_head_dim = self.model_args.qk_rope_head_dim - self.qk_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim - self.qk_dim = self.qk_head_dim * self.n_local_heads - self.qk_qdim = self.qk_dim // self.block_size - - self.index_n_heads = self.model_args.index_n_heads - self.index_head_dim = self.index_n_heads * self.model_args.index_head_dim - self.index_head_qdim = self.index_head_dim // self.block_size - - def _common_to_tilert_bf16( - self, - wq_b: torch.Tensor, - wq_b_scales_raw: torch.Tensor, - wq_b_iq: torch.Tensor, - wq_b_iq_scales: torch.Tensor, - rmsnorm_gamma: torch.Tensor, - ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Convert common weights to TileRT BF16 layout.""" - wq_b = wq_b.reshape(self.n_local_heads, self.qk_head_dim, self.q_lora_dim) - wq_b_nope = wq_b[:, : self.qk_nope_head_dim, :] - wq_b_nope = wq_b_nope.reshape( - self.n_local_heads, - self.proc_groups, - self.qk_nope_head_dim // self.proc_groups, - self.q_lora_dim, - ) - wq_b_pe = wq_b[:, self.qk_nope_head_dim :, :] - wq_b_pe = wq_b_pe.reshape( - self.n_local_heads, - self.proc_groups, - self.qk_rope_head_dim // self.proc_groups, - self.q_lora_dim, - ) - wq_b = torch.cat([wq_b_nope, wq_b_pe], dim=2) - wq_b = wq_b.reshape(self.qk_dim, self.q_lora_dim) - wq_b_full = 
torch.cat([wq_b, wq_b_iq], dim=0) - - wq_b_scales_iq_raw = wq_b_iq_scales - wq_b_scales_t16 = ( - wq_b_scales_raw.reshape((self.qk_qdim, 1, self.q_lora_qdim)) - .repeat(1, self.repeat, 1) - .reshape(self.qk_qdim * self.repeat, self.q_lora_qdim) - ) - wq_b_scales_t16 = wq_b_scales_t16.reshape( - self.n_local_heads, self.qk_head_dim // self.proc_groups, self.q_lora_qdim - ) - wq_b_scales_t16_nope = wq_b_scales_t16[:, : self.qk_nope_head_dim // 8] - wq_b_scales_t16_pe = wq_b_scales_t16[:, self.qk_nope_head_dim // 8 :] - wq_b_scales_t16_nope = wq_b_scales_t16_nope.reshape( - self.n_local_heads, - self.proc_groups, - self.qk_nope_head_dim // 8 // self.proc_groups, - self.q_lora_qdim, - ) - wq_b_scales_t16_pe = wq_b_scales_t16_pe.reshape( - self.n_local_heads, - self.proc_groups, - self.qk_rope_head_dim // 8 // self.proc_groups, - self.q_lora_qdim, - ) - wq_b_scales_t16 = torch.cat([wq_b_scales_t16_nope, wq_b_scales_t16_pe], dim=2) - wq_b_scales_t16 = wq_b_scales_t16.reshape(-1, self.q_lora_qdim) - wq_b_scales_full = torch.cat([wq_b_scales_t16, wq_b_scales_iq_raw], dim=0) - - return ( - wq_b_full.detach().clone(), - wq_b_scales_full.detach().clone(), - rmsnorm_gamma.float().detach().clone(), - ) - - def _common_to_tilert_fp8( - self, - wq_b: torch.Tensor, - wq_b_scales_raw: torch.Tensor, - wq_b_iq: torch.Tensor, - wq_b_iq_scales_raw: torch.Tensor, - rmsnorm_gamma: torch.Tensor, - ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Convert common weights to TileRT FP8 MMA layout.""" - # Reshape wq_b: simple split of nope and pe, then concatenate - wq_b = wq_b.reshape(self.n_local_heads, self.qk_head_dim, self.q_lora_dim) - wq_b_nope = wq_b[:, : self.qk_nope_head_dim, :].reshape(-1, self.q_lora_dim) - wq_b_pe = wq_b[:, self.qk_nope_head_dim :, :].reshape(-1, self.q_lora_dim) - wq_b = torch.cat([wq_b_nope, wq_b_pe], dim=0) - - # Process scales: expand and split nope/pe similarly to weights - m_scale_group = self.block_size // self.repeat - wq_b_scales_t16 = ( - 
wq_b_scales_raw.reshape((self.qk_qdim, 1, self.q_lora_qdim)) - .repeat(1, self.repeat, 1) - .reshape(-1, self.qk_head_dim // m_scale_group, self.q_lora_qdim) - ) - - # Split nope and pe parts - wq_b_scales_nope = wq_b_scales_t16[:, : self.qk_nope_head_dim // m_scale_group, :].reshape( - [-1, self.q_lora_qdim] - ) - wq_b_scales_pe = wq_b_scales_t16[:, self.qk_nope_head_dim // m_scale_group :, :].reshape( - [-1, self.q_lora_qdim] - ) - wq_b_scales_t16 = torch.cat([wq_b_scales_nope, wq_b_scales_pe], dim=0) - - # Process wq_b_iq scales - wq_b_iq_scales_t16 = ( - wq_b_iq_scales_raw.reshape([self.index_head_qdim, 1, self.q_lora_qdim]) - .repeat([1, self.repeat, 1]) - .reshape((-1, self.q_lora_qdim)) - ) - - # Concatenate weights and scales - wq_b_raw = torch.cat([wq_b, wq_b_iq], dim=0) - page_k = self.q_lora_qdim - total_out_dim = self.qk_dim + self.index_head_dim - total_out_qdim = total_out_dim // self.block_size - wq_b_scales_full = ( - torch.cat( - [wq_b_scales_t16.to(torch.float32), wq_b_iq_scales_t16.to(torch.float32)], dim=0 - ) - .reshape((total_out_qdim, self.repeat, page_k, self.q_lora_qdim // page_k)) - .permute([0, 2, 1, 3]) - .contiguous() - .view(torch.float8_e4m3fn) - ) - - wq_b_raw = wq_b_raw.reshape( - [total_out_qdim, 128 // 16, 16, page_k, self.q_lora_dim // 32 // page_k, 32] - ).permute([0, 3, 1, 4, 2, 5]) - wq_b_raw = WeightsConverter._swizzle_mma_16x32(wq_b_raw) - - tilert_wq_b_full = torch.cat( - [ - wq_b_raw.reshape((total_out_qdim, page_k, -1)), - wq_b_scales_full.reshape([total_out_qdim, page_k, -1]), - ], - -1, - ).contiguous() - # TODO: use fp32 scale for glm_5 - tilert_wq_b_full_scales = torch.zeros(1, dtype=torch.bfloat16) - tilert_q_norm_weight = rmsnorm_gamma.float().detach().clone() - return tilert_wq_b_full, tilert_wq_b_full_scales, tilert_q_norm_weight - - @staticmethod - def _swizzle_mma_16x16(mat_in: torch.Tensor) -> torch.Tensor: - assert mat_in.shape[-2] == 16 and mat_in.shape[-1] == 16 - # PTX isa fig.88 - pre_shape = 
mat_in.shape[:-2] - mat_in = mat_in.reshape(*pre_shape, 2, 8, 2, 4, 2).transpose(-4, -3).transpose(-5, -4) - return mat_in.reshape(*pre_shape, 2 * 2, 8 * 4, 2).transpose(-3, -2) - - @staticmethod - def _swizzle_mma_16x16_for_16x2048_4pages(mat_in: torch.Tensor) -> torch.Tensor: - assert mat_in.shape[-2] == 16 and mat_in.shape[-1] == 2048 - pre_shape = mat_in.shape[:-2] - mat_in = mat_in.reshape(*pre_shape, 16, 4, 512).transpose(-3, -2) - mat_in = mat_in.reshape(*pre_shape, 4, 16, 32, 16).transpose(-3, -2) - mat_in = RmsnormProjqWqibWeightsConverter._swizzle_mma_16x16(mat_in) - return mat_in.contiguous() - - def _common_to_tilert_fp16mma( - self, - wq_b: torch.Tensor, - wq_b_scale: torch.Tensor, - wq_b_iq: torch.Tensor, - wq_b_iq_scale: torch.Tensor, - q_norm_weight: torch.Tensor, - ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Convert common weights to TileRT FP16 MMA layout.""" - assert self.model_args.arch_name == "glm_5", "Only GLM-5 supports FP16 MMA" - - if wq_b_scale.dtype != torch.float32: - print( - "Warning: RmsnormProjqWqibWeightsConverter: " - + f"wq_b_scale.dtype: {wq_b_scale.dtype} " - + "is not float32, convert to float32." - ) - wq_b_scale = wq_b_scale.to(torch.float32) - if wq_b_iq_scale.dtype != torch.float32: - print( - "Warning: RmsnormProjqWqibWeightsConverter: " - + f"wq_b_iq_scale.dtype: {wq_b_iq_scale.dtype} " - + "is not float32, convert to float32." 
- ) - wq_b_iq_scale = wq_b_iq_scale.to(torch.float32) - - sms = 128 # use 128 sms for glm_5 - pages = 4 - qk_dim = self.qk_head_dim * self.n_local_heads - qk_dim_per_sm = qk_dim // sms # 16 per sm - qk_nope_dim = self.n_local_heads * self.qk_nope_head_dim - qk_pe_dim = self.n_local_heads * self.qk_rope_head_dim - iq_dim_per_sm = self.index_head_dim // sms # 32 per sm - - wq_b_scale = wq_b_scale.reshape( - self.n_local_heads, self.qk_head_dim // self.block_size, 1, self.q_lora_qdim - ).repeat( - 1, 1, self.block_size, 1 - ) # 2048, 2048//128 - - wq_b_scale = wq_b_scale.reshape(self.n_local_heads, self.qk_head_dim, -1) - wq_b_nope_scale = ( - wq_b_scale[:, : self.qk_nope_head_dim, :] - .reshape(qk_nope_dim // qk_dim_per_sm, qk_dim_per_sm, pages, self.q_lora_qdim // pages) - .transpose(1, 2) # (96, 4, 16, 4) for glm_5 - ) - - wq_b_pe_scale = ( - wq_b_scale[:, self.qk_nope_head_dim :, :] - .reshape(qk_pe_dim // qk_dim_per_sm, qk_dim_per_sm, pages, self.q_lora_qdim // pages) - .transpose(1, 2) # (32, 4, 16, 4) for glm_5 - ) - wq_b_scale = torch.cat([wq_b_nope_scale, wq_b_pe_scale], dim=0) - wq_b_scale = wq_b_scale[:, :, 0, :] # (128, 4, 4) for glm_5 - - wq_b_iq_scale = wq_b_iq_scale.reshape(self.index_head_qdim, 1, self.q_lora_qdim).repeat( - 1, self.block_size, 1 - ) # (4096, 16) for glm_5 - wq_b_iq_scale = wq_b_iq_scale.reshape( - sms, iq_dim_per_sm, pages, self.q_lora_qdim // pages - ).transpose(1, 2) - wq_b_iq_scale = wq_b_iq_scale[:, :, 0, :] # (128, 4, 4) for glm_5 - - wq_b_full_scales = ( - torch.cat([wq_b_scale, wq_b_iq_scale], dim=-1).contiguous().view(torch.float8_e4m3fn) - ) # (128, 4, 8x4) for glm_5 - - wq_b = wq_b.reshape(self.n_local_heads, self.qk_head_dim, self.q_lora_dim) - wq_b_nope = wq_b[:, : self.qk_nope_head_dim, :].reshape(-1, self.q_lora_dim) # 8x192, 2048 - wq_b_nope = RmsnormProjqWqibWeightsConverter._swizzle_mma_16x16_for_16x2048_4pages( - wq_b_nope.reshape(qk_nope_dim // qk_dim_per_sm, qk_dim_per_sm, self.q_lora_dim) - ) - wq_b_nope = 
wq_b_nope.reshape(qk_nope_dim // qk_dim_per_sm, pages, qk_dim_per_sm, -1) - # (96, 4, 16, 512) for glm_5 - - wq_b_pe = wq_b[:, self.qk_nope_head_dim :, :].reshape(-1, self.q_lora_dim) # 8x64, 2048 - wq_b_pe = RmsnormProjqWqibWeightsConverter._swizzle_mma_16x16_for_16x2048_4pages( - wq_b_pe.reshape(qk_pe_dim // qk_dim_per_sm, qk_dim_per_sm, self.q_lora_dim) - ) - wq_b_pe = wq_b_pe.reshape(qk_pe_dim // qk_dim_per_sm, pages, qk_dim_per_sm, -1) - # (32, 4, 16, 512) for glm_5 - wq_b = torch.cat([wq_b_nope, wq_b_pe], dim=0) - # (128, 4, 16, 512) for glm_5 - - wq_b_iq = RmsnormProjqWqibWeightsConverter._swizzle_mma_16x16_for_16x2048_4pages( - wq_b_iq.reshape(sms, 2, iq_dim_per_sm // 2, self.q_lora_dim) - ) - wq_b_iq = ( - wq_b_iq.reshape(sms, 2, pages, iq_dim_per_sm // 2, -1) - .transpose(1, 2) - .reshape(sms, pages, iq_dim_per_sm, -1) - ) - # (128, 4, 32, 512) for glm_5 - wq_b = torch.cat([wq_b, wq_b_iq], dim=2) - wq_b = wq_b.reshape(sms, pages, -1) - # (128, 4, 48*512) for glm_5 - wq_b_scales_padding = torch.zeros( - sms, - pages, - 128 - wq_b_full_scales.shape[-1], - dtype=torch.float8_e4m3fn, - device=wq_b.device, - ) # append 128-byte aligned scale: (128, 4, 24704) for glm_5 - tilert_wq_b_full = torch.cat( - [wq_b, wq_b_full_scales, wq_b_scales_padding], dim=-1 - ).contiguous() - tilert_wq_b_dummy_scales = torch.zeros(1, dtype=torch.bfloat16) - tilert_q_norm_weight = q_norm_weight.float().detach().clone() - return tilert_wq_b_full, tilert_wq_b_dummy_scales, tilert_q_norm_weight - - def convert_to_bf16( - self, weights: list[torch.Tensor] - ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Convert common-format weights to TileRT BF16 layout. - - Args: - weights: [q_norm_weight, wq_b, wq_b_scale, wq_b_iq, wq_b_iq_scale]. 
- """ - with torch.inference_mode(): - wq_b, wq_b_scale, wq_b_iq, wq_b_iq_scale, q_norm_weight = weights - if self.model_args.arch_name == "glm_5": - if wq_b_scale.dtype != torch.float32: - print( - "Warning: RmsnormProjqWqibWeightsConverter: " - + f"wq_b_scale.dtype: {wq_b_scale.dtype} " - + "is not float32, convert to float32." - ) - wq_b_scales = wq_b_scale.to(torch.float32) - wq_b_iq_scales = wq_b_iq_scale.to(torch.float32) - return self._common_to_tilert_bf16( - wq_b, - wq_b_scales, - wq_b_iq, - wq_b_iq_scales, - q_norm_weight, - ) - - # DS v3.2, use bfloat16 for wq_b_scale and wq_b_iq_scale - wq_b_scales_bf16 = wq_b_scale.to(torch.bfloat16) - wq_b_iq_scales_bf16 = wq_b_iq_scale.to(torch.bfloat16) - return self._common_to_tilert_bf16( - wq_b, - wq_b_scales_bf16, - wq_b_iq, - wq_b_iq_scales_bf16, - q_norm_weight, - ) - - def convert_to_fp8( - self, weights: list[torch.Tensor] - ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Convert common-format weights to TileRT FP8 MMA layout. - - Args: - weights: [q_norm_weight, wq_b, wq_b_scale, wq_b_iq, wq_b_iq_scale]. - """ - with torch.inference_mode(): - wq_b, wq_b_scale, wq_b_iq, wq_b_iq_scale, q_norm_weight = weights - return self._common_to_tilert_fp8( - wq_b, - wq_b_scale, - wq_b_iq, - wq_b_iq_scale, - q_norm_weight, - ) - - def convert_to_fp16mma( - self, weights: list[torch.Tensor] - ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Convert common-format weights to TileRT FP16 MMA layout. - - Args: - weights: [q_norm_weight, wq_b, wq_b_scale, wq_b_iq, wq_b_iq_scale]. 
- """ - with torch.inference_mode(): - wq_b, wq_b_scale, wq_b_iq, wq_b_iq_scale, q_norm_weight = weights - return self._common_to_tilert_fp16mma( - wq_b, - wq_b_scale, - wq_b_iq, - wq_b_iq_scale, - q_norm_weight, - ) - - -@dataclass -class RmsnormProjqWqibRefWeightsAlias: - """Reference weights alias for RmsnormProjqWqib.""" - - rmsnorm_gamma = "self_attn.q_a_layernorm.weight" - wqb_weights = "self_attn.q_b_proj.weight" - wqb_scales = "self_attn.q_b_proj.weight_scale_inv" - wi_weights = "self_attn.indexer.wq_b.weight" - wi_scales = "self_attn.indexer.wq_b.weight_scale_inv" - - @property - def ref_tensor_alias(self) -> list[str]: - return [ - self.rmsnorm_gamma, - self.wqb_weights, - self.wqb_scales, - self.wi_weights, - self.wi_scales, - ] - - def __call__(self) -> list[str]: - return self.ref_tensor_alias - - -@dataclass -class RmsnormProjqWqibTilertWeightsAlias: - """TileRT weights alias for RmsnormProjqWqib.""" - - rmsnorm_gamma = "q_rmsnorm_gamma" - wqb_weights = "wqb_weights" - wqb_scales = "wqb_scales" - wi_weights = "wi_weights" - wi_scales = "wi_scales" - - @property - def tilert_tensor_alias(self) -> list[str]: - return [ - self.rmsnorm_gamma, - self.wqb_weights, - self.wqb_scales, - self.wi_weights, - self.wi_scales, - ] - - def __call__(self) -> list[str]: - return self.tilert_tensor_alias - - -class RmsnormProjqWqib(TileRTModule): - """RmsnormProjqWqib module: RMSNorm + Q projection (wq_b + wq_b_iq).""" - - def __init__( - self, - model_args: ModelArgs, - device_id: int, - num_devices: int, - ref_weights_alias: RmsnormProjqWqibRefWeightsAlias | None = None, - ): - super().__init__( - self.__class__.__name__, - model_args=model_args, - device_id=device_id, - num_devices=num_devices, - ) - - self.tilert_weights_alias = RmsnormProjqWqibTilertWeightsAlias() - self.ref_weights_alias = ( - ref_weights_alias - if ref_weights_alias is not None - else RmsnormProjqWqibRefWeightsAlias() - ) - - self.n_local_heads = model_args.n_heads // num_devices - 
self.q_lora_rank = model_args.q_lora_rank - self.index_n_heads = model_args.index_n_heads - self.head_dim = model_args.index_head_dim - self.index_head_dim = model_args.index_n_heads * model_args.index_head_dim - self.n_heads = model_args.n_heads - self.qk_head_dim = model_args.qk_nope_head_dim + model_args.qk_rope_head_dim - self.qk_local_dim = self.qk_head_dim * self.n_local_heads - self.qk_nope_head_dim = model_args.qk_nope_head_dim - self.qk_rope_head_dim = model_args.qk_rope_head_dim - - # quantize block size - self.block_size = model_args.block_size - self.q_lora_qdim = self.q_lora_rank // self.block_size - self.qk_local_qdim = self.qk_local_dim // self.block_size - self.index_head_qdim = self.index_head_dim // self.block_size - self.eps = model_args.eps - - self.ref_q_norm: torch.Tensor | None = None - self.ref_wq_b: torch.Tensor | None = None - self.ref_wq_b_iq: torch.Tensor | None = None - - self.tilert_wq_b_full: torch.Tensor | None = None - self.tilert_wq_b_full_scales: torch.Tensor | None = None - self.tilert_q_norm_weight: torch.Tensor | None = None - - self.q_nope: torch.Tensor | None = None - self.q_pe: torch.Tensor | None = None - self.iq: torch.Tensor | None = None - - self.profile_logs: torch.Tensor | None = None - - def get_weights_list(self) -> list[torch.Tensor]: - return [self.tilert_q_norm_weight, self.tilert_wq_b_full, self.tilert_wq_b_full_scales] - - def device_sharding(self, weights_map: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]: - """Device sharding.""" - gamma = weights_map[self.ref_weights_alias.rmsnorm_gamma][None, ...].repeat( - self.num_devices, 1 - ) - - sharded_wqb_weights = weights_map[self.ref_weights_alias.wqb_weights].reshape( - self.num_devices, self.qk_local_dim, self.q_lora_rank - ) - sharded_wi_weights = weights_map[self.ref_weights_alias.wi_weights][None, ...].repeat( - self.num_devices, 1, 1 - ) - - sharded_wqb_scales = weights_map[self.ref_weights_alias.wqb_scales].reshape( - self.num_devices, 
self.qk_local_qdim, self.q_lora_qdim - ) - sharded_wi_scales = weights_map[self.ref_weights_alias.wi_scales][None, ...].repeat( - self.num_devices, 1, 1 - ) - - return { - self.tilert_weights_alias.rmsnorm_gamma: gamma, - self.tilert_weights_alias.wqb_weights: sharded_wqb_weights, - self.tilert_weights_alias.wqb_scales: sharded_wqb_scales, - self.tilert_weights_alias.wi_weights: sharded_wi_weights, - self.tilert_weights_alias.wi_scales: sharded_wi_scales, - } - - def init_reference_weights(self, state_dict: dict[str, torch.Tensor]) -> None: - """Initialize reference weights from common-format state dict.""" - self.ref_q_norm = state_dict[self.ref_weights_alias.rmsnorm_gamma] - qk_local_dim_start = self.qk_local_dim * self.device_id - qk_local_qdim_start = qk_local_dim_start // self.block_size - qk_local_dim_end = qk_local_dim_start + self.qk_local_dim - qk_local_qdim_end = qk_local_dim_end // self.block_size - wq_b = weight_dequant( - state_dict[self.ref_weights_alias.wqb_weights][qk_local_dim_start:qk_local_dim_end], - state_dict[self.ref_weights_alias.wqb_scales][qk_local_qdim_start:qk_local_qdim_end], - ) - wq_b_iq = weight_dequant( - state_dict[self.ref_weights_alias.wi_weights], - state_dict[self.ref_weights_alias.wi_scales], - ) - self.ref_wq_b = wq_b.contiguous() - self.ref_wq_b_iq = wq_b_iq.contiguous() - - def init_tilert_weights(self, state_dict: dict[str, torch.Tensor]) -> None: - """Initialize TileRT weights from common-format state dict.""" - weights = [ - state_dict[_k] - for _k in [ - self.tilert_weights_alias.wqb_weights, - self.tilert_weights_alias.wqb_scales, - self.tilert_weights_alias.wi_weights, - self.tilert_weights_alias.wi_scales, - self.tilert_weights_alias.rmsnorm_gamma, - ] - ] - assert self.algorithm is not None, "Algorithm is not set" - self.tilert_wq_b_full, self.tilert_wq_b_full_scales, self.tilert_q_norm_weight = ( - RmsnormProjqWqibWeightsConverter(self.model_args, self.num_devices).dispatch( - self.algorithm, weights - ) - ) - - 
def init_random_weights(self) -> None: - """Initialize random reference and TileRT weights for testing.""" - q_norm = torch.randn(self.q_lora_rank, dtype=torch.float32) - wq_b = torch.randn( - self.num_devices * self.qk_local_dim, self.q_lora_rank, dtype=torch.bfloat16 - ).to(torch.float8_e4m3fn) - scale_dtype = torch.float32 if self.model_args.arch_name == "glm_5" else torch.bfloat16 - wq_b_scale = torch.randn( - self.num_devices * self.qk_local_qdim, self.q_lora_qdim, dtype=scale_dtype - ) - wq_b_iq = torch.randn(self.index_head_dim, self.q_lora_rank, dtype=torch.bfloat16).to( - torch.float8_e4m3fn - ) - wq_b_iq_scale = torch.randn(self.index_head_qdim, self.q_lora_qdim, dtype=scale_dtype) - ref_state = { - self.ref_weights_alias.rmsnorm_gamma: q_norm, - self.ref_weights_alias.wqb_weights: wq_b, - self.ref_weights_alias.wqb_scales: wq_b_scale, - self.ref_weights_alias.wi_weights: wq_b_iq, - self.ref_weights_alias.wi_scales: wq_b_iq_scale, - } - - self.init_reference_weights(ref_state) - self.init_tilert_weights( - {_k: _v[self.device_id] for _k, _v in self.device_sharding(ref_state).items()} - ) - - def init_tilert_vars(self, batch_size: int, seq_len: int) -> None: - """Allocate TileRT output buffers.""" - self.q_nope = torch.zeros( - batch_size, seq_len, self.n_local_heads, self.qk_nope_head_dim, dtype=torch.bfloat16 - ) - self.q_pe = torch.zeros( - batch_size, seq_len, self.n_local_heads, self.qk_rope_head_dim, dtype=torch.bfloat16 - ) - self.iq = torch.zeros( - batch_size, seq_len, self.index_n_heads, self.head_dim, dtype=torch.bfloat16 - ) - self.profile_logs = get_profile_log_tensor() - self.is_var_init = True - - def golden_forward(self, q: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Reference forward: RMSNorm + linear projections.""" - assert self.ref_q_norm is not None - assert self.ref_wq_b is not None - assert self.ref_wq_b_iq is not None - - bsz, seqlen, _ = q.shape - if bsz != 1 or seqlen not in [1, 2, 4]: - raise 
ValueError(f"Invalid batch size or sequence length: bsz={bsz}, seqlen={seqlen}") - - qr = torch.nn.functional.rms_norm(q.float(), [q.size(-1)], self.ref_q_norm, self.eps).to( - q.dtype - ) - - q = torch.matmul(qr, self.ref_wq_b.T) - q = q.view(bsz, seqlen, self.n_local_heads, self.qk_head_dim) - q_nope, q_pe = torch.split(q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1) - q_idx = torch.matmul(qr, self.ref_wq_b_iq.T) - q_idx = rearrange(q_idx, "b s (h d) -> b s h d", d=self.head_dim) - return q_nope, q_pe, q_idx - - def tilert_forward(self, q: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - assert self.tilert_wq_b_full is not None - assert self.tilert_wq_b_full_scales is not None - assert self.tilert_q_norm_weight is not None - assert self.q_nope is not None - assert self.q_pe is not None - assert self.iq is not None - assert self.profile_logs is not None - - bsz, seqlen, _ = q.shape - if bsz != 1 or seqlen not in [1, 2, 4]: - raise ValueError(f"Invalid batch size or sequence length: bsz={bsz}, seqlen={seqlen}") - - assert self.algorithm is not None, "Algorithm is not set" - - rmsnorm_projq_wqib_op( - q, - self.tilert_wq_b_full, - self.tilert_wq_b_full_scales, - self.tilert_q_norm_weight, - self.q_nope, - self.q_pe, - self.iq, - self.profile_logs, - self.algorithm.value, - ) - - if self.flag_enable_profiling_log: - torch.cuda.synchronize() - parse_profile_log_tensor( - self.profile_logs, self.get_profile_log_path(), [(self.op_name, 0.0)] - ) - return self.q_nope, self.q_pe, self.iq diff --git a/python/models/deepseek_v3_2/ops/rmsnorm_projx_wqkvia.py b/python/models/deepseek_v3_2/ops/rmsnorm_projx_wqkvia.py deleted file mode 100644 index d6538ed..0000000 --- a/python/models/deepseek_v3_2/ops/rmsnorm_projx_wqkvia.py +++ /dev/null @@ -1,1095 +0,0 @@ -"""RMSNormProjxWqkvia operation module.""" - -from collections.abc import Callable -from dataclasses import dataclass -from enum import Enum - -# from typing import Any -import torch - -from 
tilert.models.base import TileRTModule, TilertWeightsConverter -from tilert.models.common import weight_dequant -from tilert.models.deepseek_v3_2.model_args import ModelArgs -from tilert.models.deepseek_v3_2.ops.rmsnorm_quant import rmsnorm_quant -from tilert.profiler.utils import parse_profile_log_tensor -from tilert.utils import get_profile_log_tensor - -__all__ = [ - "RMSNormProjQAKVAKIWeightsConverter", - "RMSNormProjxWqkviaAlgorithm", - "RMSNormProjxWqkvia", - "RMSNormProjxWqkviaRefWeightsAlias", - "RMSNormProjxWqkviaTilertWeightsAlias", - "rmsnorm_projx_wqkvia", - "projx_wqkvia", -] - - -def rmsnorm_projx_wqkvia( - x_in: torch.Tensor, - wqkv_a: torch.Tensor, - wqkv_a_scales: torch.Tensor, - rmsnorm_gamma: torch.Tensor, - cur_pos: torch.Tensor, - q_out: torch.Tensor, - kv_out: torch.Tensor, - pe_cache: torch.Tensor, - ki_out: torch.Tensor, - x_rmsnorm_out: torch.Tensor, - profile_logs: torch.Tensor, -) -> None: - """ - rmsnorm_projx_wqkvia operation. - - Args: - x_in: Input tensor. - wqkv_a: QKV weights. - wqkv_a_scales: QKV scales. - rmsnorm_gamma: RMSNorm gamma. - cur_pos: Current position. - q_out: Q output tensor. - kv_out: KV output tensor. - pe_cache: PE cache tensor. - ki_out: Ki output tensor. - x_rmsnorm_out: RMSNorm output tensor. - profile_logs: Profile logs tensor. - """ - torch.ops.tilert.rmsnorm_proj_qa_kva_ki_op( - x_in, - wqkv_a, - wqkv_a_scales, - rmsnorm_gamma, - cur_pos, - q_out, - kv_out, - pe_cache, - ki_out, - x_rmsnorm_out, - profile_logs, - ) - - -def projx_wqkvia( - x_quant: torch.Tensor, - x_scale: torch.Tensor, - wqkvia: torch.Tensor, - cur_pos: torch.Tensor, - out_q: torch.Tensor, - out_kv: torch.Tensor, - pe_cache: torch.Tensor, - out_ki: torch.Tensor, - profile_logs: torch.Tensor, -) -> None: - """ - Define the ProjXWQKVIa operation. - - Args: - x_quant: Input tensor. - x_scale: Weight tensor. - wqkvia: Weight tensor. - cur_pos: Current position tensor. - out_q: Output tensor. - out_kv: Output tensor. - pe_cache: Output tensor. 
- out_ki: Output tensor. - profile_logs: Profile logs tensor. - """ - dim = x_quant.shape[-1] - if dim == 6144: - func_call = torch.ops.tilert.projx_wqkvia_glm5 - elif dim == 7168: - func_call = torch.ops.tilert.projx_wqkvia_op - else: - raise ValueError(f"Unsupported dimension: {dim}") - func_call(x_quant, x_scale, wqkvia, cur_pos, out_q, out_kv, pe_cache, out_ki, profile_logs) - - -class RMSNormProjxWqkviaAlgorithm(Enum): - """RMSNormProjxWqkvia algorithm""" - - GENERAL = "general" # fused - DECOUPLED = "decoupled" # rmsnorm_quant + projx_wqkvia - - -class RMSNormProjQAKVAKIWeightsConverter: - """Weights converter class.""" - - @staticmethod - def _swizzle_mma_16x32(mat_in: torch.Tensor) -> torch.Tensor: - assert mat_in.shape[-2] == 16 and mat_in.shape[-1] == 32 - # PTX isa fig.88 - pre_shape = mat_in.shape[:-2] - mat_in = mat_in.reshape(*pre_shape, 2, 8, 2, 4, 4).transpose(-4, -3).transpose(-5, -4) - return mat_in.reshape(*pre_shape, 2 * 2, 8 * 4, 4).transpose(-3, -2) - - @staticmethod - def tilert_to_common( - tilert_wqkv_a: torch.Tensor, - tilert_wqkv_a_scales: torch.Tensor, - tilert_attn_norm_weight: torch.Tensor, - ) -> tuple[ - torch.Tensor, - torch.Tensor, - torch.Tensor, - torch.Tensor, - torch.Tensor, - torch.Tensor, - torch.Tensor, - ]: - """ - Convert tilert weights to common weights. - - Args: - tilert_wqkv_a: Tilert weight tensor. - tilert_wqkv_a_scales: Tilert weight scale tensor. - tilert_attn_norm_weight: Tilert attention norm weight tensor. - Returns: - tuple: Common weights. 
- """ - wq_a = tilert_wqkv_a[:1536] # 1536, 7168 - wkv_a = tilert_wqkv_a[1536 : 1536 + 576] # 576, 7168 - wk = tilert_wqkv_a[1536 + 576 :] # 128, 7168 - - wqkv_a_scales_0 = tilert_wqkv_a_scales[:128, :].reshape(16, 8, 64) - wqkv_a_scales_0 = wqkv_a_scales_0[:, 0, :].reshape(16, 64) - wqkv_a_scales_1 = tilert_wqkv_a_scales[128:129, :] # 1, 64 - wqkv_a_scales_2 = tilert_wqkv_a_scales[129:, :] # 1, 64 - wqkv_a_scales_swizzled = torch.cat( - [wqkv_a_scales_0, wqkv_a_scales_1, wqkv_a_scales_2], dim=0 - ) - wqkv_scales = torch.zeros( - (18, 56), dtype=torch.bfloat16, device=tilert_wqkv_a_scales.device - ) - - for i in range(64): - if ((i % 8) * 8 + i // 8) < 56: - wqkv_scales[:, ((i % 8) * 8 + i // 8)] = wqkv_a_scales_swizzled[:, i] - wq_a_scale = wqkv_scales[:12, :] # 12, 56 - wkv_a_scale = wqkv_scales[12:17, :] # 5, 56 - wk_scale = wqkv_scales[17:, :] # 1, 56 - - attn_norm_weight = tilert_attn_norm_weight - return wq_a, wq_a_scale, wkv_a, wkv_a_scale, wk, wk_scale, attn_norm_weight - - @staticmethod - def common_to_tilert( - wq_a: torch.Tensor, - wq_a_scale: torch.Tensor, - wkv_a: torch.Tensor, - wkv_a_scale: torch.Tensor, - wk: torch.Tensor, - wk_scale: torch.Tensor, - attn_norm_weight: torch.Tensor, - ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ - Convert common weights to tilert weights. - - Args: - wq_a: Common weight tensor. - wq_a_scale: Common weight scale tensor. - wkv_a: Common weight tensor. - wkv_a_scale: Common weight scale tensor. - wk: Common weight tensor. - wk_scale: Common weight scale tensor. - attn_norm_weight: Common attention norm weight tensor. - Returns: - tuple: Tilert weights. 
- """ - wqkv_a = torch.cat([wq_a, wkv_a, wk], dim=0) - wqkv_a_scales_raw = torch.cat([wq_a_scale, wkv_a_scale, wk_scale], dim=0) - - wqkv_a_scales = torch.zeros((18, 64), dtype=torch.bfloat16, device=wq_a_scale.device) - for i in range(64): - wqkv_a_scales[:, i] = wqkv_a_scales_raw[:, ((i % 8) * 8 + i // 8) % 56] - if ((i % 8) * 8 + i // 8) >= 56: - wqkv_a_scales[:, i] = 0.0 - wqkv_a_scales_0 = wqkv_a_scales[:16, :] - wqkv_a_scales_1 = wqkv_a_scales[16:17, :] - wqkv_a_scales_2 = wqkv_a_scales[17:, :] - - wqkv_a_scales_0 = wqkv_a_scales_0.reshape((16, 1, 64)).repeat(1, 8, 1).reshape(-1, 64) - wqkv_a_scales = torch.cat([wqkv_a_scales_0, wqkv_a_scales_1, wqkv_a_scales_2], dim=0) - assert wqkv_a_scales.shape == (130, 64) - return wqkv_a.contiguous(), wqkv_a_scales.contiguous(), attn_norm_weight.clone() - - @staticmethod - def common_to_tilert_fp8( - wq_a: torch.Tensor, - wq_a_scale: torch.Tensor, - wkv_a: torch.Tensor, - wkv_a_scale: torch.Tensor, - wk: torch.Tensor, - wk_scale: torch.Tensor, - attn_norm_weight: torch.Tensor, - ) -> tuple[torch.Tensor, torch.Tensor]: - """ - Convert common weights to tilert weights. - - Args: - wq_a: Common weight tensor. - wq_a_scale: Common weight scale tensor. - wkv_a: Common weight tensor. - wkv_a_scale: Common weight scale tensor. - wk: Common weight tensor. - wk_scale: Common weight scale tensor. - attn_norm_weight: Common attention norm weight tensor. - Returns: - tuple: Tilert fp8 weights. 
- """ - wq_a_raw: torch.Tensor = wq_a.detach().clone() - wkv_a_raw: torch.Tensor = wkv_a.detach().clone() - wq_a_raw = torch.cat([wq_a_raw, wkv_a_raw[:512], wk, wkv_a_raw[512:]], dim=0) - - wq_a_raw = wq_a_raw.reshape(35, 64, 14, 512) - wq_a_raw = wq_a_raw.permute(0, 2, 1, 3) - - wq_a_raw = wq_a_raw.reshape(35, 14, 16, 4, 4, 128) - wq_a_copy = wq_a_raw.contiguous().clone() - wq_a_raw[:, :, 1::2, :, :, :64] = wq_a_copy[:, :, 1::2, :, :, 64:] - wq_a_raw[:, :, 1::2, :, :, 64:] = wq_a_copy[:, :, 1::2, :, :, :64] - wq_a_raw = wq_a_raw.reshape(35, 14, 16, 4, 4, 2, 64) - wq_a_copy = wq_a_raw.contiguous().clone() - wq_a_raw[:, :, :, 2:, :, :, :32] = wq_a_copy[:, :, :, 2:, :, :, 32:] - wq_a_raw[:, :, :, 2:, :, :, 32:] = wq_a_copy[:, :, :, 2:, :, :, :32] - wq_a_raw = wq_a_raw.reshape(35, 14, 16, 4, 4, 2, 2, 32) - wq_a_copy = wq_a_raw.contiguous().clone() - wq_a_raw[:, :, :, 1::2, :, :, :, :16] = wq_a_copy[:, :, :, 1::2, :, :, :, 16:] - wq_a_raw[:, :, :, 1::2, :, :, :, 16:] = wq_a_copy[:, :, :, 1::2, :, :, :, :16] - - wq_a_raw = wq_a_raw.reshape(35, 14, 16, 4, 4, 128) - wq_a_raw = wq_a_raw.permute(0, 1, 4, 2, 3, 5).reshape(35, 14, -1).contiguous() - wq_a_raw = wq_a_raw.reshape(35, 14, -1).contiguous() - - wq_s_raw: torch.Tensor = wq_a_scale.detach().clone() - wkv_s_raw: torch.Tensor = wkv_a_scale.detach().clone() - wq_s_raw = torch.cat([wq_s_raw, wkv_s_raw[:4], wk_scale, wkv_s_raw[4:]], dim=0) - wq_s_raw = wq_s_raw.reshape(18, 1, 14, 4).repeat(1, 2, 1, 1).reshape(36, 1, 14, 4) - wq_s_raw = wq_s_raw[:35].reshape(35, 14, -1).contiguous() - wq_s_raw = wq_s_raw.view(torch.float8_e4m3fn) - wq_as_raw = torch.cat([wq_a_raw, wq_s_raw], dim=-1) - - return wq_as_raw.contiguous(), attn_norm_weight.clone() - - @staticmethod - def common_to_tilert_native_bf16( - wq_a: torch.Tensor, - wq_a_scale: torch.Tensor, - wkv_a: torch.Tensor, - wkv_a_scale: torch.Tensor, - wk: torch.Tensor, - wk_scale: torch.Tensor, - attn_norm_weight: torch.Tensor, - ) -> tuple[torch.Tensor, torch.Tensor]: - """ - 
Convert common weights to weights for tilert native bf16 op. - - Args: - wq_a: Common weight tensor. - wq_a_scale: Common weight scale tensor. - wkv_a: Common weight tensor. - wkv_a_scale: Common weight scale tensor. - wk: Common weight tensor. - wk_scale: Common weight scale tensor. - attn_norm_weight: Common attention norm weight tensor. - Returns: - tuple: Tilert weights for native bf16 op. - """ - wq_a_scale = wq_a_scale.reshape((12, 56, 1)).repeat(1, 1, 128).reshape((12, 1, 7168)) - wq_a_scale = wq_a_scale.repeat(1, 128, 1).reshape((1536, 7168)) - wkv_a_scale = wkv_a_scale.reshape((5, 56, 1)).repeat(1, 1, 128).reshape((5, 1, 7168)) - wkv_a_scale = wkv_a_scale.repeat(1, 128, 1).reshape((-1, 7168)) - wkv_a_scale = wkv_a_scale[:576] - wk_scale = wk_scale.reshape((1, 56, 1)).repeat(1, 1, 128).reshape((1, 1, 7168)) - wk_scale = wk_scale.repeat(1, 128, 1).reshape((128, 7168)) - wq_a = wq_a.reshape((1536, 7168)).float() * wq_a_scale.float() - wkv_a = wkv_a.reshape((576, 7168)).float() * wkv_a_scale.float() - wk = wk.reshape((128, 7168)).float() * wk_scale.float() - weights = torch.cat([wq_a, wkv_a, wk], dim=0) - assert weights.shape == (1536 + 576 + 128, 7168) - return weights.to(torch.bfloat16).contiguous(), attn_norm_weight.clone() - - @staticmethod - def common_to_tilert_native_bf16_warp_gemv( - wq_a: torch.Tensor, - wq_a_scale: torch.Tensor, - wkv_a: torch.Tensor, - wkv_a_scale: torch.Tensor, - wk: torch.Tensor, - wk_scale: torch.Tensor, - attn_norm_weight: torch.Tensor, - ) -> tuple[torch.Tensor, torch.Tensor]: - """ - Convert common weights to weights for tilert native bf16 warp gemv op. - - Args: - wq_a: Common weight tensor. - wq_a_scale: Common weight scale tensor. - wkv_a: Common weight tensor. - wkv_a_scale: Common weight scale tensor. - wk: Common weight tensor. - wk_scale: Common weight scale tensor. - attn_norm_weight: Common attention norm weight tensor. - Returns: - tuple: Tilert weights for native bf16 warp gemv op. 
- """ - wq_a_scale = wq_a_scale.reshape((12, 56, 1)).repeat(1, 1, 128).reshape((12, 1, 7168)) - wq_a_scale = wq_a_scale.repeat(1, 128, 1).reshape((1536, 7168)) - wkv_a_scale = wkv_a_scale.reshape((5, 56, 1)).repeat(1, 1, 128).reshape((5, 1, 7168)) - wkv_a_scale = wkv_a_scale.repeat(1, 128, 1).reshape((-1, 7168)) - wkv_a_scale = wkv_a_scale[:576] - wk_scale = wk_scale.reshape((1, 56, 1)).repeat(1, 1, 128).reshape((1, 1, 7168)) - wk_scale = wk_scale.repeat(1, 128, 1).reshape((128, 7168)) - wq_a = wq_a.reshape((1536, 7168)).float() * wq_a_scale.float() - wkv_a = wkv_a.reshape((576, 7168)).float() * wkv_a_scale.float() - wk = wk.reshape((128, 7168)).float() * wk_scale.float() - # concatenate the weights - weights = torch.cat([wq_a, wkv_a, wk], dim=0) - assert weights.shape == (1536 + 576 + 128, 7168) - - weights = weights.reshape(140, 16, 7, 1024) - weights = weights.transpose(1, 2) # 140, 7, 16, 1024 - return weights.to(torch.bfloat16).contiguous(), attn_norm_weight.clone() - - @staticmethod - def common_to_tilert_dequant_bf16( - wq_a: torch.Tensor, - wq_a_scale: torch.Tensor, - wkv_a: torch.Tensor, - wkv_a_scale: torch.Tensor, - wk: torch.Tensor, - wk_scale: torch.Tensor, - attn_norm_weight: torch.Tensor, - ) -> tuple[torch.Tensor, torch.Tensor]: - """ - Convert common weights to weights for tilert dequant bf16 op. - - Args: - wq_a: Common weight tensor. - wq_a_scale: Common weight scale tensor. - wkv_a: Common weight tensor. - wkv_a_scale: Common weight scale tensor. - wk: Common weight tensor. - wk_scale: Common weight scale tensor. - attn_norm_weight: Common attention norm weight tensor. - Returns: - tuple: Tilert weights for dequant bf16 op. 
- """ - wq_a = wq_a.reshape((384, 4, 7168)) - wkv_a = wkv_a.reshape((144, 4, 7168)) - wk = wk.reshape((32, 4, 7168)) - wqkv = torch.cat([wq_a, wkv_a, wk], dim=0).reshape(140, 4, 4 * 7168) - - wq_a_scale = wq_a_scale.reshape((12, 1, 56)).repeat(1, 32, 1).reshape((384, 1, 56)) - wkv_a_scale = wkv_a_scale.reshape((5, 1, 56)).repeat(1, 32, 1).reshape((160, 1, 56))[:144] - wk_scale = wk_scale.reshape((1, 1, 56)).repeat(1, 32, 1).reshape((32, 1, 56)) - wqkv_scales = torch.cat([wq_a_scale, wkv_a_scale, wk_scale], dim=0).reshape(140, 4, 56) - wqkv_scales_swizzled = torch.zeros(140, 4, 64, dtype=torch.bfloat16, device=wq_a.device) - # swizzle - for i in range(64): - wqkv_scales_swizzled[..., i] = wqkv_scales[..., ((i % 8) * 8 + i // 8) % 56] - weights = torch.zeros( - 140, 4, 4 * 7168 + 64 * 2, dtype=torch.float8_e4m3fn, device=wq_a.device - ) - weights_part = weights[:, :, : 4 * 7168] - scales_part = weights[:, :, 4 * 7168 :] - weights_part.copy_(wqkv) - scales_part.copy_(wqkv_scales_swizzled.view(dtype=torch.float8_e4m3fn)) - return weights.contiguous(), attn_norm_weight.clone() - - @staticmethod - def common_to_tilert_fp8_mma( - wq_a: torch.Tensor, - wq_a_scale: torch.Tensor, - wkv_a: torch.Tensor, - wkv_a_scale: torch.Tensor, - wk: torch.Tensor, - wk_scale: torch.Tensor, - rmsnorm_gamma: torch.Tensor, - ) -> tuple[torch.Tensor, torch.Tensor]: - """ - Convert common weights to weights for tilert fp8 mma op. - - Args: - wq_a: Common weight tensor. - wq_a_scale: Common weight scale tensor. - wkv_a: Common weight tensor. - wkv_a_scale: Common weight scale tensor. - wk: Common weight tensor. - wk_scale: Common weight scale tensor. - rmsnorm_gamma: Common rmsnorm gamma tensor. - Returns: - tuple: Tilert weights for fp8 mma op. 
- """ - assert wq_a.shape == (1536, 7168) - assert wq_a_scale.shape == (12, 56) - assert wkv_a.shape == (576, 7168) - assert wkv_a_scale.shape == (5, 56) - assert wk.shape == (128, 7168) - assert wk_scale.shape == (1, 56) - wq_a = wq_a.reshape(96, 16, 7168) - wq_a_scale = wq_a_scale.reshape(12, 1, 56).repeat(1, 8, 1).reshape(96, 56) - wkv_a = wkv_a.reshape(36, 16, 7168) - wkv_a_scale = wkv_a_scale.reshape(5, 1, 56).repeat(1, 8, 1).reshape(40, 56) - wkv_a_scale = wkv_a_scale[:36] - - wk = wk.reshape(8, 16, 7168) - wk_scale = wk_scale.reshape(1, 1, 56).repeat(1, 8, 1).reshape(8, 56) - wqkvia = torch.cat([wq_a, wkv_a, wk], dim=0) # 140, 7168 - wqkvia_scale = torch.cat([wq_a_scale, wkv_a_scale, wk_scale], dim=0) # 140, 56 - - wqkvia_0 = wqkvia[..., :2048] - wqkvia_0_scale = wqkvia_scale[..., :16].contiguous().view(torch.float8_e4m3fn) - wqkvia_1 = wqkvia[..., 2048:4096] - wqkvia_1_scale = wqkvia_scale[..., 16:32].contiguous().view(torch.float8_e4m3fn) - wqkvia_2 = wqkvia[..., 4096:6144] - wqkvia_2_scale = wqkvia_scale[..., 32:48].contiguous().view(torch.float8_e4m3fn) - wqkvia_3 = wqkvia[..., 6144:7168] - wqkvia_3_scale = wqkvia_scale[..., 48:56].contiguous().view(torch.float8_e4m3fn) - - wqkvia_0 = wqkvia_0.reshape(140, 16, 64, 32).transpose(1, 2) - wqkvia_0 = RMSNormProjQAKVAKIWeightsConverter._swizzle_mma_16x32(wqkvia_0) - wqkvia_0 = wqkvia_0.reshape(140, 16 * 2048) - - wqkvia_1 = wqkvia_1.reshape(140, 16, 64, 32).transpose(1, 2) - wqkvia_1 = RMSNormProjQAKVAKIWeightsConverter._swizzle_mma_16x32(wqkvia_1) - wqkvia_1 = wqkvia_1.reshape(140, 16 * 2048) - - wqkvia_2 = wqkvia_2.reshape(140, 16, 64, 32).transpose(1, 2) - wqkvia_2 = RMSNormProjQAKVAKIWeightsConverter._swizzle_mma_16x32(wqkvia_2) - wqkvia_2 = wqkvia_2.reshape(140, 16 * 2048) - - wqkvia_3 = wqkvia_3.reshape(140, 16, 32, 32).transpose(1, 2) - wqkvia_3 = RMSNormProjQAKVAKIWeightsConverter._swizzle_mma_16x32(wqkvia_3) - wqkvia_3 = wqkvia_3.reshape(140, 16 * 1024) - padding_scale0 = torch.zeros((140, 48), 
dtype=torch.bfloat16, device=wq_a.device).view( - torch.float8_e4m3fn - ) - padding_scale1 = torch.zeros((140, 48), dtype=torch.bfloat16, device=wq_a.device).view( - torch.float8_e4m3fn - ) - padding_scale2 = torch.zeros((140, 48), dtype=torch.bfloat16, device=wq_a.device).view( - torch.float8_e4m3fn - ) - padding_scale3 = torch.zeros((140, 56), dtype=torch.bfloat16, device=wq_a.device).view( - torch.float8_e4m3fn - ) - wqkvia = torch.cat( - [ - wqkvia_0, - wqkvia_0_scale, - padding_scale0, - wqkvia_1, - wqkvia_1_scale, - padding_scale1, - wqkvia_2, - wqkvia_2_scale, - padding_scale2, - wqkvia_3, - wqkvia_3_scale, - padding_scale3, - ], - dim=1, - ) - - return wqkvia.contiguous(), rmsnorm_gamma.contiguous() - - -class RMSNormProjxWqkviaWeightsConverter(TilertWeightsConverter): - """RMSNormProjxWqkvia weights converter""" - - @staticmethod - def _swizzle_qmma_16x32(mat_in: torch.Tensor) -> torch.Tensor: - assert mat_in.shape[-2] == 16 and mat_in.shape[-1] == 32 - assert mat_in.dtype == torch.float8_e4m3fn - # PTX isa fig.88 - pre_shape = mat_in.shape[:-2] - mat_in = mat_in.reshape(*pre_shape, 2, 8, 2, 4, 4).transpose(-4, -3).transpose(-5, -4) - return mat_in.reshape(*pre_shape, 2 * 2, 8 * 4, 4).transpose(-3, -2) - - def convert_to_general(self, weights: list[torch.Tensor]) -> tuple[torch.Tensor, torch.Tensor]: - """ - Convert the weights to general format. - - Args: - weights: List of weights. - - Returns: - Tuple of weights. 
- """ - # Specialized for DS v3.2 model - args = self.model_args - assert ( - args.arch_name == "deepseek_v3_2" - ), f"arch_name must be deepseek_v3_2, but got {args.arch_name}" - with torch.inference_mode(): - x_rmsnorm_gamma, wq_a, wq_a_scale, wkv_a, wkv_a_scale, wk, wk_scale = weights - q_lora_rank_scale_dim = args.q_lora_rank // args.block_size - kv_lora_rank_scale_dim = args.kv_lora_rank // args.block_size + 1 - x_scale_dim = args.dim // args.block_size - - wq_a_scale = ( - wq_a_scale.reshape((q_lora_rank_scale_dim, x_scale_dim, 1)) - .repeat(1, 1, args.block_size) - .reshape((q_lora_rank_scale_dim, 1, args.dim)) - ) - wq_a_scale = wq_a_scale.repeat(1, args.block_size, 1).reshape( - (args.q_lora_rank, args.dim) - ) - wkv_a_scale = ( - wkv_a_scale.reshape((kv_lora_rank_scale_dim, x_scale_dim, 1)) - .repeat(1, 1, args.block_size) - .reshape((kv_lora_rank_scale_dim, 1, args.dim)) - ) - wkv_a_scale = wkv_a_scale.repeat(1, args.block_size, 1).reshape((-1, args.dim)) - wkv_a_scale = wkv_a_scale[: args.kv_lora_rank + args.qk_rope_head_dim] - wk_scale = ( - wk_scale.reshape((1, x_scale_dim, 1)) - .repeat(1, 1, args.block_size) - .reshape((1, 1, args.dim)) - ) - wk_scale = wk_scale.repeat(1, args.block_size, 1).reshape( - (args.index_head_dim, args.dim) - ) - wq_a = wq_a.reshape((args.q_lora_rank, args.dim)).float() * wq_a_scale.float() - wkv_a = ( - wkv_a.reshape((args.kv_lora_rank + args.qk_rope_head_dim, args.dim)).float() - * wkv_a_scale.float() - ) - wk = wk.reshape((args.index_head_dim, args.dim)).float() * wk_scale.float() - # concatenate the weights - weights_tensor: torch.Tensor = torch.cat([wq_a, wkv_a, wk], dim=0) - assert weights_tensor.shape == ( - args.q_lora_rank + args.kv_lora_rank + args.qk_rope_head_dim + args.index_head_dim, - args.dim, - ) - # hard-coded scheduling: reshape to 140, 16, 7, 1024 - weights_tensor = weights_tensor.reshape(140, 16, 7, 1024) - weights_tensor = weights_tensor.transpose(1, 2) # 140, 7, 16, 1024 - return x_rmsnorm_gamma, 
weights_tensor.to(torch.bfloat16).contiguous() - - def convert_to_decoupled( - self, weights: list[torch.Tensor] - ) -> tuple[torch.Tensor, torch.Tensor]: - """ - Convert the weights to decoupled format. - - Args: - weights: List of weights. - - Returns: - Tuple of weights. - """ - arch_name = self.model_args.arch_name - wqkvia_and_scales = None - with torch.inference_mode(): - x_rmsnorm_gamma, wq_a, wq_a_scale, wkv_a, wkv_a_scale, wk, wk_scale = weights - # Ensure the scales are in bfloat16 - if arch_name == "deepseek_v3_2": # DS v3.2 - # Ensure the scales are in bfloat16 for DS v3.2 - wq_a_scale = wq_a_scale.to(torch.bfloat16) - wkv_a_scale = wkv_a_scale.to(torch.bfloat16) - wk_scale = wk_scale.to(torch.bfloat16) - assert wq_a.shape == (1536, 7168) - assert wq_a_scale.shape == (12, 56) - assert wkv_a.shape == (576, 7168) - assert wkv_a_scale.shape == (5, 56) - assert wk.shape == (128, 7168) - assert wk_scale.shape == (1, 56) - wq_a = wq_a.reshape(96, 16, 7168) - wq_a_scale = wq_a_scale.reshape(12, 1, 56).repeat(1, 8, 1).reshape(96, 56) - wkv_a = wkv_a.reshape(36, 16, 7168) - wkv_a_scale = wkv_a_scale.reshape(5, 1, 56).repeat(1, 8, 1).reshape(40, 56) - wkv_a_scale = wkv_a_scale[:36] - - wk = wk.reshape(8, 16, 7168) - wk_scale = wk_scale.reshape(1, 1, 56).repeat(1, 8, 1).reshape(8, 56) - wqkvia = torch.cat([wq_a, wkv_a, wk], dim=0) # 140, 7168 - wqkvia_scale = torch.cat([wq_a_scale, wkv_a_scale, wk_scale], dim=0) # 140, 56 - - wqkvia_0 = wqkvia[..., :2048] - wqkvia_0_scale = wqkvia_scale[..., :16].contiguous().view(torch.float8_e4m3fn) - wqkvia_1 = wqkvia[..., 2048:4096] - wqkvia_1_scale = wqkvia_scale[..., 16:32].contiguous().view(torch.float8_e4m3fn) - wqkvia_2 = wqkvia[..., 4096:6144] - wqkvia_2_scale = wqkvia_scale[..., 32:48].contiguous().view(torch.float8_e4m3fn) - wqkvia_3 = wqkvia[..., 6144:7168] - wqkvia_3_scale = wqkvia_scale[..., 48:56].contiguous().view(torch.float8_e4m3fn) - - wqkvia_0 = wqkvia_0.reshape(140, 16, 64, 32).transpose(1, 2) - wqkvia_0 = 
self._swizzle_qmma_16x32(wqkvia_0) - wqkvia_0 = wqkvia_0.reshape(140, 16 * 2048) - - wqkvia_1 = wqkvia_1.reshape(140, 16, 64, 32).transpose(1, 2) - wqkvia_1 = self._swizzle_qmma_16x32(wqkvia_1) - wqkvia_1 = wqkvia_1.reshape(140, 16 * 2048) - - wqkvia_2 = wqkvia_2.reshape(140, 16, 64, 32).transpose(1, 2) - wqkvia_2 = self._swizzle_qmma_16x32(wqkvia_2) - wqkvia_2 = wqkvia_2.reshape(140, 16 * 2048) - - wqkvia_3 = wqkvia_3.reshape(140, 16, 32, 32).transpose(1, 2) - wqkvia_3 = self._swizzle_qmma_16x32(wqkvia_3) - wqkvia_3 = wqkvia_3.reshape(140, 16 * 1024) - padding_scale0 = torch.zeros( - (140, 48), dtype=torch.bfloat16, device=wq_a.device - ).view(torch.float8_e4m3fn) - padding_scale1 = torch.zeros( - (140, 48), dtype=torch.bfloat16, device=wq_a.device - ).view(torch.float8_e4m3fn) - padding_scale2 = torch.zeros( - (140, 48), dtype=torch.bfloat16, device=wq_a.device - ).view(torch.float8_e4m3fn) - padding_scale3 = torch.zeros( - (140, 56), dtype=torch.bfloat16, device=wq_a.device - ).view(torch.float8_e4m3fn) - wqkvia_and_scales = torch.cat( - [ - wqkvia_0, - wqkvia_0_scale, - padding_scale0, - wqkvia_1, - wqkvia_1_scale, - padding_scale1, - wqkvia_2, - wqkvia_2_scale, - padding_scale2, - wqkvia_3, - wqkvia_3_scale, - padding_scale3, - ], - dim=1, - ) - elif arch_name == "glm_5": # GLM5 - # Ensure the scales are in float32 for DS v3.2 - if wq_a_scale.dtype != torch.float32: - # TODO: remove this after the source weights are converted to float32 - print( - "Warning: RMSNormProjxWqkviaWeightsConverter: " - + "wq_a_scale is not in float32, converting to float32." 
- ) - wq_a_scale = wq_a_scale.to(torch.float32) - wkv_a_scale = wkv_a_scale.to(torch.float32) - wk_scale = wk_scale.to(torch.float32) - # (2048 + 576 + 128, 6144) - wqkvia = torch.cat([wq_a, wkv_a, wk], dim=0).reshape(86, 32, 6144) - # (16+5+1, 48) - wq_a_scale = wq_a_scale.reshape((16, 1, 48)).repeat(1, 4, 1).reshape(64, 48) - wkv_a_scale = wkv_a_scale.reshape((5, 1, 48)).repeat(1, 4, 1).reshape(20, 48)[:18] - wk_scale = wk_scale.reshape((1, 1, 48)).repeat(1, 4, 1).reshape(4, 48) - wqkvia_scales = torch.cat([wq_a_scale, wkv_a_scale, wk_scale], dim=0) # (86, 48) - wqkvia = wqkvia.reshape(86, 32, 6, 1024).transpose(1, 2).reshape(86, 6, 2, 16, 1024) - wqkvia = wqkvia.reshape(86, 6, 2, 16, 32, 32).transpose(3, 4) - wqkvia = self._swizzle_qmma_16x32(wqkvia).reshape(86, 6, 32 * 1024) - wqkvia_scales = wqkvia_scales.reshape(86, 6, 8).view(torch.float8_e4m3fn) - wqkvia_padding = torch.zeros( - (86, 6, 128 - wqkvia_scales.shape[-1]), - dtype=torch.float8_e4m3fn, - device=wq_a.device, - ) - wqkvia_and_scales = torch.cat([wqkvia, wqkvia_scales, wqkvia_padding], dim=-1) - else: - raise ValueError(f"Unsupported architecture: {arch_name}") - assert wqkvia_and_scales is not None - return x_rmsnorm_gamma.float(), wqkvia_and_scales.contiguous() - - -@dataclass -class RMSNormProjxWqkviaRefWeightsAlias: - """Reference weights alias for RMSNormProjxWqkvia.""" - - x_rmsnorm_gamma = "input_layernorm.weight" - q_a_weights = "self_attn.q_a_proj.weight" - q_a_scales = "self_attn.q_a_proj.weight_scale_inv" - kv_a_with_mqa_weights = "self_attn.kv_a_proj_with_mqa.weight" - kv_a_with_mqa_scales = "self_attn.kv_a_proj_with_mqa.weight_scale_inv" - wk_weights = "self_attn.indexer.wk.weight" - wk_scales = "self_attn.indexer.wk.weight_scale_inv" - - @property - def ref_tensor_alias(self) -> list[str]: - return [ - self.x_rmsnorm_gamma, - self.q_a_weights, - self.q_a_scales, - self.kv_a_with_mqa_weights, - self.kv_a_with_mqa_scales, - self.wk_weights, - self.wk_scales, - ] - - def __call__(self) -> 
list[str]: - return self.ref_tensor_alias - - -@dataclass -class RMSNormProjxWqkviaTilertWeightsAlias: - """TileRT weights alias for RMSNormProjxWqkvia.""" - - x_rmsnorm_gamma = "x_rmsnorm_gamma" - q_a_weights = "q_a_weights" - q_a_scales = "q_a_scales" - kv_a_with_mqa_weights = "kv_a_with_mqa_weights" - kv_a_with_mqa_scales = "kv_a_with_mqa_scales" - wk_weights = "wk_weights" - wk_scales = "wk_scales" - - @property - def tilert_tensor_alias(self) -> list[str]: - return [ - self.x_rmsnorm_gamma, - self.q_a_weights, - self.q_a_scales, - self.kv_a_with_mqa_weights, - self.kv_a_with_mqa_scales, - self.wk_weights, - self.wk_scales, - ] - - def __call__(self) -> list[str]: - return self.tilert_tensor_alias - - -class RMSNormProjxWqkvia(TileRTModule): - """RMSNormProjxWqkvia module""" - - def __init__( - self, - model_args: ModelArgs, - num_devices: int, - device_id: int, - ref_weights_alias: RMSNormProjxWqkviaRefWeightsAlias | None = None, - algorithm: RMSNormProjxWqkviaAlgorithm = RMSNormProjxWqkviaAlgorithm.GENERAL, - ): - super().__init__( - self.__class__.__name__, - model_args=model_args, - num_devices=num_devices, - device_id=device_id, - ) - - self.tilert_weights_alias = RMSNormProjxWqkviaTilertWeightsAlias() - self.ref_weights_alias = ( - ref_weights_alias - if ref_weights_alias is not None - else RMSNormProjxWqkviaRefWeightsAlias() - ) - - self.arch_name = self.model_args.arch_name - self.dim = self.model_args.dim - self.q_lora_rank = self.model_args.q_lora_rank - self.kv_lora_rank = self.model_args.kv_lora_rank - self.qk_rope_head_dim = self.model_args.qk_rope_head_dim - self.idx_head_dim = self.model_args.index_head_dim - self.block_size = self.model_args.block_size - self.eps = self.model_args.eps - self.algorithm: RMSNormProjxWqkviaAlgorithm = algorithm - - # reference weights - self.ref_norm_gamma: torch.Tensor | None = None - self.ref_wq_a: torch.Tensor | None = None - self.ref_wkv_a: torch.Tensor | None = None - self.ref_wk: torch.Tensor | None = None - 
- # tilert weights - self.tilert_norm_gamma: torch.Tensor | None = None - self.tilert_wqkv_a: torch.Tensor | None = None - # Legacy scale tensor for compatibility, to be removed in the future - self.tilert_wqkv_a_scales = torch.zeros((130, 64), dtype=torch.bfloat16) - - # tilert vars - self.x_rmsnorm_out: torch.Tensor | None = None - self.q_out: torch.Tensor | None = None - self.kv_out: torch.Tensor | None = None - self.ki_out: torch.Tensor | None = None - self.x_rmsnorm_quant_out: torch.Tensor | None = None - self.x_rmsnorm_quant_scale_out: torch.Tensor | None = None - - self.profile_logs: torch.Tensor | None = None - self.is_init = False - - # tilert_funcs - self.rmsnorm_proj_func: Callable | None = None - self.rmsnorm_func: Callable | None = None - self.proj_func: Callable | None = None - - if self.arch_name == "deepseek_v3_2": - self.rmsnorm_proj_func = rmsnorm_projx_wqkvia - self.rmsnorm_func = rmsnorm_quant - self.proj_func = projx_wqkvia - elif self.arch_name == "glm_5": - # Lazy import to avoid circular import - self.rmsnorm_proj_func = None - self.rmsnorm_func = rmsnorm_quant - self.proj_func = projx_wqkvia - else: - raise ValueError(f"Unsupported architecture: {self.arch_name}") - - # tilert tensor aliases (3 output weight names for get_weights_list) - self.tilert_tensor_alias: list[str] = [ - "x_rmsnorm_gamma", - "qkv_wa_weights", - "qkv_wa_scales", - ] - - def get_weights_list(self) -> list[torch.Tensor]: - """ - Get the weights list. - - Returns: - List of weights. - """ - assert self.algorithm is not None, "Algorithm is not set" - if self.algorithm == RMSNormProjxWqkviaAlgorithm.GENERAL: - return [self.tilert_norm_gamma, self.tilert_wqkv_a, self.tilert_wqkv_a_scales] - return [self.tilert_norm_gamma, self.tilert_wqkv_a] - - def device_sharding(self, weights_map: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]: - """ - Device sharding. - - Args: - input_layernorm_weight: Input layernorm weight. - q_a_proj_weight: Q A proj weight. 
- q_a_proj_weight_scale: Q A proj weight scale. - kv_a_proj_weight: KV A proj weight. - kv_a_proj_weight_scale: KV A proj weight scale. - indexer_wk_weight: Indexer WK weight. - indexer_wk_weight_scale: Indexer WK weight scale. - - Returns: - Tuple of weights. - """ - # repeat n times for device sharding - # Using float to support both bfloat16 and float - input_layernorm_weight = ( - weights_map[self.ref_weights_alias.x_rmsnorm_gamma][None, ...] - .float() - .repeat(self.num_devices, 1) - ) - q_a_proj_weight = weights_map[self.ref_weights_alias.q_a_weights][None, ...].repeat( - self.num_devices, 1, 1 - ) - q_a_proj_weight_scale = weights_map[self.ref_weights_alias.q_a_scales][None, ...].repeat( - self.num_devices, 1, 1 - ) - kv_a_proj_weight = weights_map[self.ref_weights_alias.kv_a_with_mqa_weights][ - None, ... - ].repeat(self.num_devices, 1, 1) - kv_a_proj_weight_scale = weights_map[self.ref_weights_alias.kv_a_with_mqa_scales][ - None, ... - ].repeat(self.num_devices, 1, 1) - indexer_wk_weight = weights_map[self.ref_weights_alias.wk_weights][None, ...].repeat( - self.num_devices, 1, 1 - ) - indexer_wk_weight_scale = weights_map[self.ref_weights_alias.wk_scales][None, ...].repeat( - self.num_devices, 1, 1 - ) - return { - self.tilert_weights_alias.x_rmsnorm_gamma: input_layernorm_weight, - self.tilert_weights_alias.q_a_weights: q_a_proj_weight, - self.tilert_weights_alias.q_a_scales: q_a_proj_weight_scale, - self.tilert_weights_alias.kv_a_with_mqa_weights: kv_a_proj_weight, - self.tilert_weights_alias.kv_a_with_mqa_scales: kv_a_proj_weight_scale, - self.tilert_weights_alias.wk_weights: indexer_wk_weight, - self.tilert_weights_alias.wk_scales: indexer_wk_weight_scale, - } - - def init_reference_weights(self, state_dict: dict[str, torch.Tensor]) -> None: - """ - Initialize the reference weights. - - Args: - state_dict: State dictionary. 
- """ - self.ref_norm_gamma = state_dict[self.ref_weights_alias()[0]] - self.ref_wq_a = weight_dequant( - state_dict[self.ref_weights_alias()[1]], state_dict[self.ref_weights_alias()[2]] - ) - self.ref_wkv_a = weight_dequant( - state_dict[self.ref_weights_alias()[3]], state_dict[self.ref_weights_alias()[4]] - ) - self.ref_wk = weight_dequant( - state_dict[self.ref_weights_alias()[5]], state_dict[self.ref_weights_alias()[6]] - ) - - assert self.ref_norm_gamma is not None - assert self.ref_wq_a is not None - assert self.ref_wkv_a is not None - assert self.ref_wk is not None - - assert ( - self.ref_norm_gamma.shape[-1] == self.dim - ), f"norm_gamma shape must be {self.dim}, but got {self.ref_norm_gamma.shape[-1]}" - assert self.ref_wq_a.shape[-2] == self.q_lora_rank, ( - f"wq_a shape must be {self.q_lora_rank}, " + f"but got {self.ref_wq_a.shape[-2]}" - ) - assert ( - self.ref_wq_a.shape[-1] == self.dim - ), f"wq_a shape must be {self.dim}, but got {self.ref_wq_a.shape[-1]}" - assert self.ref_wkv_a.shape[-2] == self.kv_lora_rank + self.qk_rope_head_dim, ( - f"wkv_a shape must be {self.kv_lora_rank + self.qk_rope_head_dim}, " - + f"but got {self.ref_wkv_a.shape[-2]}" - ) - assert ( - self.ref_wkv_a.shape[-1] == self.dim - ), f"wkv_a shape must be {self.dim}, but got {self.ref_wkv_a.shape[-1]}" - assert ( - self.ref_wk.shape[-2] == self.idx_head_dim - ), f"wk shape must be {self.idx_head_dim}, but got {self.ref_wk.shape[-2]}" - assert ( - self.ref_wk.shape[-1] == self.dim - ), f"wk shape must be {self.dim}, but got {self.ref_wk.shape[-1]}" - - def init_tilert_weights(self, state_dict: dict[str, torch.Tensor]) -> None: - """ - Initialize the tilert weights. - - Args: - state_dict: State dictionary. 
- """ - assert self.algorithm is not None, "Algorithm is not set" - self.tilert_norm_gamma, self.tilert_wqkv_a = RMSNormProjxWqkviaWeightsConverter( - self.model_args, self.num_devices - ).dispatch(self.algorithm, [state_dict[alias] for alias in self.tilert_weights_alias()]) - - def init_tilert_vars(self, batch_size: int, seq_len: int) -> None: - """ - Initialize the tilert variables. - - Args: - batch_size: Batch size. - seq_len: Sequence length. - """ - self.q_out = torch.zeros((batch_size, seq_len, self.q_lora_rank), dtype=torch.bfloat16) - self.kv_out = torch.zeros((batch_size, seq_len, self.kv_lora_rank), dtype=torch.bfloat16) - self.ki_out = torch.zeros((batch_size, seq_len, self.idx_head_dim), dtype=torch.bfloat16) - self.x_rmsnorm_out = torch.zeros((batch_size, seq_len, self.dim), dtype=torch.bfloat16) - if self.algorithm == RMSNormProjxWqkviaAlgorithm.DECOUPLED: - self.x_rmsnorm_quant_out = torch.zeros( - (batch_size, seq_len, self.dim), dtype=torch.float8_e4m3fn - ) - self.x_rmsnorm_quant_scale_out = torch.zeros( - (batch_size, seq_len, self.dim // self.block_size), dtype=torch.float32 - ) - self.profile_logs = get_profile_log_tensor() - self.is_init = True - - def init_random_weights(self) -> None: - """ - Initialize the random weights. 
- - Returns: - None - """ - q_scale_dim = self.q_lora_rank // self.block_size - kv_scale_dim = (self.kv_lora_rank + self.qk_rope_head_dim) // self.block_size + 1 - wk_scale_dim = self.idx_head_dim // self.block_size - dim_scale_dim = self.dim // self.block_size - scale_dtype = torch.float32 if self.arch_name == "glm_5" else torch.bfloat16 - - tensor_list = [ - torch.randn(self.dim, dtype=torch.float32), - torch.randn(self.q_lora_rank, self.dim, dtype=torch.bfloat16).to(torch.float8_e4m3fn), - torch.randn(q_scale_dim, dim_scale_dim, dtype=scale_dtype), - torch.randn( - self.kv_lora_rank + self.qk_rope_head_dim, self.dim, dtype=torch.bfloat16 - ).to(torch.float8_e4m3fn), - torch.randn(kv_scale_dim, dim_scale_dim, dtype=scale_dtype), - torch.randn(self.idx_head_dim, self.dim, dtype=torch.bfloat16).to(torch.float8_e4m3fn), - torch.randn(wk_scale_dim, dim_scale_dim, dtype=scale_dtype), - ] - ref_state_dict = dict(zip(self.ref_weights_alias(), tensor_list)) - self.init_reference_weights(ref_state_dict) - self.init_tilert_weights( - {_k: _v[self.device_id] for _k, _v in self.device_sharding(ref_state_dict).items()} - ) - - def golden_forward( - self, - x: torch.Tensor, - pe_cache: torch.Tensor, - start_pos: int, - ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - - assert self.ref_norm_gamma is not None - assert self.ref_wq_a is not None - assert self.ref_wkv_a is not None - assert self.ref_wk is not None - - x_rmsnorm_out = torch.nn.functional.rms_norm( - x.float(), [x.size(-1)], self.ref_norm_gamma, self.eps - ) - - q_out = torch.matmul(x_rmsnorm_out.float(), self.ref_wq_a.transpose(0, 1).float()) - kv_out = torch.matmul(x_rmsnorm_out.float(), self.ref_wkv_a.transpose(0, 1).float()) - kv_out, k_pe = torch.split(kv_out, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) - bsz = k_pe.shape[0] - seq_len = k_pe.shape[1] - pe_cache[:bsz, start_pos : start_pos + seq_len].copy_(k_pe.to(torch.bfloat16)) - ki_out = torch.matmul(x_rmsnorm_out.float(), 
self.ref_wk.transpose(0, 1).float()) - return ( - x_rmsnorm_out.to(torch.bfloat16), - q_out.to(torch.bfloat16), - kv_out.to(torch.bfloat16), - ki_out.to(torch.bfloat16), - ) - - def tilert_forward( - self, - x: torch.Tensor, - pe_cache: torch.Tensor, - start_pos: int, - ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - if self.algorithm == RMSNormProjxWqkviaAlgorithm.GENERAL: - assert self.rmsnorm_proj_func is not None - self.rmsnorm_proj_func( - x.to(torch.bfloat16), - self.tilert_wqkv_a, - self.tilert_wqkv_a_scales, - self.tilert_norm_gamma, - torch.tensor([start_pos], dtype=torch.int32, device=x.device), - self.q_out, - self.kv_out, - pe_cache, - self.ki_out, - self.x_rmsnorm_out, - self.profile_logs, - ) - elif self.algorithm == RMSNormProjxWqkviaAlgorithm.DECOUPLED: - assert self.rmsnorm_func is not None - assert self.proj_func is not None - self.rmsnorm_func( - x.to(torch.bfloat16), - self.tilert_norm_gamma, - self.x_rmsnorm_out, - self.x_rmsnorm_quant_out, - self.x_rmsnorm_quant_scale_out, - self.profile_logs, - ) - self.proj_func( - self.x_rmsnorm_quant_out, - self.x_rmsnorm_quant_scale_out, - self.tilert_wqkv_a, - torch.tensor([start_pos], dtype=torch.int32, device=x.device), - self.q_out, - self.kv_out, - pe_cache, - self.ki_out, - self.profile_logs, - ) - else: - raise ValueError(f"Unsupported algorithm: {self.algorithm}") - - if self.flag_enable_profiling_log: - parse_profile_log_tensor( - self.profile_logs, self.get_profile_log_path(), [(self.op_name, 0.0)] - ) - return self.x_rmsnorm_out, self.q_out, self.kv_out, self.ki_out - - def __call__( - self, - x: torch.Tensor, - pe_cache: torch.Tensor, - start_pos: int, - ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - return self.golden_forward(x, pe_cache, start_pos) diff --git a/python/models/deepseek_v3_2/ops/top1_allreduce.py b/python/models/deepseek_v3_2/ops/top1_allreduce.py deleted file mode 100644 index 1d500e3..0000000 --- 
a/python/models/deepseek_v3_2/ops/top1_allreduce.py +++ /dev/null @@ -1,25 +0,0 @@ -"""Top1 Allreduce operation""" - -import torch - -__all__ = [ - "top1_allreduce", -] - - -def top1_allreduce( - logits: torch.Tensor, - flag: int, - index_out: torch.Tensor, - profile_logs: torch.Tensor, -) -> None: - """ - Define the Top1 Allreduce operation. - - Args: - logits: Input tensor. - flag: Flag. - index_out: Output tensor. - profile_logs: Profile logs tensor. - """ - torch.ops.tilert.top1_allreduce_op(logits, flag, index_out, profile_logs) diff --git a/python/models/deepseek_v3_2/ops/top_p.py b/python/models/deepseek_v3_2/ops/top_p.py deleted file mode 100644 index 4394c2a..0000000 --- a/python/models/deepseek_v3_2/ops/top_p.py +++ /dev/null @@ -1,68 +0,0 @@ -"""TopP operation module.""" - -import torch - -__all__ = [ - "top_p", -] - - -def top_p( - logits: torch.Tensor, - in_indices: torch.Tensor, - sampling_seed: torch.Tensor, - positions: torch.Tensor, - is_verify_mode: bool, - temperature: float, - top_p: float, - top_k: int, - flag: int, - indices: torch.Tensor, - scores: torch.Tensor, - debug_tensor: torch.Tensor, - profile_logs: torch.Tensor, -) -> None: - """top_p operation. - - Args: - logits (Tensor): The logits tensor. - in_indices (Tensor): The tensor containing input indices. - sampling_seed (Tensor): Random seeds for each sequence position. - positions (Tensor): Token positions for each sequence element. - is_verify_mode (bool): A flag indicating if verify mode is enabled in MTP. When set to - `True`, the `in_indices` will be checked to check if it is in - the top-k values. - temperature (float): The temperature parameter, used for scaling logits in softmax - calculations. - top_p (float): The top-p value, used for nucleus sampling to restrict the selection to the - smallest set of tokens whose cumulative probability is greater than or equal - to `top_p`. - top_k (int): The number of top-k values that occupy the top-p probability mass - during sampling. 
- flag (int): Used in all reduction. - indices (Tensor): The tensor containing output indices. - scores (Tensor): The tensor containing corresponding scores for the indices. - profile_logs (Tensor): A tensor for storing profiling log data during execution in MTP. - """ - dim = logits.shape[-1] - if dim == 19360: - call_func = torch.ops.tilert.top_p_glm5_op - elif dim == 16160: - call_func = torch.ops.tilert.top_p_op - else: - raise ValueError(f"Unsupported dimension: {dim}") - call_func( - logits, - in_indices, - sampling_seed, - positions, - is_verify_mode, - temperature, - top_p, - top_k, - flag, - indices, - scores, - debug_tensor, - profile_logs, - ) diff --git a/python/models/deepseek_v3_2/ops/up_gate_silu.py b/python/models/deepseek_v3_2/ops/up_gate_silu.py deleted file mode 100644 index 2f214c0..0000000 --- a/python/models/deepseek_v3_2/ops/up_gate_silu.py +++ /dev/null @@ -1,24 +0,0 @@ -"""UpGateSiLU operation module.""" - -import torch - -__all__ = [ - "up_gate_silu", -] - - -def up_gate_silu( - hidden_in: torch.Tensor, - expert_indices_in: torch.Tensor, - experts_weights_in: torch.Tensor, - hidden_out: torch.Tensor, - profile_logs: torch.Tensor, -) -> None: - """Up Gate SiLU operation.""" - torch.ops.tilert.up_gate_silu_op( - hidden_in, - expert_indices_in, - experts_weights_in, - hidden_out, - profile_logs, - ) diff --git a/python/models/deepseek_v3_2/refs/kernel.py b/python/models/deepseek_v3_2/refs/kernel.py deleted file mode 100644 index eb5e274..0000000 --- a/python/models/deepseek_v3_2/refs/kernel.py +++ /dev/null @@ -1,354 +0,0 @@ -try: - import tilelang - import tilelang.language as T -except ImportError: - raise ImportError("Cannot import tilelang, please install tilelang.") from None - - -import torch -import triton -import triton.language as tl - -__all__ = [ - "weight_dequant", - "act_quant", - "fp8_gemm", - "fp8_index", -] - -tilelang.set_log_level("WARNING") - -pass_configs = { - tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, - 
tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, - # tilelang.PassConfigKey.TL_DISABLE_FAST_MATH: True, -} - -FP8 = "float8_e4m3" -BF16 = "bfloat16" -FP32 = "float32" - - -def fast_log2_ceil(x): # type: ignore - bits_x = T.reinterpret("uint32", x) - exp_x = (bits_x >> 23) & 0xFF - man_bits = bits_x & ((1 << 23) - 1) - return T.Cast("int32", exp_x - 127 + T.if_then_else(man_bits != 0, 1, 0)) - - -def fast_pow2(x): # type: ignore - bits_x = (x + 127) << 23 - return T.reinterpret("float32", bits_x) - - -def fast_round_scale(amax, fp8_max_inv): # type: ignore - return fast_pow2(fast_log2_ceil(amax * fp8_max_inv)) - - -@triton.jit -def weight_dequant_kernel( # type: ignore - x_ptr, - s_ptr, - y_ptr, - M_Size: tl.constexpr, - N_Size: tl.constexpr, - BLOCK_SIZE: tl.constexpr, -) -> None: - """ - Weight dequantization kernel. - - Dequantizes weights using the provided scaling factors and stores the - result. - - Args: - x_ptr (tl.pointer): Pointer to the quantized weights. - s_ptr (tl.pointer): Pointer to the scaling factors. - y_ptr (tl.pointer): Pointer to the output buffer for dequantized - weights. - M (int): Number of rows in the weight matrix. - N (int): Number of columns in the weight matrix. - BLOCK_SIZE (tl.constexpr): Size of the block for tiling. - - Returns: - None - """ - pid_m = tl.program_id(axis=0) - pid_n = tl.program_id(axis=1) - n_size = tl.cdiv(N_Size, BLOCK_SIZE) - offs_m = pid_m * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) - offs_n = pid_n * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) - offs = offs_m[:, None] * N_Size + offs_n[None, :] - mask = (offs_m[:, None] < M_Size) & (offs_n[None, :] < N_Size) - x_in = tl.load(x_ptr + offs, mask=mask).to(tl.float32) - s_in = tl.load(s_ptr + pid_m * n_size + pid_n) - y_out = x_in * s_in - tl.store(y_ptr + offs, y_out, mask=mask) - - -def weight_dequant(x_in: torch.Tensor, s_in: torch.Tensor, block_size: int = 128) -> torch.Tensor: - """ - Dequantizes the given weight tensor using the provided scale tensor. 
- - Args: - x_in (torch.Tensor): The quantized weight tensor of shape (M, N). - s_in (torch.Tensor): The scale tensor of shape (M//block_size, - N//block_size). - block_size (int, optional): The block size to use for dequantization. - Defaults to 128. - - Returns: - torch.Tensor: The dequantized weight tensor of the same shape as `x`. - - Raises: - AssertionError: If `x` or `s` are not contiguous or if their dimensions - are not 2. - """ - assert x_in.is_contiguous() and s_in.is_contiguous(), "Input tensors must be contiguous" - assert x_in.dim() == 2 and s_in.dim() == 2, "Input tensors must have 2 dimensions" - M_Size, N_Size = x_in.size() - y_out = torch.empty_like(x_in, dtype=torch.get_default_dtype()) - grid = lambda meta: ( # noqa: E731 - triton.cdiv(M_Size, meta["BLOCK_SIZE"]), - triton.cdiv(N_Size, meta["BLOCK_SIZE"]), - ) - weight_dequant_kernel[grid](x_in, s_in, y_out, M_Size, N_Size, BLOCK_SIZE=block_size) - return y_out - - -@tilelang.jit(pass_configs=pass_configs) -def act_quant_kernel( # type: ignore - N, in_dtype=BF16, out_dtype=FP8, scale_dtype=FP32, round_scale=False # type: ignore -): # type: ignore - M = T.symbolic("M") - fp8_min = -448.0 - fp8_max = 448.0 - fp8_max_inv = 1 / fp8_max - num_stages = 0 if round_scale else 2 - blk_m = 32 - group_size = 128 - - @T.prim_func - def act_quant_kernel_( # type: ignore - X: T.Tensor[(M, N), in_dtype], - Y: T.Tensor[(M, N), out_dtype], - S: T.Tensor[(M, T.ceildiv(N, group_size)), scale_dtype], - ): # type: ignore - with T.Kernel(T.ceildiv(M, blk_m), T.ceildiv(N, group_size), threads=128) as ( - pid_m, - pid_n, - ): - x_shared = T.alloc_shared((blk_m, group_size), in_dtype) - x_local = T.alloc_fragment((blk_m, group_size), in_dtype) - amax_local = T.alloc_fragment((blk_m,), scale_dtype) - s_local = T.alloc_fragment((blk_m,), scale_dtype) - y_local = T.alloc_fragment((blk_m, group_size), out_dtype) - y_shared = T.alloc_shared((blk_m, group_size), out_dtype) - - for _ in T.Pipelined(1, num_stages=num_stages): - 
T.copy(X[pid_m * blk_m, pid_n * group_size], x_shared) - T.copy(x_shared, x_local) - T.reduce_absmax(x_local, amax_local, dim=1) - for i in T.Parallel(blk_m): - amax_local[i] = T.max(amax_local[i], 1e-4) - if round_scale: - s_local[i] = fast_round_scale(amax_local[i], fp8_max_inv) - else: - s_local[i] = amax_local[i] * fp8_max_inv - for i, j in T.Parallel(blk_m, group_size): - y_local[i, j] = T.clamp(x_local[i, j] / s_local[i], fp8_min, fp8_max) - for i in T.Parallel(blk_m): - S[pid_m * blk_m + i, pid_n] = s_local[i] - T.copy(y_local, y_shared) - T.copy(y_shared, Y[pid_m * blk_m, pid_n * group_size]) - - return act_quant_kernel_ - - -def act_quant( - x: torch.Tensor, block_size: int = 128, scale_fmt: str | None = None -) -> tuple[torch.Tensor, torch.Tensor]: - """ - Quantizes the input tensor `x` using block-wise quantization. - - Args: - x (torch.Tensor): The input tensor to be quantized. - Must be contiguous and its last dimension size must be divisible by `block_size`. - block_size (int, optional): The size of the blocks to be used for quantization. - Default is 128. - scale_fmt (Optional[str], optional): The format of the scale. Default is None. - Returns: - Tuple[torch.Tensor, torch.Tensor]: A tuple containing: - - The quantized tensor with dtype `torch.float8_e4m3fn`. - - A tensor of scaling factors with dtype `torch.float32`. 
- """ - assert x.is_contiguous(), "Input tensor must be contiguous" - assert ( - x.size(-1) % block_size == 0 - ), f"Last dimension size must be divisible by block_size (block_size={block_size})" - N = x.size(-1) - y = torch.empty_like(x, dtype=torch.float8_e4m3fn) - s = x.new_empty(*x.size()[:-1], N // block_size, dtype=torch.float32) - kernel = act_quant_kernel(N, round_scale=scale_fmt is not None) - kernel(x.view(-1, N), y.view(-1, N), s.view(-1, N // block_size)) - return y, s - - -@tilelang.jit(pass_configs=pass_configs) -def fp8_gemm_kernel(N, K, out_dtype=BF16, accum_dtype="float32"): # type: ignore - assert out_dtype in [BF16, "float32"] - - M = T.symbolic("M") - group_size = 128 - block_M = 32 - block_N = 128 - block_K = 128 - - @T.prim_func - def fp8_gemm_kernel_( # type: ignore - A: T.Tensor[(M, K), FP8], - B: T.Tensor[(N, K), FP8], - C: T.Tensor[(M, N), out_dtype], - scales_a: T.Tensor[(M, T.ceildiv(K, group_size)), FP32], - scales_b: T.Tensor[(T.ceildiv(N, group_size), T.ceildiv(K, group_size)), FP32], - ): # type: ignore - with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as ( - bx, - by, - ): - A_shared = T.alloc_shared((block_M, block_K), FP8) - B_shared = T.alloc_shared((block_N, block_K), FP8) - C_shared = T.alloc_shared((block_M, block_N), out_dtype) - Scale_C_shared = T.alloc_shared((block_M), FP32) - C_local = T.alloc_fragment((block_M, block_N), accum_dtype) - C_local_accum = T.alloc_fragment((block_M, block_N), accum_dtype) - - # Improve L2 Cache - T.use_swizzle(panel_size=10) - - T.clear(C_local) - T.clear(C_local_accum) - K_iters = T.ceildiv(K, block_K) - for k in T.Pipelined(K_iters, num_stages=4): - # Load A into shared memory - T.copy(A[by * block_M, k * block_K], A_shared) - # Load B into shared memory - T.copy(B[bx * block_N, k * block_K], B_shared) - # Load scale into shared memory - Scale_B = scales_b[bx * block_N // group_size, k] - for i in T.Parallel(block_M): - Scale_C_shared[i] = scales_a[by * block_M + i, 
k] * Scale_B - - T.gemm(A_shared, B_shared, C_local, transpose_B=True) - # Promote to enable 2xAcc - for i, j in T.Parallel(block_M, block_N): - C_local_accum[i, j] += C_local[i, j] * Scale_C_shared[i] - T.clear(C_local) - # TMA store - T.copy(C_local_accum, C_shared) - T.copy(C_shared, C[by * block_M, bx * block_N]) - - return fp8_gemm_kernel_ - - -def fp8_gemm( - a: torch.Tensor, a_s: torch.Tensor, b: torch.Tensor, b_s: torch.Tensor -) -> torch.Tensor: - """ - Perform a matrix multiplication using FP8 precision. - - Args: - a (torch.Tensor): The first input matrix, must be contiguous. - a_s (torch.Tensor): The scaling factor for the first input matrix, must be contiguous. - b (torch.Tensor): The second input matrix, must be contiguous. - b_s (torch.Tensor): The scaling factor for the second input matrix, must be contiguous. - - Returns: - torch.Tensor: The result of the matrix multiplication. - """ - assert a.is_contiguous() and b.is_contiguous(), "Input tensors must be contiguous" - assert a_s.is_contiguous() and b_s.is_contiguous(), "Scaling factor tensors must be contiguous" - K = a.size(-1) - M = a.numel() // K - N = b.size(0) - c = a.new_empty(*a.size()[:-1], N, dtype=torch.get_default_dtype()) - kernel = fp8_gemm_kernel(N, K) - kernel(a.view(M, K), b, c.view(M, N), a_s.view(M, -1), b_s) - return c - - -@tilelang.jit(out_idx=[4], pass_configs=pass_configs) -def fp8_index_kernel(h: int, d: int): # type: ignore - b = T.symbolic("b") - m = T.symbolic("m") - n = T.symbolic("n") - - blk_n1 = 512 - blk_n2 = 128 - - @T.prim_func - def fp8_index_kernel_( - q: T.Tensor[(b, m, h, d), FP8], - q_s: T.Tensor[(b, m, h), FP32], - k: T.Tensor[(b, n, d), FP8], - k_s: T.Tensor[(b, n), FP32], - o: T.Tensor[(b, m, n), FP32], - ) -> None: - with T.Kernel(b, m, T.ceildiv(n, blk_n1)) as (i_b, i_m, i1_n): - q_smem = T.alloc_shared((h, d), FP8) - T.copy(q[i_b, i_m, 0, 0], q_smem) - - q_s_frag = T.alloc_fragment(h, FP32) - T.copy(q_s[i_b, i_m, 0], q_s_frag) - - for i2_n in 
T.Pipelined(blk_n1 // blk_n2, num_stages=2): - k_smem = T.alloc_shared((blk_n2, d), FP8) - T.copy(k[i_b, i1_n * blk_n1 + i2_n * blk_n2, 0], k_smem) - - k_s_frag = T.alloc_fragment(blk_n2, FP32) - T.copy(k_s[i_b, i1_n * blk_n1 + i2_n * blk_n2], k_s_frag) - - logits = T.alloc_fragment((blk_n2, h), FP32) - T.gemm( - k_smem, - q_smem, - logits, - transpose_A=False, - transpose_B=True, - clear_accum=True, - ) - - for i_h, i3_n in T.Parallel(h, blk_n2): - logits[i3_n, i_h] = T.max(logits[i3_n, i_h], 0) * q_s_frag[i_h] - - logits_sum = T.alloc_fragment(blk_n2, FP32) - T.reduce_sum(logits, logits_sum, dim=1) - - for i3_n in T.Parallel(blk_n2): - logits_sum[i3_n] *= k_s_frag[i3_n] - - T.copy(logits_sum, o[i_b, i_m, i1_n * blk_n1 + i2_n * blk_n2]) - - return fp8_index_kernel_ - - -def fp8_index( - q: torch.Tensor, - q_s: torch.Tensor, - k: torch.Tensor, - k_s: torch.Tensor, -) -> torch.Tensor: - """ - Perform index score using FP8 precision. - - Args: - q (torch.Tensor): The Q tensor, must be contiguous. - q_s (torch.Tensor): The scaling factor for Q (float), must be contiguous. - k (torch.Tensor): The K tensor, must be contiguous. - k_s (torch.Tensor): The scaling factor for K (e8m0 here), must be contiguous. 
- - fp8 q @ fp8 k -> fp32 logits - relu(fp32 logits) * q_s (weights) -> fp32 logits - fp32 logits -> fp32 logits_sum - fp32 logits_sum * k_s (e8m0) -> fp32 index_score - """ - return fp8_index_kernel(q.shape[2], q.shape[3])(q, q_s, k, k_s) diff --git a/python/models/glm_5/params.py b/python/models/glm_5/params.py deleted file mode 100644 index 2721229..0000000 --- a/python/models/glm_5/params.py +++ /dev/null @@ -1 +0,0 @@ -"""GLM5 parameters and initializers.""" diff --git a/python/profiler/__init__.py b/python/profiler/__init__.py deleted file mode 100644 index e9b1cf9..0000000 --- a/python/profiler/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Profiler utilities for TileRT.""" diff --git a/python/profiler/utils.py b/python/profiler/utils.py deleted file mode 100644 index ecd83f6..0000000 --- a/python/profiler/utils.py +++ /dev/null @@ -1,477 +0,0 @@ -import os -from dataclasses import dataclass -from typing import Any - -import numpy as np -import torch - -from tilert.utils import SLICES_FOR_TILERT_OP - -# Worker names used by ExecPlanDescriptor (previously from scheduling.plan_v0) -WORKER_NAMES = [ - "Init", - "Prefetch", - "Compute", - "ExtraTask1/SyncIo", - "ExtraTask2/IoP0", - "ExtraTask3/IoP2", - "ExtraTask4", - "ExtraTask5", -] - -try: - from openpyxl import Workbook - from openpyxl.cell import Cell - from openpyxl.styles import Alignment, Border, PatternFill, Side - from openpyxl.styles.colors import COLOR_INDEX - from openpyxl.worksheet.worksheet import Worksheet -except ImportError: - print("openpyxl is not installed, profile logs will not be visualized") - Workbook = None - - -__all__ = [ - "ExcelStyleConfigs", - "ExecPlanDescriptor", - "WorkerBookVisualizer", - "visualize_profile_logs", - "parse_profile_log_tensor", - "parse_op_time", -] - - -@dataclass -class ExcelStyleConfigs: - """Excel style configurations.""" - - # 2 col * 3 stream - cols_per_worker: int = 6 - ns_per_tick: int = 1000 - - -@dataclass -class ExecPlanDescriptor: - """Exec plan 
descriptor.""" - - workers_def: list - op_lists: list - - -class WorkerBookVisualizer: - """Sheet visualizer.""" - - def __init__(self, exec_plan_desc: ExecPlanDescriptor): - self.exec_plan_desc = exec_plan_desc - - self.wb = Workbook() - self.wb.remove(self.wb.active) - - # Excel configs - self.style_configs = ExcelStyleConfigs() - - self.op_cols_splits = 3 - - self.time_bar_cols = 1 - self.op_stat_bar_cols = 6 - - workers_num = len(self.exec_plan_desc.workers_def) - self.op_vis_bar_cols = workers_num * self.style_configs.cols_per_worker - assert self.op_stat_bar_cols % self.op_cols_splits == 0 - - @property - def time_bar_next_col(self) -> int: - return self.time_bar_cols + 1 - - @property - def op_stat_bar_next_col(self) -> int: - return self.time_bar_next_col + self.op_stat_bar_cols - - @property - def op_vis_bar_next_col(self) -> int: - return self.op_stat_bar_next_col + self.op_vis_bar_cols - - @staticmethod - def add_region_cell( - ws: Worksheet, - value: str, - start_row: int, - start_col: int, - row_size: int = 1, - col_size: int = 1, - color_offset: int = -1, - ) -> Cell: - cell = ws.cell(row=start_row, column=start_col, value=value) - cell.alignment = Alignment(horizontal="center", vertical="center", wrap_text=True) - if color_offset >= 0: - cell.fill = PatternFill( - start_color=COLOR_INDEX[50 + color_offset], - end_color=COLOR_INDEX[50 + color_offset], - fill_type="solid", - ) - ws.merge_cells( - start_row=start_row, - start_column=start_col, - end_row=start_row + row_size - 1, - end_column=start_col + col_size - 1, - ) - return cell - - def init_layout(self, ws: Worksheet) -> None: - workers_name = self.exec_plan_desc.workers_def - worker_cols = self.style_configs.cols_per_worker - - self.add_region_cell(ws, "Op Info", 1, self.time_bar_next_col, 1, self.op_stat_bar_cols) - - for worker_id, worker_name in enumerate(workers_name): - start_col = worker_cols * worker_id + self.op_stat_bar_next_col - self.add_region_cell(ws, worker_name, 1, start_col, 1, 
worker_cols) - - def _parse_inst_info( - self, insts_info: list[tuple[str, float, int] | tuple[str, float] | str], op_idx: int - ) -> tuple[str, float, int]: - inst_info = insts_info[op_idx] - if isinstance(inst_info, str): - op_name, op_cost = inst_info, 0.0 - op_stream = op_idx % self.op_cols_splits - elif len(inst_info) == 2: - op_name, op_cost = inst_info - op_stream = op_idx % self.op_cols_splits - elif len(inst_info) == 3: - op_name, op_cost, op_stream = inst_info - else: - raise TypeError("Invalid inst_info format") - return op_name, op_cost, op_stream - - def add_region_cell_by_time( - self, - ws: Worksheet, - op_show_info: str, - start_time: float, - end_time: float, - op_col_start: int, - op_col_size: int, - ns_tick: int, - color_offset: int = -1, - ) -> Cell: - op_start_row_idx = np.round(start_time / ns_tick).astype(np.int32) + 2 - op_end_row_idx = np.round(end_time / ns_tick).astype(np.int32) + 2 - op_end_row_idx = max(op_end_row_idx, op_start_row_idx) - return self.add_region_cell( - ws, - op_show_info, - op_start_row_idx, - op_col_start, - max(op_end_row_idx - op_start_row_idx, 1), - op_col_size, - color_offset, - ) - - def timeline_visual_region( - self, - ws: Worksheet, - profile_logs: np.ndarray, - insts_info: list[tuple[str, float, int] | tuple[str, float] | str], - ignore_prefilling: bool = True, - ) -> None: - ns_tick = self.style_configs.ns_per_tick - self.init_layout(ws) - - total_end_time = 0 - for op_idx, op_log in enumerate(profile_logs): - op_name, op_cost, op_stream = self._parse_inst_info(insts_info, op_idx) - - if op_stream >= self.op_cols_splits: - print(f"stream_id (aka col_id) must < {self.op_cols_splits}") - raise ValueError - - valid_mask: np.ndarray = op_log >= 0 - if ignore_prefilling: - valid_mask[2:4] = False - - if np.count_nonzero(valid_mask) == 0: - continue - - op_start_time = np.min(op_log, where=valid_mask, initial=np.inf) - op_end_time = np.max(op_log, where=valid_mask, initial=-np.inf) - total_end_time = 
max(total_end_time, op_end_time) - - op_cost_theory = op_cost / 1000 - op_cost_actual = (op_end_time - op_start_time) / 1000 - op_bw_utils = f"{op_cost_theory / op_cost_actual * 100:.2f}" - - op_show_info = ( - f"{op_name}\n" - + f"BW Util: {op_bw_utils}%\n" - + f"Actual: {op_cost_actual:.2f}us\n" - + f"Theoretical: {op_cost_theory:.2f}us\n" - + f"Start Time: {op_start_time / 1000:.2f}us\n" - + f"End Time: {op_end_time / 1000:.2f}us" - ) - op_col_size = self.op_stat_bar_cols // self.op_cols_splits - op_col_start = self.time_bar_next_col + op_stream * op_col_size - self.add_region_cell_by_time( - ws, - op_show_info, - op_start_time, - op_end_time, - op_col_start, - op_col_size, - ns_tick, - ) - - for queue_idx, (start_time, end_time) in enumerate(zip(op_log[::2], op_log[1::2])): - if start_time < 0 or end_time < 0: - continue - task_dur = (end_time - start_time) / 1000 - task_bw_utils = f"{min(100, op_cost_theory / task_dur * 100):.2f}" - task_show_info = ( - f"{op_name}\n" - + f"Dur: {task_dur:.2f}us\n" - + f"BW Util. 
{task_bw_utils}%:\n" - + f"Start: {start_time / 1000:.2f}us\n" - + f"End: {end_time / 1000:.2f}us" - ) - task_col_size = self.style_configs.cols_per_worker // self.op_cols_splits - task_col_start = ( - self.op_stat_bar_next_col - + queue_idx * self.style_configs.cols_per_worker - + op_stream * task_col_size - ) - cell = self.add_region_cell_by_time( - ws, - task_show_info, - start_time, - end_time, - task_col_start, - task_col_size, - ns_tick, - queue_idx, - ) - cell.border = Border( - left=Side(style="thin"), - right=Side(style="thin"), - top=Side(style="thin"), - bottom=Side(style="thin"), - ) - - for dur_idx, dur_start in enumerate(range(0, int(total_end_time), ns_tick)): - ws.cell(row=dur_idx + 2, column=1, value=f"{(dur_start + ns_tick) / 1000:.2f}") - - def brief_table_region( - self, - ws: Worksheet, - profile_logs: np.ndarray, - insts_info: list[tuple[str, float, int] | tuple[str, float] | str], - ) -> None: - for op_idx, op_log in enumerate(profile_logs): - op_name, _, _ = self._parse_inst_info(insts_info, op_idx) - - ws.cell(row=op_idx + 2, column=self.op_vis_bar_next_col, value=op_name) - - for queue_idx, (start_time, end_time) in enumerate(zip(op_log[::2], op_log[1::2])): - if start_time < 0 or end_time < 0: - continue - task_dur = (end_time - start_time) / 1000 - ws.cell( - row=op_idx + 2, column=self.op_vis_bar_next_col + queue_idx + 1, value=task_dur - ) - - def add_sheet(self, profile_logs: np.ndarray, sheet_name: str) -> "WorkerBookVisualizer": - """Add a sheet to the workbook.""" - wb = self.wb - insts_info = self.exec_plan_desc.op_lists - - ws = wb.create_sheet(sheet_name) - self.timeline_visual_region(ws, profile_logs, insts_info) - self.brief_table_region(ws, profile_logs, insts_info) - - return self - - def add_sm_brief_sheet( - self, profile_logs: np.ndarray, sheet_name: str - ) -> "WorkerBookVisualizer": - """Add a brief sheet to workbook which contains min/max start/end and duration among SMs""" - wb = self.wb - insts_info = 
self.exec_plan_desc.op_lists - ws = wb.create_sheet(sheet_name) - - profile_logs = np.transpose(profile_logs, (1, 0, 2)) - - # 1. init layout - workers_name = self.exec_plan_desc.workers_def - worker_metric_def = [ - "min_start", - "max_end", - "min_dur", - "max_dur", - "mean_dur", - "std_dur", - ] - - worker_cols = len(worker_metric_def) - - self.add_region_cell(ws, "Op Info", 1, self.time_bar_next_col, 1, self.op_stat_bar_cols) - - for worker_id, worker_name in enumerate(workers_name): - start_col = worker_cols * worker_id + self.op_stat_bar_next_col - self.add_region_cell(ws, worker_name, 1, start_col, 1, worker_cols) - for metric_id, metric_name in enumerate(worker_metric_def): - start_col_metric = start_col + metric_id - self.add_region_cell(ws, metric_name, 2, start_col_metric, 1, 1) - - # 2. calc metrics - # profile_logs: (num_ops, num_sm, num_task*2) - for op_idx, op_profile_log in enumerate(profile_logs): - valid_mask = (op_profile_log >= 0) & (op_profile_log < 1e9) - # skip if this op is fully invalid - if not np.any(valid_mask): - continue - - op_name, _, _ = self._parse_inst_info(insts_info, op_idx) - self.add_region_cell(ws, op_name, op_idx + 3, self.time_bar_next_col, 1, 2) - - for queue_idx in range(op_profile_log.shape[1] // 2): - starts = op_profile_log[:, queue_idx * 2] - ends = op_profile_log[:, queue_idx * 2 + 1] - - valid_mask = ( - (starts >= 0) & (starts < 1e9) & (ends >= 0) & (ends < 1e9) & (starts <= ends) - ) - - valid_starts = starts[valid_mask] / 1000 - valid_ends = ends[valid_mask] / 1000 - - if len(valid_starts) == 0: - continue - - min_start = np.min(valid_starts) - max_end = np.max(valid_ends) - durations = valid_ends - valid_starts - - metrics_values = [ - min_start, - max_end, - np.min(durations), - np.max(durations), - np.mean(durations), - np.std(durations), - ] - - # row_idx start from 3, because {1: work_name, 2: metric_name} - # col_idx start from worker::start_col - start_row = op_idx + 3 - start_col = worker_cols * queue_idx 
+ self.op_stat_bar_next_col - color_offset = queue_idx - - for i, value in enumerate(metrics_values): - # color mean and std dev - cell_color = color_offset if i >= 4 else -1 - self.add_region_cell(ws, value, start_row, start_col + i, 1, 1, cell_color) - - return self - - def save(self, out_path: str) -> None: - """Save the workbook to a file.""" - os.makedirs(os.path.dirname(out_path), exist_ok=True) - self.wb.save(out_path) - - -def visualize_profile_logs( - all_profile_logs: np.ndarray, - out_path: str, - inst2opname: list[tuple[str, float, int] | tuple[str, float] | str], - with_mean: bool = False, - with_max: bool = False, -) -> None: - """Visualize profile logs.""" - valid_ctas = np.argwhere(np.any(all_profile_logs != 0, axis=(1, 2)))[:, 0] - filtered_logs = all_profile_logs[valid_ctas] - filtered_masks = np.logical_and(filtered_logs >= 0, filtered_logs < 1e9) - mean_profile_logs = np.mean(filtered_logs, axis=0, where=filtered_masks) - mean_profile_logs[np.isnan(mean_profile_logs)] = -1 - if filtered_logs.size == 0: - return - assemble_profile_logs = np.zeros_like(filtered_logs[0]) - assemble_profile_logs[:, ::2] = np.min( - filtered_logs[..., ::2], axis=0, where=filtered_masks[..., ::2], initial=np.inf - ) - assemble_profile_logs[:, 1::2] = np.max( - filtered_logs[..., 1::2], axis=0, where=filtered_masks[..., 1::2], initial=-np.inf - ) - assemble_profile_logs[np.isinf(assemble_profile_logs)] = -1 - - visualizer = WorkerBookVisualizer(ExecPlanDescriptor(WORKER_NAMES, inst2opname)) - if with_mean: - visualizer.add_sheet(mean_profile_logs, "mean") - if with_max: - raise NotImplementedError("with_max is not implemented") - - visualizer.add_sm_brief_sheet(filtered_logs, "mean_sm_brief") - for block_idx, profile_logs in enumerate(filtered_logs): - profile_logs[profile_logs > 1e9] = -1 - visualizer.add_sheet(profile_logs, f"block_{block_idx}") - visualizer.save(out_path) - - -def parse_profile_log_tensor( - profile_logs_tensor: torch.Tensor, - out_path: str, - 
inst2opname: Any, - with_mean: bool = False, -) -> None: - """Parse a profile log tensor into a dictionary. - - Args: - profile_log_tensor: The profile log tensor. - out_path: The path to save the profile logs. - inst2opname: The mapping from instance index to operation name. - - list[tuple[str, float, int] | tuple[str, float] | str] - - Returns: - None. - """ - # Remove the extra slices for storing instructions and glb bars. - profile_logs_tensor = profile_logs_tensor[:-SLICES_FOR_TILERT_OP, :, :] - - profile_logs = profile_logs_tensor.cpu().detach().numpy() - valid_insts_logs = np.any(profile_logs != 0, axis=(1, 2)) - profile_logs = profile_logs[valid_insts_logs] - valid_blocks_logs = np.any(profile_logs != 0, axis=(0, 2)) - profile_logs = profile_logs[:, valid_blocks_logs, :] - # Return if no valid blocks logs are found. - if profile_logs.size == 0: - print("Warning: No profile logs available.") - return - profile_logs = np.transpose(profile_logs, (1, 0, 2)) - ctx_start_times = profile_logs[:, 0, 0] - profile_logs = profile_logs[:, 1:, :] - profile_logs = (profile_logs - ctx_start_times[:, None, None]).astype(np.float32) / 1.855 - - if Workbook is not None: - visualize_profile_logs(profile_logs, out_path, inst2opname, with_mean) - - -def parse_op_time(profile_logs: torch.Tensor, op_idx: int = 0, block_idx: int = 0) -> None: - data = profile_logs[op_idx, block_idx, :].cpu().numpy() - max_time = data.max() - start_time = data.min() - FREQUENCY = 1850.0 - - worker_names = [ - "controller", - " sync_io", - " io_p0", - " io_p1", - " io_p2", - " consumer", - " extra1", - " extra2", - ] - for i, worker_name in enumerate(worker_names): - if data[i * 2] != max_time: - print( - f"{worker_name}:\tstart:{(data[i * 2] - start_time) / FREQUENCY:.3f}, " - f"duration:{(data[i * 2 + 1] - data[i * 2]) / FREQUENCY:.3f}, " - f"end:{(data[i * 2 + 1] - start_time) / FREQUENCY:.3f}" - ) diff --git a/requirements.txt b/requirements.txt index ae9da40..fd4a9ba 100644 --- 
a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,15 @@ -torch>=2.6.0 +# Runtime dependencies for the v0.1.4 wheel, pinned to the exact ABI the +# wheel was built against. ``torch`` must be installed from PyTorch's cu130 +# index — PyPI's default ``torch`` is a different CUDA build and will not load +# the cu130-linked tilert binary: +# +# pip install --index-url https://download.pytorch.org/whl/cu130 torch==2.11.0 +# pip install -r requirements.txt +# +# The recommended path remains the prebuilt Docker image (see README). +torch==2.11.0 +transformers==4.46.3 +tokenizers==0.20.3 numpy -transformers +scipy +einops diff --git a/tilert/__init__.py b/tilert/__init__.py new file mode 100644 index 0000000..d34ce51 --- /dev/null +++ b/tilert/__init__.py @@ -0,0 +1,91 @@ +"""TileRT Python package. + +Two backend libraries ship with TileRT — one per model family: + + - ``libtilert_dsv32.so`` (DeepSeek-V3.2) + - ``libtilert_glm5.so`` (GLM-5) + +They are NOT loaded at import time. The caller selects a backend via +``load_backend(model_type)`` (done automatically by ``tilert.generate``). +Only one backend may be loaded per process — both register the ``tilert`` +torch-op namespace. Run DSv3.2 and GLM-5 in separate processes. 
+""" + +import ctypes +import logging +import os +from importlib.metadata import PackageNotFoundError +from importlib.metadata import version as pkg_version +from pathlib import Path + +import torch + +if not hasattr(torch, "ops"): + raise RuntimeError("PyTorch is required but torch.ops is not available") + +try: + __version__ = pkg_version("tilert") +except PackageNotFoundError: + __version__ = "0.0.0" + + +def init_logging() -> logging.Logger: + """Initialize logging configuration.""" + logging.basicConfig( + level=logging.DEBUG, + format="%(filename)s:%(lineno)d [%(levelname)s]: %(message)s", + ) + return logging.getLogger(__name__) + + +logger = init_logging() + +_BACKENDS = { + "deepseek_v3_2": "libtilert_dsv32.so", + "glm5": "libtilert_glm5.so", +} + +_loaded_backend: str | None = None + + +def load_backend(model_type: str) -> None: + """Load the backend for ``model_type`` (lazy, once per process). + + DeepSeek-V3.2 and GLM-5 ship as separate libraries; the matching one is + loaded on first use. Loading a second, different backend in the same + process raises (both libraries define the ``tilert`` op namespace). + """ + global _loaded_backend + so_name = _BACKENDS.get(model_type) + if so_name is None: + raise ValueError(f"Unknown model_type {model_type!r}. Supported: {sorted(_BACKENDS)}") + if _loaded_backend is not None: + if _loaded_backend != so_name: + raise RuntimeError( + f"TileRT backend '{_loaded_backend}' already loaded; cannot load " + f"'{so_name}' in the same process. Run {model_type} in a fresh process." 
+ ) + return + pkg_dir = Path(__file__).parent + lib_path = pkg_dir / so_name + if not lib_path.exists(): + fallback = pkg_dir / "libtilert.so" + if not fallback.exists(): + raise RuntimeError(f"Backend library not found: {lib_path}.") + lib_path = fallback + ctypes.CDLL(str(lib_path), mode=ctypes.RTLD_GLOBAL | os.RTLD_LAZY) + torch.ops.load_library(str(lib_path)) + _loaded_backend = so_name + logger.info( + "Loaded TileRT backend %s (%s) for model_type=%s", so_name, lib_path.name, model_type + ) + + +from .tilert_init import tilert_init # noqa: E402 + +__all__ = [ + "logger", + "load_backend", + "tilert_init", + "__version__", +] diff --git a/python/benchmark/__init__.py b/tilert/benchmark/__init__.py similarity index 83% rename from python/benchmark/__init__.py rename to tilert/benchmark/__init__.py index 49a349d..1194444 100644 --- a/python/benchmark/__init__.py +++ b/tilert/benchmark/__init__.py @@ -4,9 +4,8 @@ from typing import TypeAlias from tilert.models.deepseek_v3_2.generator import DSAv32Generator -from tilert.models.glm_5.generator import GLM5Generator -Generator: TypeAlias = DSAv32Generator | GLM5Generator +Generator: TypeAlias = DSAv32Generator @dataclass @@ -15,7 +14,6 @@ class BenchMode: with_mtp: bool label: str - # Sampling parameters — None means keep current generator defaults (top-k1 argmax). 
use_topp: bool = False top_p: float = 1.0 top_k: int = 256 @@ -27,13 +25,25 @@ class CellStats: """Stats for a single table cell (one mode x one benchmark column).""" tok_s: float = 0.0 - ms: float = 0.0 + iters_s: str = "-" acc_rate: str = "-" BenchStats = dict[str, dict[str, CellStats]] +@dataclass +class PerStepData: + """Per-step timing data from a single generation run.""" + + prompt_len: int + time_list: list[float] + accepted_counts: list[int] + + +PerStepDict = dict[str, dict[str, list[PerStepData]]] + + def apply_mode(generator: Generator, mode: BenchMode) -> None: """Apply sampling parameters for a benchmark mode.""" generator.update_sampling_params( @@ -68,16 +78,14 @@ def print_summary_table( if not all_stats: return - # Collect column keys in insertion order (preserves benchmark ordering) col_keys: list[str] = [] for cols in all_stats.values(): for k in cols: if k not in col_keys: col_keys.append(k) - ROW_LABELS = ["tok/s", "ms", "acc"] + ROW_LABELS = ["tok/s", "it/s", "acc"] - # Build formatted cell strings: {mode: {col: [row0, row1, row2]}} formatted: dict[str, dict[str, list[str]]] = {} for mode, cols in all_stats.items(): formatted[mode] = {} @@ -88,11 +96,10 @@ def print_summary_table( else: formatted[mode][k] = [ _fmt(cell.tok_s, "tok/s"), - _fmt(cell.ms, "ms"), + cell.iters_s, cell.acc_rate, ] - # Compute column widths col_widths: dict[str, int] = {} for k in col_keys: w = len(k) @@ -102,22 +109,18 @@ def print_summary_table( col_widths[k] = w mode_width = max(len("Mode"), max(len(m) for m in all_stats)) - # Row label column shares the mode column; pick wider of mode names vs row labels mode_width = max(mode_width, max(len(r) for r in ROW_LABELS)) print(f"\n## Benchmark Summary ({model_name})\n") - # Header hdr = [f" {'Mode':<{mode_width}} "] hdr += [f" {k:<{col_widths[k]}} " for k in col_keys] print("|" + "|".join(hdr) + "|") - # Separator sep = ["-" * (mode_width + 2)] sep += ["-" * (col_widths[k] + 2) for k in col_keys] print("|" + 
"|".join(sep) + "|") - # Data rows: 3 rows per mode mode_list = list(all_stats.keys()) for _, mode in enumerate(mode_list): for row_idx, _row_label in enumerate(ROW_LABELS): diff --git a/python/benchmark/coding_prompt.py b/tilert/benchmark/coding_prompt.py similarity index 54% rename from python/benchmark/coding_prompt.py rename to tilert/benchmark/coding_prompt.py index e4ff6ed..1d98b34 100644 --- a/python/benchmark/coding_prompt.py +++ b/tilert/benchmark/coding_prompt.py @@ -3,17 +3,27 @@ from typing import cast import numpy as np -from benchmark import BenchMode, BenchStats, CellStats, Generator, apply_mode + +from tilert.benchmark import ( + BenchMode, + BenchStats, + CellStats, + Generator, + PerStepData, + PerStepDict, + apply_mode, +) PROMPT = "Hi, can you write a sort program in C for me?" -def run(generator: Generator, modes: list[BenchMode]) -> BenchStats: +def run(generator: Generator, modes: list[BenchMode]) -> tuple[BenchStats, PerStepDict]: """Run the coding-prompt benchmark for each mode. Returns stats with column: Coding. 
""" stats: BenchStats = {} + per_step: PerStepDict = {} for mode in modes: apply_mode(generator, mode) @@ -21,8 +31,8 @@ def run(generator: Generator, modes: list[BenchMode]) -> BenchStats: print(f"Prompt: {PROMPT}") print("Completion:") - _, time_list, accepted_counts = cast( - tuple[str, list[float], list[int]], + _, time_list, accepted_counts, prompt_len = cast( + tuple[str, list[float], list[int], int], generator.generate(PROMPT, True, with_mtp=mode.with_mtp), ) @@ -32,15 +42,25 @@ def run(generator: Generator, modes: list[BenchMode]) -> BenchStats: total_tokens = sum(accepted_counts) total_time = sum(time_list) speed = total_tokens / total_time if total_time > 0 else 0 - avg_ms = total_time / len(time_list) * 1000 avg_a = total_tokens / len(accepted_counts) acc_rate = f"{avg_a:.2f}/{min(accepted_counts)}/{max(accepted_counts)}" - mode_stats["Coding"] = CellStats(tok_s=speed, ms=avg_ms, acc_rate=acc_rate) + iters_s = len(time_list) / total_time if total_time > 0 else 0.0 + mode_stats["Coding"] = CellStats( + tok_s=speed, iters_s=f"{iters_s:.1f} it/s", acc_rate=acc_rate + ) elif time_list: mean_time = float(np.mean(time_list)) speed = 1 / mean_time - mode_stats["Coding"] = CellStats(tok_s=speed, ms=mean_time * 1000) + mode_stats["Coding"] = CellStats(tok_s=speed, iters_s=f"{speed:.1f} it/s") + + per_step[mode.label] = { + "Coding": [ + PerStepData( + prompt_len=prompt_len, time_list=time_list, accepted_counts=accepted_counts + ) + ] + } stats[mode.label] = mode_stats - return stats + return stats, per_step diff --git a/tilert/benchmark/config.py b/tilert/benchmark/config.py new file mode 100644 index 0000000..5f40628 --- /dev/null +++ b/tilert/benchmark/config.py @@ -0,0 +1,69 @@ +"""TileRT configuration file loading. + +Reads model weights paths from ~/.tilert/config.toml so that benchmark scripts +and regression workflows do not need hardcoded paths. 
+ +Config file format (~/.tilert/config.toml): + + [weights] + deepseek_v3_2 = "/path/to/tilert_weights/DeepSeek-V32" + deepseek_v3_2_v2 = "/path/to/tilert_weights/DeepSeek-V32-v2" +""" + +import tomllib +from pathlib import Path + +CONFIG_DIR = Path.home() / ".tilert" +CONFIG_FILE = CONFIG_DIR / "config.toml" + + +def get_config_path() -> Path: + """Return the path to the TileRT config file.""" + return CONFIG_FILE + + +def get_weights_dir(model: str, cli_override: str | None = None) -> str: + """Resolve the weights directory for *model*. + + Resolution order (highest priority first): + 1. *cli_override* (from ``--model-weights-dir`` CLI flag) + 2. ``~/.tilert/config.toml`` → ``[weights].`` + + Raises ``FileNotFoundError`` / ``KeyError`` with a user-friendly message + when the config file or key is missing. + """ + if cli_override is not None: + return cli_override + + config_path = get_config_path() + if not config_path.exists(): + raise FileNotFoundError( + f"No --model-weights-dir provided and config file not found at {config_path}.\n" + f"Create it with:\n\n" + f" mkdir -p {CONFIG_DIR}\n" + f" cat > {config_path} << 'EOF'\n" + f" [weights]\n" + f' deepseek_v3_2 = "/path/to/DeepSeek-V32"\n' + f" EOF\n" + ) + + try: + with open(config_path, "rb") as f: + config = tomllib.load(f) + except tomllib.TOMLDecodeError as e: + raise ValueError( + f"Failed to parse {config_path}: {e}\n" f"Please check the file for syntax errors." 
+ ) from e + + weights = config.get("weights", {}) + if model not in weights: + available = ", ".join(weights.keys()) if weights else "(none)" + raise KeyError( + f"Model {model!r} not found in {config_path} [weights] section.\n" + f"Available models: {available}\n" + f"Add it with:\n\n" + f" [weights]\n" + f' {model} = "/path/to/{model}/weights"\n' + ) + + return str(weights[model]) diff --git a/tilert/benchmark/long_prompt.py b/tilert/benchmark/long_prompt.py new file mode 100644 index 0000000..7df175b --- /dev/null +++ b/tilert/benchmark/long_prompt.py @@ -0,0 +1,82 @@ +"""Long-prompt benchmark: single generation, measures long-form throughput.""" + +from typing import cast + +import numpy as np + +from tilert.benchmark import ( + BenchMode, + BenchStats, + CellStats, + Generator, + PerStepData, + PerStepDict, + apply_mode, +) + +PROMPT = "Hi, can you tell me a very long story, with roughly 3000 words?" + + +def run(generator: Generator, modes: list[BenchMode]) -> tuple[BenchStats, PerStepDict]: + """Run the long-prompt benchmark for each mode. + + Returns stats with column: Long. 
+ """ + stats: BenchStats = {} + per_step: PerStepDict = {} + + for mode in modes: + apply_mode(generator, mode) + print(f"\n--- Long-prompt benchmark ({mode.label}) ---") + print(f"Prompt: {PROMPT}") + print("Completion:") + + _, time_list, accepted_counts, prompt_len = cast( + tuple[str, list[float], list[int], int], + generator.generate(PROMPT, True, with_mtp=mode.with_mtp), + ) + + mode_stats: dict[str, CellStats] = {} + + if mode.with_mtp and accepted_counts: + total_tokens = sum(accepted_counts) + total_time = sum(time_list) + speed = total_tokens / total_time if total_time > 0 else 0 + avg_a = total_tokens / len(accepted_counts) + acc_rate = f"{avg_a:.2f}/{min(accepted_counts)}/{max(accepted_counts)}" + + cumtok = list(np.cumsum(accepted_counts)) + split_idx = next((i for i, t in enumerate(cumtok) if t >= 2048), len(time_list)) + end_idx = next((i for i, t in enumerate(cumtok) if t >= 2048 + 512), len(time_list)) + pre_time = time_list[:split_idx] + post_time = time_list[split_idx:end_idx] + pre_ips = len(pre_time) / sum(pre_time) if pre_time else 0.0 + post_ips = len(post_time) / sum(post_time) if post_time else 0.0 + iters_s = f"{pre_ips:.1f}/{post_ips:.1f} it/s" + + mode_stats["Long"] = CellStats(tok_s=speed, iters_s=iters_s, acc_rate=acc_rate) + elif time_list: + mean_time = float(np.mean(time_list)) + speed = 1 / mean_time + + split_idx = min(2048, len(time_list)) + end_idx = min(2048 + 512, len(time_list)) + pre_time = time_list[:split_idx] + post_time = time_list[split_idx:end_idx] + pre_ips = len(pre_time) / sum(pre_time) if pre_time else 0.0 + post_ips = len(post_time) / sum(post_time) if post_time else 0.0 + iters_s = f"{pre_ips:.1f}/{post_ips:.1f} it/s" + + mode_stats["Long"] = CellStats(tok_s=speed, iters_s=iters_s) + + per_step[mode.label] = { + "Long": [ + PerStepData( + prompt_len=prompt_len, time_list=time_list, accepted_counts=accepted_counts + ) + ] + } + + stats[mode.label] = mode_stats + + return stats, per_step diff --git 
a/python/benchmark/short_prompt.py b/tilert/benchmark/short_prompt.py similarity index 66% rename from python/benchmark/short_prompt.py rename to tilert/benchmark/short_prompt.py index bebd2ce..4bdebe2 100644 --- a/python/benchmark/short_prompt.py +++ b/tilert/benchmark/short_prompt.py @@ -1,41 +1,59 @@ -"""Short-prompt benchmark: 20 iterations, measures steady-state decode throughput.""" +"""Short-prompt benchmark: 1 warmup + 20 iterations, measures steady-state decode throughput.""" from typing import cast import numpy as np -from benchmark import BenchMode, BenchStats, CellStats, Generator, apply_mode + +from tilert.benchmark import ( + BenchMode, + BenchStats, + CellStats, + Generator, + PerStepData, + PerStepDict, + apply_mode, +) PROMPT = "Tell me 10 jokes, keep them all under 100 words." NUM_ITERS = 20 TOKEN_CHECKPOINTS = [200] -def run(generator: Generator, modes: list[BenchMode]) -> BenchStats: +def run(generator: Generator, modes: list[BenchMode]) -> tuple[BenchStats, PerStepDict]: """Run the short-prompt benchmark for each mode. Returns stats with columns: Short@ for each checkpoint. 
""" stats: BenchStats = {} + per_step: PerStepDict = {} for mode in modes: apply_mode(generator, mode) print(f"\n--- Short-prompt benchmark ({mode.label}) ---", flush=True) + print(" warmup...", flush=True) + generator.generate(PROMPT, False, with_mtp=mode.with_mtp) + all_times: list[list[float]] = [] all_accepted: list[list[int]] = [] all_results: list[str] = [] + all_per_step_data: list[PerStepData] = [] for _iter in range(NUM_ITERS): if _iter % 5 == 0: print(f" iter {_iter}/{NUM_ITERS}...", flush=True) - result, time_list, accepted_counts = cast( - tuple[str, list[float], list[int]], + result, time_list, accepted_counts, prompt_len = cast( + tuple[str, list[float], list[int], int], generator.generate(PROMPT, False, with_mtp=mode.with_mtp), ) all_times.append(time_list) all_accepted.append(accepted_counts) all_results.append(result) + all_per_step_data.append( + PerStepData( + prompt_len=prompt_len, time_list=time_list, accepted_counts=accepted_counts + ) + ) - # Verify determinism and print output once mismatches = [i for i, r in enumerate(all_results) if r != all_results[0]] if mismatches: print(f" WARNING: non-deterministic output at iters {mismatches}") @@ -47,21 +65,21 @@ def run(generator: Generator, modes: list[BenchMode]) -> BenchStats: if mode.with_mtp: for token_num in TOKEN_CHECKPOINTS: speeds: list[float] = [] + iter_rates: list[float] = [] for time_list, accepted_list in zip(all_times, all_accepted): if time_list and accepted_list: cumsum_tokens = np.cumsum(accepted_list) cumsum_times = np.cumsum(time_list) idx = int(np.searchsorted(cumsum_tokens, token_num)) - # If total tokens < token_num, use all available data if idx >= len(cumsum_times): idx = len(cumsum_times) - 1 tok_count = int(cumsum_tokens[idx]) elapsed = float(cumsum_times[idx]) if elapsed > 0: speeds.append(tok_count / elapsed) + iter_rates.append((idx + 1) / elapsed) if speeds: speed = float(np.mean(speeds)) - mean_time = 1 / speed flat_accepted = [a for al in all_accepted for a in al] 
def get_generator(
    model_type: str,
    max_new_tokens: int,
    temperature: float,
    model_weights_dir: str,
    with_mtp: bool,
    top_p: float = 0.9,
    top_k: int = 256,
    enable_thinking: bool = False,
    sampling_seed: int = 42,
) -> "DSAv32Generator | GLM5Generator":
    """Load the matching backend .so and build the generator for ``model_type``.

    DeepSeek-V3.2 and GLM-5 ship as separate libraries; only one backend loads
    per process. Generators are imported lazily after the backend is loaded.

    Args:
        model_type: ``"deepseek_v3_2"`` or ``"glm5"``.
        max_new_tokens: Forwarded to the generator.
        temperature: Forwarded to the generator.
        model_weights_dir: Directory holding the model weights.
        with_mtp: Whether the generator is built with MTP enabled.
        top_p: Top-p threshold; top-p sampling is switched on when this is < 1.0.
        top_k: Forwarded to the generator.
        enable_thinking: Forwarded to the generator.
        sampling_seed: Forwarded to the generator.

    Raises:
        ValueError: ``model_type`` is not one of the supported values.
    """
    tilert.load_backend(model_type)

    if model_type == "deepseek_v3_2":
        from tilert.models.deepseek_v3_2.generator import DSAv32Generator
        from tilert.models.deepseek_v3_2.model_args import ModelArgs as DSAv32ModelArgs

        return DSAv32Generator(
            model_args=DSAv32ModelArgs(),
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            model_weights_dir=model_weights_dir,
            with_mtp=with_mtp,
            top_p=top_p,
            top_k=top_k,
            use_topp=top_p < 1.0,
            sampling_seed=sampling_seed,
            enable_thinking=enable_thinking,
        )

    if model_type == "glm5":
        from tilert.models.glm_5.generator import GLM5Generator
        from tilert.models.glm_5.model_args import ModelArgsGLM5

        return GLM5Generator(
            model_args=ModelArgsGLM5(),
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            model_weights_dir=model_weights_dir,
            with_mtp=with_mtp,
            top_p=top_p,
            top_k=top_k,
            use_topp=top_p < 1.0,
            enable_thinking=enable_thinking,
            sampling_seed=sampling_seed,
        )

    raise ValueError(f"unsupported model_type: {model_type!r}")


def parse_args(argv=None):  # type: ignore
    """Parse the command line.

    Args:
        argv: Argument list to parse; ``None`` (the default) parses ``sys.argv[1:]``.

    Returns:
        The populated ``argparse.Namespace``.
    """
    parser = ArgumentParser(description="Command-line interface for text generation.")
    parser.add_argument(
        "--model-weights-dir",
        type=str,
        default=None,
        help="Path to model weights directory (resolved from ~/.tilert/config.toml if omitted)",
    )
    parser.add_argument(
        "--model",
        type=str,
        default="deepseek_v3_2",
        choices=["deepseek_v3_2", "glm5"],
        help="Model type to use (default: deepseek_v3_2).",
    )
    parser.add_argument("--max-new-tokens", type=int, default=4000, help="Max tokens to generate")
    parser.add_argument("--temperature", type=float, default=1.0, help="Sampling temperature")
    parser.add_argument(
        "--top-p",
        type=float,
        default=1.0,
        help="Top-p (nucleus) sampling threshold. Use < 1.0 to enable top-p sampling (e.g. 0.9)",
    )
    parser.add_argument("--top-k", type=int, default=256, help="Top-k sampling threshold")
    parser.add_argument("--interactive", action="store_true")
    parser.add_argument(
        "--with-mtp",
        action="store_true",
        help="Enable MTP (Multi-Token Prediction) for speculative decoding",
    )
    parser.add_argument(
        "--use-random-weights",
        action="store_true",
        help="Use random weights instead of pretrained (for testing MTP without real weights)",
    )
    parser.add_argument(
        "--enable-thinking",
        action="store_true",
        help="Enable thinking mode in chat template",
    )
    parser.add_argument(
        "--sampling-seed",
        type=int,
        default=42,
        help="Sampling seed for top-p sampling (fixed per request, default: 42)",
    )
    parser.add_argument(
        "--model-name",
        type=str,
        default=None,
        help="Override display name for benchmark tables",
    )
    parser.add_argument(
        "--tag",
        type=str,
        default=None,
        help="Tag for regression_plots/ directory (default: auto-detect from git state)",
    )
    parser.add_argument(
        "--modes",
        type=str,
        default=None,
        help="Comma-separated mode filters: top-k1,top-p0.95 (default: all)",
    )
    parser.add_argument(
        "--workloads",
        type=str,
        default=None,
        help="Comma-separated workload filters: short,coding,long (default: all)",
    )
    parser.add_argument(
        "--enable-logprobs",
        action="store_true",
        help="Enable kernel-level top-256 logprobs export (for benchmarking overhead)",
    )
    return parser.parse_args(argv)


# Workload names in the order the benchmarks are run.
_WORKLOAD_ORDER = ("short", "coding", "long")


def _split_csv(spec: str) -> set[str]:
    """Split a comma-separated CLI value into stripped, non-empty tokens."""
    return {tok.strip() for tok in spec.split(",") if tok.strip()}


def _select_modes(
    spec: "str | None", top_p: float, top_k: int, temperature: float
) -> "list[BenchMode]":
    """Build the benchmark modes and keep those whose label contains a ``--modes`` token."""
    bench_top_p = top_p if top_p < 1.0 else 0.95
    modes = [
        BenchMode(with_mtp=False, label="top-k1 w/o MTP"),
        BenchMode(with_mtp=True, label="top-k1 w/ MTP"),
        BenchMode(
            with_mtp=True,
            label=f"top-p{bench_top_p} w/ MTP",
            use_topp=True,
            top_p=bench_top_p,
            top_k=top_k,
            temperature=temperature,
        ),
    ]
    if not spec:
        return modes

    # Empty tokens are dropped first: "" is a substring of every label, so a
    # stray comma would otherwise select all modes.
    wanted = _split_csv(spec)
    selected = [m for m in modes if any(tok in m.label for tok in wanted)]
    if not selected:
        raise SystemExit(
            f"Error: --modes '{spec}' matched no benchmark modes. "
            f"Valid tokens: top-k1, top-p{bench_top_p}"
        )
    return selected


def _select_workloads(spec: "str | None") -> "list[str]":
    """Return the requested workload names in run order, warning about unknown ones."""
    requested = _split_csv(spec) if spec else set(_WORKLOAD_ORDER)
    unknown = sorted(requested.difference(_WORKLOAD_ORDER))
    if unknown:
        print(f"Warning: ignoring unknown workloads: {', '.join(unknown)}")
    selected = [name for name in _WORKLOAD_ORDER if name in requested]
    if not selected:
        raise SystemExit(
            f"Error: --workloads '{spec}' matched no workloads. "
            f"Valid values: short, coding, long"
        )
    return selected


def _load_weights(generator: "DSAv32Generator | GLM5Generator", use_random_weights: bool) -> float:
    """Initialise random weights or load pretrained ones; return the elapsed seconds."""
    t0 = time.monotonic()
    if use_random_weights:
        print("Initializing random weights...")
        if hasattr(generator, "init"):
            generator.init()  # type: ignore[union-attr]
        generator.init_random_weights()
    else:
        print("Loading pretrained weights...")
        generator.from_pretrained()
    return time.monotonic() - t0


def _run_interactive(generator: "DSAv32Generator | GLM5Generator") -> None:
    """Read prompts from stdin and generate until '/exit' or end of input."""
    print("Welcome to the TileRT interactive mode! Type '/exit' to exit.")
    while True:
        try:
            prompt = input(">>> ")
        except EOFError:
            # Ctrl-D / closed stdin: leave the loop like '/exit' instead of crashing.
            print()
            break
        if prompt == "/exit":
            break
        _ = generator.generate(prompt)  # type: ignore[has-type]


def main() -> None:
    """Parse the CLI, load the model, then chat interactively or run the benchmarks."""
    args = parse_args()

    model_name = args.model_name or args.model.upper()
    model_weights_dir = get_weights_dir(args.model, cli_override=args.model_weights_dir)

    # Resolve the benchmark plan before the model is built so that a bad
    # --modes/--workloads value fails immediately, not after loading weights.
    modes: list[BenchMode] = []
    workloads: list[str] = []
    if not args.interactive:
        modes = _select_modes(args.modes, args.top_p, args.top_k, args.temperature)
        workloads = _select_workloads(args.workloads)

    # Benchmark mode always builds the generator with MTP; each BenchMode then
    # passes its own with_mtp to generate(). --with-mtp only matters interactively.
    with_mtp = args.with_mtp if args.interactive else True

    generator = get_generator(
        model_type=args.model,
        max_new_tokens=args.max_new_tokens,
        temperature=args.temperature,
        model_weights_dir=model_weights_dir,
        with_mtp=with_mtp,
        top_p=args.top_p,
        top_k=args.top_k,
        enable_thinking=args.enable_thinking,
        sampling_seed=args.sampling_seed,
    )

    load_time = _load_weights(generator, args.use_random_weights)

    try:
        if args.enable_logprobs:
            if hasattr(generator.decode_layer, "set_logprobs_enabled"):
                generator.decode_layer.set_logprobs_enabled(True)  # type: ignore[union-attr]
                print("Logprobs export enabled (top-256)")
            else:
                print(f"Warning: logprobs not supported for {type(generator).__name__}")

        if args.interactive:
            _run_interactive(generator)
        else:
            runners = {
                "short": short_bench.run,
                "coding": coding_bench.run,
                "long": long_bench.run,
            }
            t0 = time.monotonic()
            all_bench_results = [
                runners[name](generator, modes) for name in workloads  # type: ignore[arg-type]
            ]
            bench_time = time.monotonic() - t0
            all_bench_stats = [stats for stats, _ in all_bench_results]

            print_summary_table(
                merge_stats(all_bench_stats),
                model_name=model_name,
            )

            total = load_time + bench_time
            print(f"\n## {model_name} Timing")
            print()
            print("| Phase | Time |")
            print("|-------|------|")
            print(f"| Loading | {load_time:.1f}s |")
            print(f"| Benchmark | {bench_time:.1f}s |")
            print(f"| **Total** | **{total:.1f}s** |")
    finally:
        # Runs on errors and Ctrl-C as well, once the weights have been loaded.
        print("Cleaning up...")
        generator.cleanup()


# Usage (run as a module; --model-weights-dir may be omitted if the path is
# registered under ~/.tilert/config.toml). Run DeepSeek-V3.2 and GLM-5 in
# separate processes -- the two backends cannot coexist in one interpreter.
#
# DeepSeek-V3.2 -- standard generation with pretrained weights:
#   python -m tilert.generate --model deepseek_v3_2 \
#       --model-weights-dir /path/to/DeepSeek-V3.2-TileRT \
#       --max-new-tokens 1000 2>&1 | tee test.log
#
# DeepSeek-V3.2 -- MTP generation with random weights (for testing):
#   python -m tilert.generate --model deepseek_v3_2 --with-mtp --use-random-weights \
#       --model-weights-dir /path/to/DeepSeek-V3.2-TileRT \
#       --max-new-tokens 1000 2>&1 | tee test.log
#
# DeepSeek-V3.2 -- MTP generation with pretrained weights:
#   python -m tilert.generate --model deepseek_v3_2 --with-mtp \
#       --model-weights-dir /path/to/DeepSeek-V3.2-TileRT \
#       --max-new-tokens 1000 2>&1 | tee test.log
#
# GLM-5 -- standard generation:
#   python -m tilert.generate --model glm5 \
#       --model-weights-dir /path/to/GLM-5-FP8-TileRT \
#       --max-new-tokens 1000 2>&1 | tee test.log
#
# GLM-5 -- MTP generation:
#   python -m tilert.generate --model glm5 --with-mtp \
#       --model-weights-dir /path/to/GLM-5-FP8-TileRT \
#       --max-new-tokens 1000 2>&1 | tee test.log
if __name__ == "__main__":
    main()
""" + _SUPPORTED_ALGORITHMS: ClassVar[dict[str, list[Enum]]] = {} + _VALID_COMPUTE_KERNEL_TYPES: ClassVar[frozenset[str]] = frozenset( + { + "bf16", + "fp8", + "fp8mma", + "general", + "bf16mma", + "fp16mma", + "fp8mma_68cta", + } + ) + + @classmethod + def get_supported_algorithms(cls, arch_name: str) -> list[Enum]: + """Return supported algorithms for the given architecture.""" + if arch_name not in cls._SUPPORTED_ALGORITHMS: + raise ValueError( + f"{cls.__name__} does not support arch '{arch_name}'. " + f"Supported: {list(cls._SUPPORTED_ALGORITHMS.keys())}" + ) + return cls._SUPPORTED_ALGORITHMS[arch_name] + def __init__( self, op_name: str = "", @@ -45,7 +70,7 @@ def __init__( tilert_weights_dir: str = "", layer_idx: int = 0, compute_kernel_type: str = "bf16", - model_args: ModelArgs | None = None, + model_args: ModelArgsLike | None = None, num_devices: int = 8, device_id: int = 0, *args: Any, @@ -64,7 +89,7 @@ def __init__( """ super().__init__(*args, **kwargs) - self.model_args = model_args if model_args is not None else ModelArgs() + self.model_args: ModelArgsLike = model_args if model_args is not None else ModelArgs() self.num_devices = num_devices self.device_id = device_id self.algorithm: Enum | None = None @@ -79,10 +104,10 @@ def __init__( self.flag_enable_tilert = False - if compute_kernel_type not in ["bf16", "fp8", "fp8mma"]: + if compute_kernel_type not in self._VALID_COMPUTE_KERNEL_TYPES: raise ValueError( - f"Invalid compute kernel type: {compute_kernel_type}, \ - must be one of bf16, fp8, fp8mma." + f"Invalid compute kernel type: {compute_kernel_type}, " + f"must be one of {sorted(self._VALID_COMPUTE_KERNEL_TYPES)}." ) self.compute_kernel_type = compute_kernel_type @@ -112,6 +137,14 @@ def set_algorithm(self, algorithm: Enum) -> None: Args: algorithm: Algorithm. 
""" + if self._SUPPORTED_ALGORITHMS: + arch = self.model_args.arch_name + supported = self.get_supported_algorithms(arch) + if algorithm not in supported: + raise ValueError( + f"{type(self).__name__}: algorithm {algorithm} not supported " + f"for arch '{arch}'. Supported: {supported}" + ) self.algorithm = algorithm def register_weights(self, weights_config: dict[str, dict[str, Any]]) -> None: @@ -214,7 +247,11 @@ class SerializableTileRTModule(TileRTModule): """Serializable TileRT module.""" def __init__( - self, model_args: ModelArgs, device_id: int, num_devices: int, remove_selected: bool = False + self, + model_args: ModelArgsLike, + device_id: int, + num_devices: int, + remove_selected: bool = False, ): super().__init__( type(self).__name__, model_args=model_args, device_id=device_id, num_devices=num_devices @@ -284,14 +321,21 @@ def init_tilert_weights(self, state_dict: dict[str, torch.Tensor]) -> None: for op, prefix, suffix, retain_weights in zip( self.exec_seq, self.prefix_seq, self.suffix_seq, self.retain_weights_seq ): + if op.is_tilert_weights_init: + logger.debug(f"Skipping init_tilert_weights for {op.op_name} (already initialized)") + continue + keys_to_remove = set() op_state_dict = {} for op_key in op.get_tilert_weights_alias(): original_key = f"{prefix}{op_key}{suffix}" - op_state_dict[op_key] = state_dict[original_key] - if self.remove_selected: - keys_to_remove.add(original_key) + if original_key in state_dict: + op_state_dict[op_key] = state_dict[original_key] + if self.remove_selected: + keys_to_remove.add(original_key) + op.init_tilert_weights(op_state_dict) + if self.remove_selected and not retain_weights: for k in keys_to_remove: del state_dict[k] diff --git a/tilert/models/common.py b/tilert/models/common.py new file mode 100644 index 0000000..6d6f436 --- /dev/null +++ b/tilert/models/common.py @@ -0,0 +1,133 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, cast + +import torch +import torch.nn as nn +import 
torch.nn.functional as F + +if TYPE_CHECKING: + from tilert.models.deepseek_v3_2.refs.kernel import act_quant, fp8_gemm, weight_dequant + +__all__ = [ + "act_quant", + "fp8_gemm", + "weight_dequant", + "init_func", + "linear", + "RMSNorm", +] + +from tilert.models.deepseek_config import ( + block_size, + gemm_impl, +) + +_LAZY_IMPORTS = {"act_quant", "fp8_gemm", "weight_dequant"} + + +def __getattr__(name: str) -> object: + if name in _LAZY_IMPORTS: + from tilert.models.deepseek_v3_2.refs import kernel + + attr = getattr(kernel, name) + globals()[name] = attr + return attr + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + + +def _get_scale_tensor(tensor: torch.Tensor) -> torch.Tensor: + """Return the dynamically attached ``scale`` tensor.""" + scale = getattr(tensor, "scale", None) + if scale is None: + raise AttributeError("Expected quantized tensor to carry a 'scale' attribute.") + return cast(torch.Tensor, scale) + + +def init_func(x_in: torch.Tensor) -> torch.Tensor: + x_dtype = x_in.dtype + x_fp32 = x_in.to(torch.float32) + if x_fp32.dim() >= 2: + initial_tensor = nn.init.kaiming_uniform_(x_fp32) + else: + initial_tensor = nn.init.uniform_(x_fp32) + return initial_tensor.to(x_dtype) + + +def linear( + x_in: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor | None = None, + scale_fmt: str | None = None, +) -> torch.Tensor: + """ + Applies a linear transformation to the incoming data: y = xA^T + b. + + Args: + x_in (torch.Tensor): The input tensor. + weight (torch.Tensor): The weight tensor. It may be quantized. + bias (Optional[torch.Tensor]): The bias tensor to be added. Default is None. + + Returns: + torch.Tensor: The result of the linear transformation. 
+ """ + if weight.element_size() > 1: + return F.linear(x_in, weight, bias) + + from tilert.models.deepseek_v3_2.refs.kernel import act_quant, fp8_gemm, weight_dequant + + if gemm_impl == "bf16": + weight = weight_dequant(weight, _get_scale_tensor(weight)) + return F.linear(x_in, weight, bias) + + x_quant: torch.Tensor + scale: torch.Tensor + x_quant, scale = act_quant(x_in, block_size, scale_fmt) + y_out: torch.Tensor = fp8_gemm(x_quant, scale, weight, _get_scale_tensor(weight)) + if bias is not None: + y_out += bias + return y_out + + +class RMSNorm(nn.Module): + """ + Root Mean Square Layer Normalization (RMSNorm). + + Args: + dim (int): Dimension of the input tensor. + eps (float): Epsilon value for numerical stability. Defaults to 1e-6. + """ + + def __init__(self, dim: int, eps: float = 1e-6, weight: torch.Tensor | None = None): + super().__init__() + self.dim = dim + self.eps = eps + + if weight is None: + self.weight = nn.Parameter(init_func(torch.empty(dim, dtype=torch.float32))) + else: + self.weight = torch.nn.Parameter(weight) + + def forward( + self, x: torch.Tensor, residual: torch.Tensor | None = None + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + """ + Forward pass for RMSNorm. + + Args: + x (torch.Tensor): Input tensor. + + Returns: + torch.Tensor: Normalized tensor with the same shape as input. 
+ """ + dtype = torch.bfloat16 + if residual is None: + x = x.float() + var_s = x.pow(2).mean(-1, keepdim=True) + x = x * torch.rsqrt(var_s + self.eps) + return (self.weight * x).to(dtype) + + x = residual = x.float() + residual.float() + var_s = x.pow(2).mean(-1, keepdim=True) + x = x * torch.rsqrt(var_s + self.eps) + return (self.weight * x).to(dtype), residual.to(dtype) diff --git a/tilert/models/deepseek_config.py b/tilert/models/deepseek_config.py new file mode 100644 index 0000000..ec3701d --- /dev/null +++ b/tilert/models/deepseek_config.py @@ -0,0 +1,24 @@ +"""Global configuration for DeepSeek models.""" + +from typing import Literal + +import torch.distributed as dist + +__all__ = [ + "get_world_size", + "get_rank", + "block_size", + "gemm_impl", +] + + +def get_world_size() -> int: + return dist.get_world_size() if dist.is_initialized() else 8 + + +def get_rank() -> int: + return dist.get_rank() if dist.is_initialized() else 0 + + +block_size = 128 +gemm_impl: Literal["bf16", "fp8"] = "bf16" diff --git a/python/models/deepseek_v3_2/__init__.py b/tilert/models/deepseek_v3_2/__init__.py similarity index 100% rename from python/models/deepseek_v3_2/__init__.py rename to tilert/models/deepseek_v3_2/__init__.py diff --git a/python/models/deepseek_v3_2/generator.py b/tilert/models/deepseek_v3_2/generator.py similarity index 81% rename from python/models/deepseek_v3_2/generator.py rename to tilert/models/deepseek_v3_2/generator.py index 3813259..fb7a467 100644 --- a/python/models/deepseek_v3_2/generator.py +++ b/tilert/models/deepseek_v3_2/generator.py @@ -40,6 +40,7 @@ def __init__( top_p: float = 0.9, top_k: int = 256, sampling_seed: int = 42, + enable_thinking: bool = False, ): """Initialize the DSAv32Generator. @@ -52,6 +53,8 @@ def __init__( top_p: Top-p threshold for nucleus sampling. Defaults to 0.9. top_k: Number of top-k candidates for top-p sampling. Defaults to 256. sampling_seed: Sampling seed for top-p (fixed per request). Defaults to 42. 
+ enable_thinking: Whether to enable thinking mode in the chat template. + Maps to the DSv32 tokenizer's ``thinking`` Jinja variable. """ torch.set_num_threads(64) self.model_weights_dir = model_weights_dir @@ -63,13 +66,14 @@ def __init__( self.top_p = top_p self.top_k = top_k self.sampling_seed = sampling_seed + self.enable_thinking = enable_thinking self.config = model_args self.tokenizer = AutoTokenizer.from_pretrained( self.model_weights_dir, trust_remote_code=True ) # nosec B615 self.eos_id = self.tokenizer.eos_token_id - self.batch_size = 1 # fixed batch size to 1 for now + self.batch_size = 1 self.default_device = torch.device("cuda:0") @@ -100,6 +104,37 @@ def from_pretrained(self) -> None: """Load the model weights from the given path.""" self.decode_layer.from_pretrained(self.model_weights_dir) + def extract_ffn_cache(self) -> tuple[dict[int, list], dict[int, set[str]]]: + """Extract MOE/MLP op objects and skip keys from current loaded weights. + + Returns: + Tuple of (cached_ffn_ops_per_device, skip_keys_per_device). 
+ """ + from tilert.models.deepseek_v3_2.modules.end2end import ( + _extract_ffn_ops, + _get_moe_weight_keys, + ) + + cached_ffn_ops: dict[int, list] = {} + skip_keys: dict[int, set[str]] = {} + for device_id in range(self.decode_layer.num_devices): + dsa = self.decode_layer._dsa_objects[device_id] + if dsa is None: + raise RuntimeError(f"Device {device_id} Dsa not available for cache extraction") + cached_ffn_ops[device_id] = _extract_ffn_ops(dsa) + skip_keys[device_id] = _get_moe_weight_keys(dsa) + return cached_ffn_ops, skip_keys + + def from_pretrained_with_cache( + self, + cached_ffn_ops_per_device: dict[int, list], + skip_keys_per_device: dict[int, set[str]], + ) -> None: + """Load weights reusing cached MOE/MLP ops.""" + self.decode_layer.from_pretrained_with_cache( + self.model_weights_dir, cached_ffn_ops_per_device, skip_keys_per_device + ) + def update_sampling_params( self, temperature: float = 1.0, @@ -123,7 +158,7 @@ def generate( print_log: bool = True, with_mtp: bool | None = None, prompt_tokens: list[int] | None = None, - ) -> tuple[str, list[float], list[int]]: + ) -> tuple[str, list[float], list[int], int]: """Main function to load the model and perform single sequence generation. Args: @@ -135,7 +170,7 @@ def generate( and use these tokens directly (useful for exact-length benchmarking). Returns: - Tuple of (result_text, time_list, accepted_counts). + Tuple of (result_text, time_list, accepted_counts, prompt_len). accepted_counts is empty for non-MTP mode. 
""" active_mtp = with_mtp if with_mtp is not None else self.with_mtp @@ -144,10 +179,10 @@ def generate( self.decode_layer.set_sampling_seed(self.sampling_seed, with_mtp=active_mtp) if active_mtp: return self._generate_with_mtp(prompt, print_log, prompt_tokens=prompt_tokens) - result, time_list = self._generate_without_mtp( + result, time_list, prompt_len = self._generate_without_mtp( prompt, print_log, with_mtp=active_mtp, prompt_tokens=prompt_tokens ) - return result, time_list, [] # Empty accepted_counts for non-MTP + return result, time_list, [], prompt_len def _generate_without_mtp( self, @@ -155,17 +190,15 @@ def _generate_without_mtp( print_log: bool = True, with_mtp: bool = False, prompt_tokens: list[int] | None = None, - ) -> tuple[str, list[float]]: + ) -> tuple[str, list[float], int]: """Standard generation without MTP.""" if prompt_tokens is None: prompt_tokens = self.tokenizer.apply_chat_template( - [{"role": "user", "content": prompt}], add_generation_prompt=True + [{"role": "user", "content": prompt}], + add_generation_prompt=True, + thinking=self.enable_thinking, ) - # adapt to transformers 5.2.0 - if not isinstance(prompt_tokens, list) and prompt_tokens.get("input_ids") is not None: - prompt_tokens = prompt_tokens["input_ids"] - assert prompt_tokens is not None max_seq_len = self.config.max_seq_len prompt_len = len(prompt_tokens) total_len = min(max_seq_len, self.max_new_tokens + prompt_len) @@ -193,9 +226,8 @@ def _generate_without_mtp( time_list.append(end_time - start_time) intermediates, *_ = multi_devices_results[0] - next_token = intermediates[Idx.TOKEN_OUT][0][0] # only the first token + next_token = intermediates[Idx.TOKEN_OUT][0][0] - # replace the next token with the prompt token if the prompt mask is True next_token = torch.where( prompt_mask[0, cur_pos_val], tokens[0, cur_pos_val], next_token ) @@ -219,7 +251,6 @@ def _generate_without_mtp( stats_time(time_list, "==== Performance ====") print("\n") - # Reset sequence after generation, 
i.e. reset the cur_pos to 0 internally self.decode_layer.reset_sequence() completion_tokens = [] @@ -231,29 +262,26 @@ def _generate_without_mtp( decoded_tokens = self.tokenizer.batch_decode(completion_tokens, skip_special_tokens=True) - return f"{decoded_tokens[0]}\n" if decoded_tokens else "", time_list + return f"{decoded_tokens[0]}\n" if decoded_tokens else "", time_list, prompt_len def _generate_with_mtp( self, prompt: str, print_log: bool = True, prompt_tokens: list[int] | None = None, - ) -> tuple[str, list[float], list[int]]: + ) -> tuple[str, list[float], list[int], int]: """Generation with MTP (Multi-Token Prediction) speculative decoding.""" if prompt_tokens is None: prompt_tokens = self.tokenizer.apply_chat_template( - [{"role": "user", "content": prompt}], add_generation_prompt=True + [{"role": "user", "content": prompt}], + add_generation_prompt=True, + thinking=self.enable_thinking, ) - # adapt to transformers 5.2.0 - if not isinstance(prompt_tokens, list) and prompt_tokens.get("input_ids") is not None: - prompt_tokens = prompt_tokens["input_ids"] - assert prompt_tokens is not None max_seq_len = self.config.max_seq_len prompt_len = len(prompt_tokens) total_len = min(max_seq_len, self.max_new_tokens + prompt_len) - # Output tokens buffer tokens = torch.full( (self.batch_size, total_len), -1, dtype=torch.long, device=self.default_device ) @@ -263,17 +291,14 @@ def _generate_with_mtp( prefill_time_list = [] decode_time_list = [] - decode_accepted_counts = [] # Only track decode phase for statistics - cur_pos = 0 # Current position in the output sequence + decode_accepted_counts = [] + cur_pos = 0 - # Prefill phase: process prompt tokens in non-overlapping chunks. - # Each chunk fills unique KV cache positions for both main model and MTP[0]. 
while cur_pos < prompt_len - 1: draft_end = min(cur_pos + self.mtp_seq_len, prompt_len) draft_tokens = tokens[0, cur_pos:draft_end].clone() actual_token_count = draft_tokens.shape[0] - # Pad if needed (use last token for padding) if actual_token_count < self.mtp_seq_len: pad_token = draft_tokens[-1].item() padding = torch.full( @@ -286,18 +311,13 @@ def _generate_with_mtp( draft_tokens = draft_tokens.reshape(1, self.mtp_seq_len).to(torch.int32) - # Provide the extra token for MTP[0]'s shifted input last position. - # MTP[0] needs tokens[cur_pos+1 : cur_pos+mtp_seq_len+1], so the - # extra token is at cur_pos + mtp_seq_len. mtp_extra_pos = cur_pos + self.mtp_seq_len if mtp_extra_pos < prompt_len: mtp_extra_token = int(tokens[0, mtp_extra_pos].item()) else: - # Beyond prompt — use last valid draft token as padding mtp_extra_token = int(tokens[0, draft_end - 1].item()) self.decode_layer.set_prefill_mtp_extra_token(mtp_extra_token) - # Tell GPU how many tokens are valid (for cur_pos advancement) self.decode_layer.set_prefill_valid_tokens(actual_token_count) start_time = time.time() @@ -305,27 +325,16 @@ def _generate_with_mtp( end_time = time.time() prefill_time_list.append(end_time - start_time) - # No overlap: advance by the full actual_token_count cur_pos += actual_token_count - # After no-overlap prefill, cur_pos may have overshot to prompt_len. - # Reset to prompt_len - 1 for correct decode start (first decode - # reprocesses the last prompt token position). 
cur_pos = prompt_len - 1 self.set_cur_pos(prompt_len - 1) - # Decode phase: speculative decoding - # Set prefill_valid_tokens to 0 to switch to decode mode self.decode_layer.set_prefill_valid_tokens(0) finished = False while cur_pos < total_len - 1 and not finished: - # Get next_draft_tokens from previous iteration - # (or use last prompt tokens for first decode) if cur_pos == prompt_len - 1: - # First decode iteration: use last prompt token repeated as placeholder drafts - # We can't use [t6, t7, t8, t9] because that would apply wrong RoPE positions - # (cur_pos=9 means positions 9,10,11,12, but t6 should be at position 6) last_token = tokens[0, prompt_len - 1].item() draft_tokens = torch.full( (self.mtp_seq_len,), @@ -335,7 +344,6 @@ def _generate_with_mtp( ) draft_tokens = draft_tokens.reshape(1, self.mtp_seq_len).to(torch.int32) else: - # Use next_draft_tokens from previous iteration draft_tokens = self.decode_layer.get_next_draft_tokens(0).reshape( 1, self.mtp_seq_len ) @@ -346,11 +354,9 @@ def _generate_with_mtp( decode_time_list.append(end_time - start_time) num_accepted = self.decode_layer.get_num_accepted(0) - # Use predicted_tokens for output (not next_draft_tokens which is for next iteration) predicted_tokens = self.decode_layer.get_predicted_tokens(0).flatten() decode_accepted_counts.append(num_accepted) - # Add accepted tokens to output num_output_tokens = num_accepted for i in range(num_output_tokens): if cur_pos + 1 + i >= total_len: @@ -358,12 +364,10 @@ def _generate_with_mtp( new_token = int(predicted_tokens[i].item()) tokens[0, cur_pos + 1 + i] = new_token - # Print generated token if cur_pos + 1 + i >= prompt_len and print_log: decoded_text = self.tokenizer.decode([new_token], skip_special_tokens=True) print(decoded_text, end="", flush=True) - # Check for EOS if new_token == self.eos_id: finished = True break @@ -384,7 +388,6 @@ def _generate_with_mtp( f"min={min_accepted}, max={max_accepted}" ) - # Calculate correct TPS accounting for MTP's 
multiple tokens per call if decode_time_list: total_decode_time = sum(decode_time_list) effective_tps = total_tokens / total_decode_time if total_decode_time > 0 else 0 @@ -394,14 +397,11 @@ def _generate_with_mtp( print("\n") - # Reset sequence after generation self.decode_layer.reset_sequence() - # Extract completion tokens completion_tokens = [] for _, toks in enumerate(tokens.tolist()): toks = toks[prompt_len : prompt_len + self.max_new_tokens] - # Remove -1 padding and tokens after EOS toks = [t for t in toks if t != -1] if self.eos_id in toks: toks = toks[: toks.index(self.eos_id)] @@ -413,6 +413,7 @@ def _generate_with_mtp( f"{decoded_tokens[0]}\n" if decoded_tokens else "", decode_time_list, decode_accepted_counts, + prompt_len, ) def inject_cache( @@ -452,7 +453,6 @@ def inject_cache( logger.warning("inject_cache called with empty layer_caches") return - # Infer seqlen from first tensor if end_pos not specified first_ki, _, _ = layer_caches[0] seqlen = first_ki.size(0) if end_pos is None: @@ -473,9 +473,6 @@ def inject_cache( base_idx = layer_id * 3 - # Copy to device and inject into cache - # Cache layout: [batch=1, max_seq_len, dim] - # External data: [seqlen, dim] ki_src = ki[:cache_len].to(f"cuda:{device_id}") kv_src = kv[:cache_len].to(f"cuda:{device_id}") pe_src = pe[:cache_len].to(f"cuda:{device_id}") @@ -487,14 +484,11 @@ def inject_cache( logger.info(f"Cache injection completed for {num_devices} devices") def set_cur_pos(self, cur_pos: int) -> None: - """Set the current position for RoPE in C++ backend. - - This should be called after inject_cache() to ensure the C++ global - g_cur_pos matches the injected cache length. This is critical for - correct RoPE position encoding during continued generation. + """Set the current position for RoPE. - For MTP mode, sets the GPU tensor at intermediates[31] directly. - For non-MTP mode, calls the C++ dsa_show_hands_set_cur_pos API. 
+ This should be called after inject_cache() to ensure the runtime position + matches the injected cache length, for correct RoPE position encoding + during continued generation. Args: cur_pos: The current sequence position (typically the length of prefilled tokens). @@ -505,22 +499,19 @@ def set_cur_pos(self, cur_pos: int) -> None: >>> # Now generate continues from the correct position """ if self.with_mtp: - # MTP E2E uses g_cur_pos_tensors which is the GPU tensor num_devices = self.decode_layer.num_devices for device_id in range(num_devices): intermediates, _, _, _ = self.decode_layer._get_device_result(device_id) cur_pos_tensor = intermediates[Idx.CUR_POS] cur_pos_tensor.fill_(cur_pos) else: - # Non-MTP uses the C++ global g_cur_pos torch.ops.tilert.dsa_show_hands_set_cur_pos(cur_pos) def inject_last_hidden_state(self, last_hidden_state: torch.Tensor) -> None: """Inject the last hidden state for MTP mode. For MTP (Multi-Token Prediction), the MTP preprocess layer needs the - last hidden state from the main model's last token. This method injects - the hidden state into intermediates[33] (last_hidden_states slot). + last hidden state from the main model's last token. Args: last_hidden_state: [hidden_size] or [1, hidden_size] BF16 tensor. 
@@ -535,14 +526,12 @@ def inject_last_hidden_state(self, last_hidden_state: torch.Tensor) -> None: logger.warning("inject_last_hidden_state called but with_mtp is False, skipping") return - # Normalize shape to [1, hidden_size] if last_hidden_state.dim() == 1: last_hidden_state = last_hidden_state.unsqueeze(0) num_devices = self.decode_layer.num_devices for device_id in range(num_devices): intermediates, _, _, _ = self.decode_layer._get_device_result(device_id) - # Shape: [batch=1, seq=4, hidden_size], we set seq[0] since it's the last token lhs_tensor = intermediates[Idx.LAST_HIDDEN_STATES] lhs_src = last_hidden_state.to(f"cuda:{device_id}") lhs_tensor[0, 0, :].copy_(lhs_src.squeeze(0)) diff --git a/python/models/deepseek_v3_2/model_args.py b/tilert/models/deepseek_v3_2/model_args.py similarity index 92% rename from python/models/deepseek_v3_2/model_args.py rename to tilert/models/deepseek_v3_2/model_args.py index b149edf..441b684 100644 --- a/python/models/deepseek_v3_2/model_args.py +++ b/tilert/models/deepseek_v3_2/model_args.py @@ -50,8 +50,8 @@ class ModelArgs: arch_name = "deepseek_v3_2" - max_batch_size: int = 1 # NOTE: the current implementation only supports a batch size being 1 - max_seq_len: int = 160 * 1024 # 160K + max_batch_size: int = 1 + max_seq_len: int = 160 * 1024 dtype: Literal["bf16", "fp8"] = "fp8" scale_fmt: str | None = None @@ -63,23 +63,20 @@ class ModelArgs: n_dense_layers: int = 3 n_heads: int = 128 - # moe n_routed_experts: int = 256 n_shared_experts: int = 1 n_activated_experts: int = 8 n_expert_groups: int = 8 n_limited_groups: int = 4 - score_func: Literal["softmax", "sigmoid"] = "softmax" + score_func: Literal["softmax", "sigmoid", "sqrtsoftplus"] = "softmax" route_scale: float = 2.5 - # mla q_lora_rank: int = 1536 kv_lora_rank: int = 512 qk_nope_head_dim: int = 128 qk_rope_head_dim: int = 64 v_head_dim: int = 128 - # yarn original_seq_len: int | None = 4096 rope_theta: float = 10000.0 rope_factor: float | None = 40 @@ -87,14 
+84,12 @@ class ModelArgs: beta_slow: int | None = 1 mscale: float = 1.0 - # index index_n_heads: int = 64 index_head_dim: int = 128 index_topk: int = 2048 kv_cache_pad: int = 8 - # quant block_size: int = 128 eps: float = 1e-6 diff --git a/python/models/deepseek_v3_2/modules/__init__.py b/tilert/models/deepseek_v3_2/modules/__init__.py similarity index 100% rename from python/models/deepseek_v3_2/modules/__init__.py rename to tilert/models/deepseek_v3_2/modules/__init__.py diff --git a/python/models/deepseek_v3_2/modules/dsa.py b/tilert/models/deepseek_v3_2/modules/dsa.py similarity index 70% rename from python/models/deepseek_v3_2/modules/dsa.py rename to tilert/models/deepseek_v3_2/modules/dsa.py index 64116f9..2efe143 100644 --- a/python/models/deepseek_v3_2/modules/dsa.py +++ b/tilert/models/deepseek_v3_2/modules/dsa.py @@ -13,17 +13,75 @@ class Dsa(SerializableTileRTModule): """DSA module.""" - def __init__(self, model_args: ModelArgs, device_id: int, num_devices: int): + def __init__( + self, + model_args: ModelArgs, + device_id: int, + num_devices: int, + cached_ffn_ops: list | None = None, + ): super().__init__( model_args=model_args, device_id=device_id, num_devices=num_devices, remove_selected=True, ) + from tilert.models.deepseek_v3_2.modules.mla_v2 import ( + PureMlaV2, + SparseSelectMlaV2, + ) + + mla_cls = SparseSelectMlaV2 if device_id == 0 else PureMlaV2 + mla_kwargs: dict = {} + + dev = f"cuda:{device_id}" + n_peers = num_devices - 1 + if device_id == 0: + self.v2_peer_bufs = torch.zeros(n_peers, dtype=torch.int64, device=dev) + self.v2_partial_buf = torch.zeros( + model_args.max_batch_size, 4, model_args.dim, dtype=torch.bfloat16, device=dev + ) + mla_kwargs = { + "peer_bufs": self.v2_peer_bufs, + "partial_buf": self.v2_partial_buf, + } + else: + max_seq_len = getattr(model_args, "num_mtp", 3) + 1 + topk = model_args.index_topk + self.v2_ll_buf = torch.zeros(max_seq_len * topk * 2, dtype=torch.int32, device=dev) + mla_kwargs = {"ll_buf": 
self.v2_ll_buf} + + mla_num_devices: int | None = None + if device_id != 0: + mla_num_devices = num_devices - 1 + + if cached_ffn_ops is not None: + assert ( + len(cached_ffn_ops) == model_args.n_layers + ), f"Expected {model_args.n_layers} cached FFN ops, got {len(cached_ffn_ops)}" for layer_idx in range(model_args.n_layers): - block_type = MlpBlock if layer_idx < model_args.n_dense_layers else MoeBlock - block = block_type(model_args=model_args, device_id=device_id, num_devices=num_devices) + ffn_op = cached_ffn_ops[layer_idx] if cached_ffn_ops else None + if layer_idx < model_args.n_dense_layers: + block = MlpBlock( + model_args=model_args, + device_id=device_id, + num_devices=num_devices, + mla_cls=mla_cls, + mla_num_devices=mla_num_devices, + mla_kwargs=mla_kwargs, + mlp=ffn_op, + ) + else: + block = MoeBlock( + model_args=model_args, + device_id=device_id, + num_devices=num_devices, + mla_cls=mla_cls, + mla_num_devices=mla_num_devices, + mla_kwargs=mla_kwargs, + moe=ffn_op, + ) self.register_op(block, prefix=f"layer_{layer_idx}_", suffix=f"_dev_{device_id}") self.register_op( @@ -64,7 +122,17 @@ def get_temp_vars( q_lora_rank = self.model_args.q_lora_rank kv_lora_rank = self.model_args.kv_lora_rank qk_nope_head_dim = self.model_args.qk_nope_head_dim - n_local_heads = self.model_args.n_heads // self.num_devices + if self.device_id != 0: + from tilert.models.deepseek_v3_2.ops.rmsnorm_projq_wqb import ( + RmsnormProjqWqbWeightsConverter, + ) + + qk_head_dim = self.model_args.qk_nope_head_dim + self.model_args.qk_rope_head_dim + n_local_heads = RmsnormProjqWqbWeightsConverter._compute_n_local_heads( + self.model_args.n_heads, self.num_devices - 1, qk_head_dim + ) + else: + n_local_heads = self.model_args.n_heads // self.num_devices qk_rope_head_dim = self.model_args.qk_rope_head_dim index_head_dim = self.model_args.index_head_dim v_head_dim = self.model_args.v_head_dim @@ -132,8 +200,7 @@ def get_temp_vars( ) temp_vars[Idx.MOE_UP_GATE] = 
torch.zeros_like(exp_up_gate) - # temp_vars[Idx.IDX_SEL_WS] = torch.zeros(*batch_seq, 4, index_topk * 2, **int32_desc) - temp_vars[Idx.IDX_SEL_WS] = torch.zeros(*batch_seq, (200 * 1024 + 258), **int32_desc) + temp_vars[Idx.IDX_SEL_WS] = torch.zeros(*batch_seq, (200 * 1024 + 260), **int32_desc) temp_vars[Idx.MTP0_TOKEN_OUT] = torch.zeros(*batch_seq, 1, **int32_desc) temp_vars[Idx.MTP1_TOKEN_OUT] = torch.zeros(*batch_seq, 1, **int32_desc) @@ -147,6 +214,14 @@ def get_temp_vars( temp_vars[Idx.TOP_P_SCORES] = torch.zeros(*batch_seq, **fp32_desc) temp_vars[Idx.TOP_P_DEBUG] = torch.zeros(*batch_seq, vocab_size, **fp32_desc) + temp_vars[Idx.LORA_SLOT_ID] = torch.zeros(1, **int32_desc) + temp_vars[Idx.LORA_RANK] = torch.zeros(1, **int32_desc) + + max_top_n = 256 + temp_vars[Idx.TOP_N_LOG_PROBS] = torch.zeros(*batch_seq, max_top_n, **fp32_desc) + temp_vars[Idx.TOP_N_INDICES] = torch.zeros(*batch_seq, max_top_n, **int32_desc) + temp_vars[Idx.LOGPROBS_FLAG] = torch.zeros(1, **int32_desc) + for i, t in enumerate(temp_vars): if t is None: raise RuntimeError(f"temp_vars[{i}] ({Idx(i).name}) was not initialized") diff --git a/python/models/deepseek_v3_2/modules/end2end.py b/tilert/models/deepseek_v3_2/modules/end2end.py similarity index 68% rename from python/models/deepseek_v3_2/modules/end2end.py rename to tilert/models/deepseek_v3_2/modules/end2end.py index 47a5671..e1be82e 100644 --- a/python/models/deepseek_v3_2/modules/end2end.py +++ b/tilert/models/deepseek_v3_2/modules/end2end.py @@ -8,9 +8,11 @@ from typing import Any import torch +from safetensors import safe_open from safetensors.torch import load_file from tilert import logger +from tilert.models.base import TileRTModule from tilert.models.deepseek_v3_2.model_args import ModelArgs from tilert.models.deepseek_v3_2.modules.dsa import Dsa from tilert.models.deepseek_v3_2.modules.mtp import MTP @@ -18,12 +20,62 @@ from tilert.models.utils import precompute_freqs_cis from tilert.utils import get_profile_log_tensor -__all__ 
= ["ShowHandsDSALayer"] +__all__ = ["ShowHandsDSALayer", "_extract_ffn_ops", "_get_moe_weight_keys"] DeviceResult = tuple[list[torch.Tensor], list[torch.Tensor], list[torch.Tensor], torch.Tensor] +def _mark_weights_initialized(module: TileRTModule) -> None: + """Recursively mark a module and all sub-ops as having initialized tilert weights.""" + module.is_tilert_weights_init = True + if hasattr(module, "exec_seq"): + for op in module.exec_seq: + _mark_weights_initialized(op) + + +def _extract_ffn_ops(dsa: "Dsa") -> list: + """Extract Moe/Mlp op objects from a Dsa's layer blocks. + + Returns a list of length n_layers where each element is a Moe or Mlp instance. + """ + from tilert.models.deepseek_v3_2.modules.mlp import MlpBlock + from tilert.models.deepseek_v3_2.modules.moe import MoeBlock + + ffn_ops = [] + for block in dsa.exec_seq: + if isinstance(block, MoeBlock): + op = block.moe + _mark_weights_initialized(op) + ffn_ops.append(op) + elif isinstance(block, MlpBlock): + op = block.mlp + _mark_weights_initialized(op) + ffn_ops.append(op) + + assert ( + len(ffn_ops) == dsa.model_args.n_layers + ), f"Expected {dsa.model_args.n_layers} FFN ops, got {len(ffn_ops)}" + return ffn_ops + + +def _get_moe_weight_keys(dsa: "Dsa") -> set[str]: + """Get state_dict keys that belong exclusively to MOE/MLP ops in this Dsa.""" + from tilert.models.deepseek_v3_2.modules.mlp import MlpBlock + from tilert.models.deepseek_v3_2.modules.moe import MoeBlock + + moe_keys: set[str] = set() + mla_keys: set[str] = set() + for block, prefix, suffix in zip(dsa.exec_seq, dsa.prefix_seq, dsa.suffix_seq): + if isinstance(block, (MoeBlock, MlpBlock)): + ffn = block.moe if isinstance(block, MoeBlock) else block.mlp + for alias in ffn.get_tilert_weights_alias(): + moe_keys.add(f"{prefix}{alias}{suffix}") + for alias in block.mla.get_tilert_weights_alias(): + mla_keys.add(f"{prefix}{alias}{suffix}") + return moe_keys - mla_keys + + def dsa_show_hands_prepare_money( params: list[torch.Tensor], 
temp_vars: list[torch.Tensor], @@ -102,9 +154,6 @@ def dsa_mtp_e2e_show_hands_set_prefill_valid_tokens( def dsa_mtp_e2e_show_hands_set_prefill_mtp_extra_token(token: int, is_glm5: bool = False) -> Any: """Set the extra token for MTP[0] shifted input during prefill. - This is the prompt token at (cur_pos + mtp_seq_len), used as the last position - of MTP[0]'s shifted input to enable no-overlap prefill chunking. - Args: token: The extra prompt token id (int32). """ @@ -144,6 +193,7 @@ def __init__( self.with_mtp = with_mtp self.multi_devices_results: list[DeviceResult | None] = [None] * torch.cuda.device_count() + self._dsa_objects: list[Dsa | None] = [None] * torch.cuda.device_count() self.temperature = temperature self.top_p = top_p @@ -155,7 +205,11 @@ def _gen_freqs_cis(self) -> torch.Tensor: return torch.view_as_real(freqs_cis).reshape(freqs_cis.shape[0], -1) def load_device_weights( - self, model_path: str, device_id: int, extra_keys: list + self, + model_path: str, + device_id: int, + extra_keys: list, + skip_keys: set[str] | None = None, ) -> dict[str, torch.Tensor]: index_file = "model.safetensors.index.json" with open(os.path.join(model_path, index_file), encoding="utf-8") as f: @@ -165,20 +219,33 @@ def load_device_weights( weights_list = [_k for _k in weight_file_map.keys() if _k.endswith(f"dev_{device_id}")] weights_list = [*weights_list, *extra_keys] + if skip_keys: + weights_list = [k for k in weights_list if k not in skip_keys] + target_files = set() for weight_key in weights_list: weight_file = weight_file_map[weight_key] target_files.add(weight_file) state_dicts = {} + weights_set = set(weights_list) for weight_file in target_files: - logger.info(f"Loading weights from {weight_file} for device {device_id}") - state_dict = load_file( - os.path.join(model_path, weight_file), device=f"cuda:{device_id}" - ) - state_dicts.update(state_dict) - del state_dict - torch.cuda.empty_cache() + filepath = os.path.join(model_path, weight_file) + if skip_keys: + 
logger.info( + f"Selectively loading weights from {weight_file} for device {device_id}" + ) + with safe_open(filepath, framework="pt", device=f"cuda:{device_id}") as f: + for key in f.keys(): + if key in weights_set: + state_dicts[key] = f.get_tensor(key) + torch.cuda.empty_cache() + else: + logger.info(f"Loading weights from {weight_file} for device {device_id}") + state_dict = load_file(filepath, device=f"cuda:{device_id}") + state_dicts.update(state_dict) + del state_dict + torch.cuda.empty_cache() state_dicts["freqs_cis"] = self._gen_freqs_cis().to(device_id) return state_dicts @@ -186,11 +253,7 @@ def load_device_weights( def update_sampling_config( self, temperature: float, top_p: float, top_k: int, use_topp: bool = True ) -> None: - """Update sampling config, re-capturing CUDA graphs if parameters changed. - - Sampling parameters are baked into CUDA graph instructions at prepare_money - time, so any change requires a full teardown + re-capture cycle. - """ + """Update sampling config, re-capturing CUDA graphs if parameters changed.""" new_config = (temperature, top_p, top_k, use_topp) current_config = (self.temperature, self.top_p, self.top_k, self.use_topp) if new_config == current_config: @@ -201,20 +264,17 @@ def update_sampling_config( f"temperature={temperature}, top_p={top_p}, top_k={top_k}, use_topp={use_topp}" ) - # Teardown: stop all threads and unregister all modules if self.with_mtp: dsa_show_hands_go_home(True, self.is_glm5) dsa_show_hands_go_home(False, self.is_glm5) else: dsa_show_hands_go_home(False, self.is_glm5) - # Store new config self.temperature = temperature self.top_p = top_p self.top_k = top_k self.use_topp = use_topp - # Update sampling_config tensor on all devices for device_id in range(self.num_devices): result = self.multi_devices_results[device_id] if result is not None: @@ -227,7 +287,6 @@ def update_sampling_config( ) ) - # Re-prepare all modules (re-captures CUDA graphs with new config) for device_id in 
range(self.num_devices): with torch.cuda.device(device_id): intermediates, caches, params, profile_logs = self._get_device_result(device_id) @@ -274,8 +333,23 @@ def generate_params_with_continuous_storage( offset += aligned_param_size return cloned_params - def _init_weights(self, model_path: str | None) -> None: - """Load the model weights from the given path or generate random weights.""" + def _init_weights( + self, + model_path: str | None, + cached_ffn_ops_per_device: dict[int, list] | None = None, + skip_keys_per_device: dict[int, set[str]] | None = None, + ) -> None: + """Load the model weights from the given path or generate random weights. + + Args: + model_path: Path to the model weights directory. + cached_ffn_ops_per_device: Optional dict mapping device_id to cached FFN ops. + When provided, these ops are injected into the Dsa and their weights + are not re-loaded from disk. + skip_keys_per_device: Optional dict mapping device_id to safetensors keys + to skip during loading. Used together with cached_ffn_ops_per_device. 
+ """ + self._v2_p2p: dict = {} def __load_weights(device_id: int, model_path: str | None) -> None: intermediates: list[torch.Tensor] = [] @@ -284,8 +358,12 @@ def __load_weights(device_id: int, model_path: str | None) -> None: state_dicts = {} start_time = time.time() with torch.cuda.device(device_id): - assert model_path is not None # Type narrowing for mypy - # state_dicts = _load_state_dicts(model_path, dev_attrs) + assert model_path is not None + skip_keys = ( + skip_keys_per_device.get(device_id) + if skip_keys_per_device is not None + else None + ) state_dicts = self.load_device_weights( model_path, device_id, @@ -294,12 +372,33 @@ def __load_weights(device_id: int, model_path: str | None) -> None: f"layer_{self.model_args.n_layers}_lm_head.weight_dev_{device_id}", f"layer_{self.model_args.n_layers}_model.norm.weight_dev_{device_id}", ], + skip_keys=skip_keys, ) - dsa = Dsa(self.model_args, device_id, self.num_devices) + cached_ffn_ops = ( + cached_ffn_ops_per_device.get(device_id) + if cached_ffn_ops_per_device is not None + else None + ) + dsa = Dsa( + self.model_args, + device_id, + self.num_devices, + cached_ffn_ops=cached_ffn_ops, + ) dsa.init_tilert_weights(state_dicts) + self._dsa_objects[device_id] = dsa params.extend(dsa.get_weights_list()) caches.extend(dsa.get_cache_vars()) + + if device_id == 0: + self._v2_p2p[device_id] = { + "peer_bufs": dsa.v2_peer_bufs, + } + else: + self._v2_p2p[device_id] = { + "ll_buf": dsa.v2_ll_buf, + } intermediates.extend( self.generate_params_with_continuous_storage( dsa.get_temp_vars( @@ -316,8 +415,6 @@ def __load_weights(device_id: int, model_path: str | None) -> None: ) ) - # generate_params_with_continuous_storage creates zero-filled views. - # Populate sampling_config with actual values. 
sampling_config = intermediates[Idx.SAMPLING_CONFIG] sampling_config.copy_( torch.tensor( @@ -325,20 +422,32 @@ def __load_weights(device_id: int, model_path: str | None) -> None: self.temperature, self.top_p, float(self.top_k), - 1.0 if self.use_topp else 0.0, # 0=top1(default), 1=topp + 1.0 if self.use_topp else 0.0, ], dtype=torch.float32, device=device_id, ) ) - # Track base (non-MTP) params/caches count for dual-module init base_params_count = len(params) base_caches_count = len(caches) - # Add MTP-specific params when with_mtp is True if self.with_mtp: - mtp = MTP(self.model_args, device_id, self.num_devices) + from tilert.models.deepseek_v3_2.modules.mla_v2 import ( + PureMlaV2, + SparseSelectMlaV2, + ) + + mtp_kwargs: dict = {} + mtp_kwargs["mla_cls"] = SparseSelectMlaV2 if device_id == 0 else PureMlaV2 + mtp_kwargs["mla_num_devices"] = 1 if device_id == 0 else self.num_devices - 1 + if device_id == 0: + mtp_kwargs["mla_kwargs"] = { + "peer_bufs": dsa.v2_peer_bufs, + } + else: + mtp_kwargs["mla_kwargs"] = {"ll_buf": dsa.v2_ll_buf} + mtp = MTP(self.model_args, device_id, self.num_devices, **mtp_kwargs) mtp.init_tilert_weights(state_dicts) params.extend(mtp.get_weights_list()) caches.extend(mtp.get_cache_vars()) @@ -379,11 +488,21 @@ def _runner(dev_id: int) -> None: if exc is not None: raise RuntimeError(f"Failed to initialize device {device_id}: {exc}") from exc - # Prepare money for all devices + if self._v2_p2p: + gpu0 = self._v2_p2p[0] + peer_bufs_cpu = torch.zeros(self.num_devices - 1, dtype=torch.int64) + for i in range(self.num_devices - 1): + dev_id = i + 1 + peer_bufs_cpu[i] = self._v2_p2p[dev_id]["ll_buf"].data_ptr() + gpu0["peer_bufs"].copy_(peer_bufs_cpu) + logger.info( + "V2 P2P exchange complete: peer_bufs (ll_buf)=%s", + [hex(int(x)) for x in peer_bufs_cpu], + ) + for device_id in range(self.num_devices): with torch.cuda.device(device_id): intermediates, caches, params, profile_logs = self._get_device_result(device_id) - # Always prepare the 
primary module (MTP if with_mtp, else non-MTP) dsa_show_hands_prepare_money( params, intermediates, @@ -393,7 +512,6 @@ def _runner(dev_id: int) -> None: self.with_mtp, self.is_glm5, ) - # When MTP-capable, also prepare the non-MTP module using base params/caches if self.with_mtp: dsa_show_hands_prepare_money( params[: self._base_params_count], @@ -411,6 +529,21 @@ def from_pretrained(self, model_path: str) -> None: raise ValueError(f"Model weights directory {model_path} does not exist") self._init_weights(model_path) + def from_pretrained_with_cache( + self, + model_path: str, + cached_ffn_ops_per_device: dict[int, list], + skip_keys_per_device: dict[int, set[str]], + ) -> None: + """Load weights with cached MOE/MLP ops.""" + if not os.path.exists(model_path): + raise ValueError(f"Model weights directory {model_path} does not exist") + self._init_weights( + model_path, + cached_ffn_ops_per_device=cached_ffn_ops_per_device, + skip_keys_per_device=skip_keys_per_device, + ) + def init_random_weights(self) -> None: """Generate random weights.""" self._init_weights(None) @@ -438,7 +571,6 @@ def set_sampling_seed(self, seed: int, with_mtp: bool | None = None) -> None: def reset_sequence(self) -> None: if self.with_mtp: - # Reset both MTP and non-MTP modules for clean state dsa_show_hands_reset(True, self.is_glm5) dsa_show_hands_reset(False, self.is_glm5) else: @@ -446,7 +578,6 @@ def reset_sequence(self) -> None: def cleanup(self) -> None: if self.with_mtp: - # Cleanup both MTP and non-MTP modules dsa_show_hands_go_home(True, self.is_glm5) dsa_show_hands_go_home(False, self.is_glm5) else: @@ -518,3 +649,55 @@ def get_predicted_tokens(self, device_id: int = 0) -> torch.Tensor: """ intermediates, _, _, _ = self._get_device_result(device_id) return intermediates[Idx.PREDICTED_TOKENS] + + def get_logits(self, device_id: int = 0) -> torch.Tensor: + """Get logits from the specified device. + + Args: + device_id: Device ID to get results from. 
+ + Returns: + Logits tensor of shape [batch, seq_len, vocab_size] (FP32). + """ + intermediates, _, _, _ = self._get_device_result(device_id) + return intermediates[Idx.LOGITS_OUT] + + def get_top_n_logprobs(self, device_id: int = 0) -> tuple[torch.Tensor, torch.Tensor]: + """Get top-N log-probabilities and token IDs from the top_p kernel. + + Args: + device_id: Device ID to get results from. + + Returns: + Tuple of (log_probs, token_ids): + - log_probs: [batch, seq_len, 256] FP32 + - token_ids: [batch, seq_len, 256] INT32 + """ + intermediates, _, _, _ = self._get_device_result(device_id) + return ( + intermediates[Idx.TOP_N_LOG_PROBS], + intermediates[Idx.TOP_N_INDICES], + ) + + def get_token_logprob(self, device_id: int = 0) -> torch.Tensor: + """Get log-probability of the sampled token (from TOP_P_SCORES). + + Args: + device_id: Device ID to get results from. + + Returns: + Tensor of shape [batch, seq_len] (FP32). + """ + intermediates, _, _, _ = self._get_device_result(device_id) + return intermediates[Idx.TOP_P_SCORES] + + def set_logprobs_enabled(self, enabled: bool) -> None: + """Enable or disable logprobs export in the top_p kernel. + + Args: + enabled: True to enable logprobs export, False to disable. 
+ """ + flag_val = 1 if enabled else 0 + for device_id in range(self.num_devices): + intermediates, _, _, _ = self._get_device_result(device_id) + intermediates[Idx.LOGPROBS_FLAG].fill_(flag_val) diff --git a/tilert/models/deepseek_v3_2/modules/mla_v2.py b/tilert/models/deepseek_v3_2/modules/mla_v2.py new file mode 100644 index 0000000..bbf4f14 --- /dev/null +++ b/tilert/models/deepseek_v3_2/modules/mla_v2.py @@ -0,0 +1,248 @@ +"""MLA weight generator classes for device-group-specific pipelines.""" + +import torch + +from tilert.models.base import SerializableTileRTModule +from tilert.models.deepseek_v3_2.model_args import ModelArgs +from tilert.models.deepseek_v3_2.ops.layernorm_rope_rotate import LayerNormRoPERotate +from tilert.models.deepseek_v3_2.ops.projo_wkvb import ProjoWKVb +from tilert.models.deepseek_v3_2.ops.projq_wqb import ProjqWqb +from tilert.models.deepseek_v3_2.ops.projx_wis import ProjxWis +from tilert.models.deepseek_v3_2.ops.rmsnorm_kv import KVRMSNorm +from tilert.models.deepseek_v3_2.ops.rmsnorm_projq_wqb import ( + RmsnormProjqWqb, + RmsnormProjqWqbAlgorithm, +) +from tilert.models.deepseek_v3_2.ops.rmsnorm_projq_wqi import ( + RmsnormProjqWqi, + RmsnormProjqWqiAlgorithm, +) +from tilert.models.deepseek_v3_2.ops.rmsnorm_projx_wqakis import ( + RMSNormProjxWqakis, +) +from tilert.models.deepseek_v3_2.ops.rmsnorm_projx_wqkva import ( + RMSNormProjxWqkva, + RMSNormProjxWqkvaAlgorithm, +) +from tilert.models.deepseek_v3_2.ops.unproj_o_allreduce import ( + UnProjOAllReduce, + UnProjOAllReduceAlgorithm, +) + + +class SparseSelectMlaV2(SerializableTileRTModule): + def __init__( + self, + model_args: ModelArgs, + device_id: int, + num_devices: int, + peer_bufs: torch.Tensor | None = None, + partial_buf: torch.Tensor | None = None, + ): + super().__init__(model_args=model_args, device_id=device_id, num_devices=num_devices) + + self.rmsnorm_projx_wqakis = RMSNormProjxWqakis( + model_args=model_args, device_id=device_id, num_devices=num_devices + ) + 
self.register_op(self.rmsnorm_projx_wqakis) + + self.rmsnorm_projq_wqi = RmsnormProjqWqi( + model_args=model_args, device_id=device_id, num_devices=num_devices + ) + self.rmsnorm_projq_wqi.algorithm = RmsnormProjqWqiAlgorithm.BF16MMA + self.register_op(self.rmsnorm_projq_wqi) + + self.layernorm_rope_rotate = LayerNormRoPERotate( + model_args=model_args, device_id=device_id, num_devices=num_devices + ) + self.register_op(self.layernorm_rope_rotate) + + self.projx_wis = ProjxWis( + model_args=model_args, device_id=device_id, num_devices=num_devices + ) + self.register_op(self.projx_wis) + + self.peer_bufs = peer_bufs + self.partial_buf = partial_buf + + self.ki_cache: torch.Tensor | None = None + self.kv_cache: torch.Tensor | None = None + self.pe_cache: torch.Tensor | None = None + + def get_weights_list(self) -> list[torch.Tensor]: + """Return weight tensors.""" + weights = super().get_weights_list() + + dev = f"cuda:{self.device_id}" + if self.peer_bufs is None: + self.peer_bufs = torch.zeros(self.num_devices - 1, dtype=torch.int64, device=dev) + if self.partial_buf is None: + self.partial_buf = torch.zeros( + self.model_args.max_batch_size, + 4, + self.model_args.dim, + dtype=torch.bfloat16, + device=dev, + ) + + weights.append(self.peer_bufs) + weights.append(self.partial_buf) + + return weights + + def get_cache_vars(self) -> list[torch.Tensor]: + """Return [ki_cache, kv_cache, pe_cache] matching DsaCacheVars layout.""" + cache_seq_len = self.model_args.max_seq_len + self.model_args.kv_cache_pad + bs_args = (self.model_args.max_batch_size, cache_seq_len) + + if self.ki_cache is None: + ki_dim = self.model_args.index_head_dim + self.ki_cache = torch.zeros( + *bs_args, ki_dim, dtype=torch.bfloat16, device=f"cuda:{self.device_id}" + ) + if self.kv_cache is None: + kv_dim = self.model_args.kv_lora_rank + self.kv_cache = torch.zeros( + *bs_args, kv_dim, dtype=torch.bfloat16, device=f"cuda:{self.device_id}" + ) + if self.pe_cache is None: + pe_dim = 
self.model_args.qk_rope_head_dim + self.pe_cache = torch.zeros( + *bs_args, pe_dim, dtype=torch.bfloat16, device=f"cuda:{self.device_id}" + ) + return [*super().get_cache_vars(), self.ki_cache, self.kv_cache, self.pe_cache] + + +class PureMlaV2(SerializableTileRTModule): + + def __init__( + self, + model_args: ModelArgs, + device_id: int, + num_devices: int, + ll_buf: torch.Tensor | None = None, + ): + super().__init__(model_args=model_args, device_id=device_id, num_devices=num_devices) + + self.rmsnorm_projx_wqkva = RMSNormProjxWqkva( + model_args=model_args, device_id=device_id, num_devices=num_devices + ) + self.rmsnorm_projx_wqkva.algorithm = RMSNormProjxWqkvaAlgorithm.DECOUPLED + self.register_op(self.rmsnorm_projx_wqkva) + + self.rmsnorm_projq_wqb = RmsnormProjqWqb( + model_args=model_args, device_id=device_id, num_devices=num_devices + ) + self.rmsnorm_projq_wqb.algorithm = RmsnormProjqWqbAlgorithm.BF16MMA + self.register_op(self.rmsnorm_projq_wqb) + + self.rmsnorm_kv = KVRMSNorm( + model_args=model_args, device_id=device_id, num_devices=num_devices + ) + self.register_op(self.rmsnorm_kv) + + self.projq_wqb = ProjqWqb( + model_args=model_args, device_id=device_id, num_devices=num_devices + ) + self.register_op(self.projq_wqb) + + self.projo_wkvb = ProjoWKVb( + model_args=model_args, device_id=device_id, num_devices=num_devices + ) + self.register_op(self.projo_wkvb) + + allreduce_algo = UnProjOAllReduceAlgorithm.BF16MMA + self.unproj_o_allreduce = UnProjOAllReduce( + model_args=model_args, + device_id=device_id, + num_devices=num_devices, + algorithm=allreduce_algo, + ) + self.register_op(self.unproj_o_allreduce) + + self.ll_buf = ll_buf + + self.ki_cache: torch.Tensor | None = None + self.kv_cache: torch.Tensor | None = None + self.pe_cache: torch.Tensor | None = None + + def init_random_weights(self) -> None: + """Initialize random weights for this module.""" + super().init_random_weights() + + from tilert.models.common import init_func + + for op in 
[self.projq_wqb, self.projo_wkvb]: + padded_total = op.num_local_heads * op.num_devices + w = init_func( + torch.empty( + padded_total * op.wkvb_head_dim, op.wkvb_lora_rank, dtype=torch.float8_e4m3fn + ) + ) + s = init_func( + torch.empty( + padded_total * op.wkvb_head_dim // op.model_args.block_size, + op.wkvb_lora_rank_qsize, + dtype=torch.float32, + ) + ) + ref_dict = dict(zip(op.ref_weights_alias(), [w, s])) + op.init_reference_weights(ref_dict) + sharded = op.device_sharding(ref_dict) + per_dev = {k: v[op.device_id] for k, v in sharded.items()} + op.init_tilert_weights_hmma(per_dev) + + def init_tilert_weights(self, state_dict: dict[str, torch.Tensor]) -> None: + """Load TileRT weights for this module from state_dict.""" + self.projq_wqb.is_tilert_weights_init = True + self.projo_wkvb.is_tilert_weights_init = True + + super().init_tilert_weights(state_dict) + + for op in [self.projq_wqb, self.projo_wkvb]: + op_state_dict = {} + for op_key in op.get_tilert_weights_alias(): + for p, s in zip(self.prefix_seq, self.suffix_seq): + original_key = f"{p}{op_key}{s}" + if original_key in state_dict: + op_state_dict[op_key] = state_dict[original_key] + break + op.is_tilert_weights_init = False + op.init_tilert_weights_hmma(op_state_dict) + + def get_weights_list(self) -> list[torch.Tensor]: + """Return weight tensors.""" + weights = super().get_weights_list() + + if self.ll_buf is None: + max_seq_len = getattr(self.model_args, "num_mtp", 3) + 1 + topk = self.model_args.index_topk + self.ll_buf = torch.zeros( + max_seq_len * topk * 2, dtype=torch.int32, device=f"cuda:{self.device_id}" + ) + + weights.append(self.ll_buf) + + return weights + + def get_cache_vars(self) -> list[torch.Tensor]: + """Return [ki_cache, kv_cache, pe_cache] matching DsaCacheVars layout.""" + cache_seq_len = self.model_args.max_seq_len + self.model_args.kv_cache_pad + bs_args = (self.model_args.max_batch_size, cache_seq_len) + + if self.ki_cache is None: + ki_dim = self.model_args.index_head_dim + 
self.ki_cache = torch.zeros( + *bs_args, ki_dim, dtype=torch.bfloat16, device=f"cuda:{self.device_id}" + ) + if self.kv_cache is None: + kv_dim = self.model_args.kv_lora_rank + self.kv_cache = torch.zeros( + *bs_args, kv_dim, dtype=torch.bfloat16, device=f"cuda:{self.device_id}" + ) + if self.pe_cache is None: + pe_dim = self.model_args.qk_rope_head_dim + self.pe_cache = torch.zeros( + *bs_args, pe_dim, dtype=torch.bfloat16, device=f"cuda:{self.device_id}" + ) + return [*super().get_cache_vars(), self.ki_cache, self.kv_cache, self.pe_cache] diff --git a/python/models/deepseek_v3_2/modules/mlp.py b/tilert/models/deepseek_v3_2/modules/mlp.py similarity index 51% rename from python/models/deepseek_v3_2/modules/mlp.py rename to tilert/models/deepseek_v3_2/modules/mlp.py index 1e9a327..217de6a 100644 --- a/python/models/deepseek_v3_2/modules/mlp.py +++ b/tilert/models/deepseek_v3_2/modules/mlp.py @@ -1,7 +1,9 @@ from tilert.models.base import SerializableTileRTModule from tilert.models.deepseek_v3_2.model_args import ModelArgs -from tilert.models.deepseek_v3_2.modules.mla import Mla -from tilert.models.deepseek_v3_2.ops.down_allreduce import DownAllReduce +from tilert.models.deepseek_v3_2.modules.mla_v2 import PureMlaV2 as Mla +from tilert.models.deepseek_v3_2.ops.down_allreduce import ( + DownAllReduce, +) from tilert.models.deepseek_v3_2.ops.rmsnorm_up_gate_silu import ( RMSNormUpGateSiLU, RMSNormUpGateSiLUAlgorithm, @@ -11,7 +13,12 @@ class Mlp(SerializableTileRTModule): """Implement the MLP operations.""" - def __init__(self, model_args: ModelArgs, device_id: int, num_devices: int): + def __init__( + self, + model_args: ModelArgs, + device_id: int, + num_devices: int, + ): super().__init__(model_args=model_args, device_id=device_id, num_devices=num_devices) self.rmsnorm_mlp_up_gate_silu = RMSNormUpGateSiLU( @@ -19,9 +26,9 @@ def __init__(self, model_args: ModelArgs, device_id: int, num_devices: int): device_id=device_id, num_devices=num_devices, ) - if 
model_args.arch_name == "glm_5": - self.rmsnorm_mlp_up_gate_silu.algorithm = RMSNormUpGateSiLUAlgorithm.FP16MMA + self.rmsnorm_mlp_up_gate_silu.algorithm = RMSNormUpGateSiLUAlgorithm.FP16MMA self.register_op(self.rmsnorm_mlp_up_gate_silu) + self.rmsnorm_mlp_down = DownAllReduce( model_args=model_args, device_id=device_id, num_devices=num_devices ) @@ -32,7 +39,15 @@ class MlpBlock(SerializableTileRTModule): """Implement the MOE block operations.""" def __init__( - self, model_args: ModelArgs, device_id: int, num_devices: int, remove_selected: bool = False + self, + model_args: ModelArgs, + device_id: int, + num_devices: int, + remove_selected: bool = False, + mla_cls: type | None = None, + mla_num_devices: int | None = None, + mla_kwargs: dict | None = None, + mlp: "Mlp | None" = None, ): super().__init__( model_args=model_args, @@ -41,7 +56,19 @@ def __init__( remove_selected=remove_selected, ) - self.mla = Mla(model_args=model_args, device_id=device_id, num_devices=num_devices) + mla_class = mla_cls or Mla + mla_nd = mla_num_devices if mla_num_devices is not None else num_devices + self.mla = mla_class( + model_args=model_args, device_id=device_id, num_devices=mla_nd, **(mla_kwargs or {}) + ) self.register_op(self.mla) - self.mlp = Mlp(model_args=model_args, device_id=device_id, num_devices=num_devices) + self.mlp = ( + mlp + if mlp is not None + else Mlp( + model_args=model_args, + device_id=device_id, + num_devices=num_devices, + ) + ) self.register_op(self.mlp) diff --git a/python/models/deepseek_v3_2/modules/moe.py b/tilert/models/deepseek_v3_2/modules/moe.py similarity index 53% rename from python/models/deepseek_v3_2/modules/moe.py rename to tilert/models/deepseek_v3_2/modules/moe.py index f343e79..4ea2dff 100644 --- a/python/models/deepseek_v3_2/modules/moe.py +++ b/tilert/models/deepseek_v3_2/modules/moe.py @@ -1,17 +1,26 @@ +import torch + from tilert.models.base import SerializableTileRTModule from tilert.models.deepseek_v3_2.model_args import ModelArgs 
-from tilert.models.deepseek_v3_2.modules.mla import Mla -from tilert.models.deepseek_v3_2.ops.expert_down_allreduce import ExpertDownAllReduce +from tilert.models.deepseek_v3_2.modules.mla_v2 import PureMlaV2 as Mla +from tilert.models.deepseek_v3_2.ops.expert_down_allreduce import ( + ExpertDownAllReduce, + ExpertDownAllReduceAlgorithm, +) from tilert.models.deepseek_v3_2.ops.expert_sel_up_gate_silu import ( ExpertSelectUpGateSiLU, ExpertSelectUpGateSiLUAlgorithm, ) -from tilert.models.deepseek_v3_2.ops.rmsnorm_expert_proj import RMSNormExpertProj +from tilert.models.deepseek_v3_2.ops.rmsnorm_expert_proj import ( + RMSNormExpertProj, +) class Moe(SerializableTileRTModule): """Implement the MOE operations.""" + rmsnorm_expert_proj: RMSNormExpertProj + def __init__(self, model_args: ModelArgs, device_id: int, num_devices: int): super().__init__(model_args=model_args, device_id=device_id, num_devices=num_devices) @@ -21,22 +30,38 @@ def __init__(self, model_args: ModelArgs, device_id: int, num_devices: int): self.register_op(self.rmsnorm_expert_proj) self.exp_sel_up_gate_silu = ExpertSelectUpGateSiLU( - model_args=model_args, device_id=device_id, num_devices=num_devices + model_args=model_args, + device_id=device_id, + num_devices=num_devices, + algorithm=ExpertSelectUpGateSiLUAlgorithm.BF16MMA, ) - if model_args.arch_name == "glm_5": - self.exp_sel_up_gate_silu.algorithm = ExpertSelectUpGateSiLUAlgorithm.FP16MMA self.register_op(self.exp_sel_up_gate_silu) + self.expert_down_allreduce = ExpertDownAllReduce( - model_args=model_args, device_id=device_id, num_devices=num_devices + model_args=model_args, + device_id=device_id, + num_devices=num_devices, + algorithm=ExpertDownAllReduceAlgorithm.BF16MMA, ) self.register_op(self.expert_down_allreduce) + def get_weights_list(self) -> list[torch.Tensor]: + return super().get_weights_list() + class MoeBlock(SerializableTileRTModule): """Implement the MOE block operations.""" def __init__( - self, model_args: ModelArgs, 
device_id: int, num_devices: int, remove_selected: bool = False + self, + model_args: ModelArgs, + device_id: int, + num_devices: int, + remove_selected: bool = False, + mla_cls: type | None = None, + mla_num_devices: int | None = None, + mla_kwargs: dict | None = None, + moe: "Moe | None" = None, ): super().__init__( model_args=model_args, @@ -45,7 +70,15 @@ def __init__( remove_selected=remove_selected, ) - self.mla = Mla(model_args=model_args, device_id=device_id, num_devices=num_devices) + mla_class = mla_cls or Mla + mla_nd = mla_num_devices if mla_num_devices is not None else num_devices + self.mla = mla_class( + model_args=model_args, device_id=device_id, num_devices=mla_nd, **(mla_kwargs or {}) + ) self.register_op(self.mla) - self.moe = Moe(model_args=model_args, device_id=device_id, num_devices=num_devices) + self.moe = ( + moe + if moe is not None + else Moe(model_args=model_args, device_id=device_id, num_devices=num_devices) + ) self.register_op(self.moe) diff --git a/python/models/deepseek_v3_2/modules/mtp.py b/tilert/models/deepseek_v3_2/modules/mtp.py similarity index 75% rename from python/models/deepseek_v3_2/modules/mtp.py rename to tilert/models/deepseek_v3_2/modules/mtp.py index fd43e0e..a24101b 100644 --- a/python/models/deepseek_v3_2/modules/mtp.py +++ b/tilert/models/deepseek_v3_2/modules/mtp.py @@ -10,7 +10,15 @@ class MTP(SerializableTileRTModule): """MTP module.""" - def __init__(self, model_args: ModelArgs, device_id: int, num_devices: int): + def __init__( + self, + model_args: ModelArgs, + device_id: int, + num_devices: int, + mla_cls: type | None = None, + mla_num_devices: int | None = None, + mla_kwargs: dict | None = None, + ): super().__init__(model_args=model_args, device_id=device_id, num_devices=num_devices) self.embed_tokens_weight = None @@ -23,7 +31,14 @@ def __init__(self, model_args: ModelArgs, device_id: int, num_devices: int): suffix=f"_dev_{device_id}", ) self.register_op( - MoeBlock(model_args=model_args, 
device_id=device_id, num_devices=num_devices), + MoeBlock( + model_args=model_args, + device_id=device_id, + num_devices=num_devices, + mla_cls=mla_cls, + mla_num_devices=mla_num_devices, + mla_kwargs=mla_kwargs, + ), prefix=f"layer_{mtp_layer_id}_", suffix=f"_dev_{device_id}", ) diff --git a/python/models/deepseek_v3_2/modules/mtp_preprocess.py b/tilert/models/deepseek_v3_2/modules/mtp_preprocess.py similarity index 95% rename from python/models/deepseek_v3_2/modules/mtp_preprocess.py rename to tilert/models/deepseek_v3_2/modules/mtp_preprocess.py index dc094eb..2a8676a 100644 --- a/python/models/deepseek_v3_2/modules/mtp_preprocess.py +++ b/tilert/models/deepseek_v3_2/modules/mtp_preprocess.py @@ -22,10 +22,7 @@ def mtp_preprocess_layer( temp_vars: list[torch.Tensor], profile_logs: torch.Tensor, ) -> torch.Tensor: - """MTP preprocess layer op for DeepSeek v3. - - Output is in temp_vars[28] (eh_proj) for DSA temp vars layout. - """ + """MTP preprocess layer op for DeepSeek v3.""" return torch.ops.tilert.mtp_preprocess_layer(params, temp_vars, profile_logs) @@ -90,9 +87,6 @@ def convert_to_tilert(self, weights: list[torch.Tensor], device_id: int) -> list embedding_rmsnorm_gamma = embedding_rmsnorm_gamma.to(device=device, dtype=torch.float32) hidden_rmsnorm_gamma = hidden_rmsnorm_gamma.to(device=device, dtype=torch.float32) - # eh_proj: [out, in] = [7168, 14336]; split on dim=1 -> 8 x [7168, 1792] - # Reshape: [7168, 1792] -> [128, 56, 7, 256] -> transpose(1,2) -> [128, 7, 56, 256] - # eh_proj_weight_splited = torch.chunk(eh_proj_weight, self.num_devices, dim=1) eh_proj_weights = ( eh_proj_weight.reshape( 128, self.model_args.dim // 128, self.model_args.dim * 2 // 256 // 8, 256 diff --git a/tilert/models/deepseek_v3_2/ops/__init__.py b/tilert/models/deepseek_v3_2/ops/__init__.py new file mode 100644 index 0000000..e62ed9d --- /dev/null +++ b/tilert/models/deepseek_v3_2/ops/__init__.py @@ -0,0 +1,160 @@ +"""Core operations for deepseek v3.2.""" + +from 
tilert.models.deepseek_v3_2.ops.broadcast_selected_token_ids import ( + broadcast_selected_token_ids, +) +from tilert.models.deepseek_v3_2.ops.down_allreduce import ( + DownAllReduce, + DownAllReduceAlgorithm, + down_allreduce, +) +from tilert.models.deepseek_v3_2.ops.eh_proj_allreduce import ( + EHProjAllReduce, + EHProjAllReduceAlgorithm, + eh_proj_allreduce, +) +from tilert.models.deepseek_v3_2.ops.expert_down_allreduce import ( + ExpertDownAllReduce, + ExpertDownAllReduceAlgorithm, + expert_down_allreduce, +) +from tilert.models.deepseek_v3_2.ops.expert_sel_up_gate_silu import ( + ExpertSelectUpGateSiLU, + ExpertSelectUpGateSiLUAlgorithm, +) +from tilert.models.deepseek_v3_2.ops.flash_sparse_mla import ( + FlashSparseMLACombineAlgorithm, + flash_sparse_mla, +) +from tilert.models.deepseek_v3_2.ops.layernorm_rope_rotate import ( + LayerNormRoPERotateAlgorithm, + layernorm_rope_rotate, +) +from tilert.models.deepseek_v3_2.ops.padded_allreduce_add import ( + PaddedAllReduceAdd, + PaddedAllReduceAddAlgorithm, + padded_allreduce_add, +) +from tilert.models.deepseek_v3_2.ops.projo_wkvb import ProjoWKVbAlgorithm, projo_wkvb +from tilert.models.deepseek_v3_2.ops.projq_wqb import ProjqWqbAlgorithm, projq_wqb +from tilert.models.deepseek_v3_2.ops.projx_wis import ProjxWisAlgorithm, projx_wis +from tilert.models.deepseek_v3_2.ops.qkv_rope import ( + QKVRoPE, + QKVRoPEAlgorithm, + QKVRoPERefWeightsAlias, + QKVRoPETilertWeightsAlias, + qkv_rope, +) +from tilert.models.deepseek_v3_2.ops.receive_selected_token_ids import ( + receive_selected_token_ids, +) +from tilert.models.deepseek_v3_2.ops.rmsnorm_expert_proj import ( + RMSNormExpertProj, + RMSNormExpertProjAlgorithm, +) +from tilert.models.deepseek_v3_2.ops.rmsnorm_head_proj import ( + RMSNormHeadProj, + RMSNormHeadProjAlgorithm, +) +from tilert.models.deepseek_v3_2.ops.rmsnorm_kv import KVRMSNormAlgorithm, rmsnorm_kv +from tilert.models.deepseek_v3_2.ops.rmsnorm_projq_wqb import ( + RmsnormProjqWqb, + 
RmsnormProjqWqbAlgorithm, + RmsnormProjqWqbWeightsConverter, +) +from tilert.models.deepseek_v3_2.ops.rmsnorm_projq_wqi import ( + RmsnormProjqWqi, + RmsnormProjqWqiAlgorithm, + RmsnormProjqWqiWeightsConverter, +) +from tilert.models.deepseek_v3_2.ops.rmsnorm_projx_wqakis import ( + RMSNormProjxWqakis, + RMSNormProjxWqakisAlgorithm, +) +from tilert.models.deepseek_v3_2.ops.rmsnorm_projx_wqkva import ( + RMSNormProjxWqkva, + RMSNormProjxWqkvaAlgorithm, +) +from tilert.models.deepseek_v3_2.ops.rmsnorm_quant import rmsnorm_quant +from tilert.models.deepseek_v3_2.ops.rmsnorm_up_gate_silu import ( + RMSNormUpGateSiLU, + RMSNormUpGateSiLUAlgorithm, +) +from tilert.models.deepseek_v3_2.ops.rotate import ( + Rotate, + RotateAlgorithm, + RotateRefWeightsAlias, + RotateTilertWeightsAlias, + rotate, + rotate_activation, +) +from tilert.models.deepseek_v3_2.ops.sparse_index import sparse_index, sparse_index_topk +from tilert.models.deepseek_v3_2.ops.topk import TopK, topk_accurate, topk_approximate +from tilert.models.deepseek_v3_2.ops.unproj_o_allreduce import ( + UnProjOAllReduce, + UnProjOAllReduceAlgorithm, + unproj_o_allreduce, +) + +__all__ = [ + "down_allreduce", + "DownAllReduce", + "DownAllReduceAlgorithm", + "expert_down_allreduce", + "ExpertDownAllReduce", + "ExpertDownAllReduceAlgorithm", + "rmsnorm_kv", + "KVRMSNormAlgorithm", + "unproj_o_allreduce", + "projo_wkvb", + "ProjoWKVbAlgorithm", + "projq_wqb", + "ProjqWqbAlgorithm", + "rotate", + "rotate_activation", + "Rotate", + "RotateAlgorithm", + "RotateRefWeightsAlias", + "RotateTilertWeightsAlias", + "layernorm_rope_rotate", + "LayerNormRoPERotateAlgorithm", + "TopK", + "topk_approximate", + "topk_accurate", + "sparse_index", + "sparse_index_topk", + "flash_sparse_mla", + "FlashSparseMLACombineAlgorithm", + "projx_wis", + "ProjxWisAlgorithm", + "qkv_rope", + "QKVRoPE", + "QKVRoPEAlgorithm", + "QKVRoPERefWeightsAlias", + "QKVRoPETilertWeightsAlias", + "eh_proj_allreduce", + "EHProjAllReduceAlgorithm", + 
"rmsnorm_quant", + "RmsnormProjqWqi", + "RmsnormProjqWqiAlgorithm", + "RmsnormProjqWqiWeightsConverter", + "RMSNormExpertProj", + "RMSNormExpertProjAlgorithm", + "RMSNormProjxWqakis", + "RMSNormProjxWqakisAlgorithm", + "RMSNormProjxWqkva", + "RMSNormProjxWqkvaAlgorithm", + "RMSNormUpGateSiLU", + "RMSNormUpGateSiLUAlgorithm", + "UnProjOAllReduce", + "UnProjOAllReduceAlgorithm", + "RMSNormHeadProj", + "RMSNormHeadProjAlgorithm", + "ExpertSelectUpGateSiLU", + "ExpertSelectUpGateSiLUAlgorithm", + "PaddedAllReduceAdd", + "PaddedAllReduceAddAlgorithm", + "padded_allreduce_add", + "broadcast_selected_token_ids", + "receive_selected_token_ids", +] diff --git a/tilert/models/deepseek_v3_2/ops/broadcast_selected_token_ids.py b/tilert/models/deepseek_v3_2/ops/broadcast_selected_token_ids.py new file mode 100644 index 0000000..f6bf2a8 --- /dev/null +++ b/tilert/models/deepseek_v3_2/ops/broadcast_selected_token_ids.py @@ -0,0 +1,36 @@ +"""BroadcastSelectedTokenIds — P2P broadcast of idx_selects from GPU 0 to peers.""" + +import torch + +__all__ = [ + "broadcast_selected_token_ids", +] + + +def broadcast_selected_token_ids( + idx_selects: torch.Tensor, + peer_bufs: torch.Tensor, + flag_val: int, + profile_logs: torch.Tensor, + model_arch: str, + compute_kernel_type: str = "bf16", +) -> None: + """Broadcast idx_selects [1,S,2048] int32 from GPU 0 to peer GPUs. + + Args: + idx_selects: Source tensor [1, S, 2048] int32 on GPU 0. + peer_bufs: Device pointer array [N] int64 — each entry is a peer + buffer address. + flag_val: Synchronization flag value. + profile_logs: Profile logs tensor. + model_arch: Model architecture ("deepseek_v3_2" or "glm_5"). + compute_kernel_type: Compute kernel type ("bf16"). 
+ """ + torch.ops.tilert.broadcast_selected_token_ids_op( + idx_selects, + peer_bufs, + flag_val, + model_arch, + compute_kernel_type, + profile_logs, + ) diff --git a/python/models/deepseek_v3_2/ops/down_allreduce.py b/tilert/models/deepseek_v3_2/ops/down_allreduce.py similarity index 82% rename from python/models/deepseek_v3_2/ops/down_allreduce.py rename to tilert/models/deepseek_v3_2/ops/down_allreduce.py index dfb5a81..cd81461 100644 --- a/python/models/deepseek_v3_2/ops/down_allreduce.py +++ b/tilert/models/deepseek_v3_2/ops/down_allreduce.py @@ -1,10 +1,8 @@ """DownAllreduce operation module.""" -from collections.abc import Callable from dataclasses import dataclass from enum import Enum -# import torch.nn.functional as F import torch from tilert.models.base import TileRTModule @@ -17,7 +15,6 @@ __all__ = [ "down_allreduce", - "down_allreduce_glm5", "DownAllReduceAlgorithm", "DownAllReduce", "DownAllReduceTilertWeightsAlias", @@ -32,6 +29,8 @@ def down_allreduce( flag: int, vec_out: torch.Tensor, profile_logs: torch.Tensor, + model_arch: str, + compute_kernel_type: str = "bf16", ) -> None: """ Fused operation of down and allreduce. @@ -43,10 +42,9 @@ def down_allreduce( x_in: Input tensor. flag: Input flag. vec_out: Output tensor. - profile_logs: Profile logs tensor. This is a 1D tensor of shape - (num_sms,) to store the profile logs of the down_allreduce - operation, where num_sms is the number of SMs on the - device. + profile_logs: Profile logs tensor (1D). + model_arch: Model architecture ("deepseek_v3_2" or "glm_5"). + compute_kernel_type: Compute kernel type ("bf16"). """ torch.ops.tilert.down_allreduce_op( vec_in, @@ -55,41 +53,8 @@ def down_allreduce( x_in, flag, vec_out, - profile_logs, - ) - - -def down_allreduce_glm5( - vec_in: torch.Tensor, - mat_in: torch.Tensor, - mat_scale: torch.Tensor, - x_in: torch.Tensor, - flag: int, - vec_out: torch.Tensor, - profile_logs: torch.Tensor, -) -> None: - """ - Fused operation of down and allreduce. 
- - Args: - vec_in: Input tensor. - mat_in: Input tensor. - mat_scale: Input tensor. - x_in: Input tensor. - flag: Input flag. - vec_out: Output tensor. - profile_logs: Profile logs tensor. This is a 1D tensor of shape - (num_sms,) to store the profile logs of the down_allreduce - operation, where num_sms is the number of SMs on the - device. - """ - torch.ops.tilert.down_allreduce_glm5_op( - vec_in, - mat_in, - mat_scale, - x_in, - flag, - vec_out, + model_arch, + compute_kernel_type, profile_logs, ) @@ -121,6 +86,11 @@ def __call__(self) -> list[str]: class DownAllReduce(TileRTModule): """DownAllReduce module""" + _SUPPORTED_ALGORITHMS = { + "deepseek_v3_2": [DownAllReduceAlgorithm.GENERAL], + "glm_5": [DownAllReduceAlgorithm.GENERAL], + } + def __init__( self, model_args: ModelArgs, @@ -142,7 +112,6 @@ def __init__( self.moe_inter_dim = self.model_args.moe_inter_dim self.moe_inter_dim_per_device = self.moe_inter_dim // self.num_devices self.inter_dim_per_device = self.inter_dim // self.num_devices - # effective number of experts self.n_experts: int = self.inter_dim_per_device // self.moe_inter_dim_per_device self.block_size = self.model_args.block_size self.dim_scale_dim = self.dim // self.block_size @@ -150,39 +119,30 @@ def __init__( self.moe_inter_scale_dim_per_device = self.moe_inter_dim_per_device // self.block_size self.algorithm = algorithm - # reference weights + if self.arch_name in ("deepseek_v3_2", "glm_5"): + self.compute_kernel_type = "bf16" + else: + raise ValueError(f"Unsupported architecture: {self.arch_name}") + + self.model_arch = self.arch_name + self.ref_down: torch.Tensor | None = None - # tilert weights self.tilert_weights: torch.Tensor | None = None self.tilert_scales: torch.Tensor | None = None - # tilert vars self.hidden_out: torch.Tensor | None = None self.profile_logs: torch.Tensor | None = None self.is_init = False - # tilert_funcs - self.down_allreduce_func: Callable | None = None - - if self.arch_name == "deepseek_v3_2": - 
self.down_allreduce_func = down_allreduce - elif self.arch_name == "glm_5": - self.down_allreduce_func = down_allreduce_glm5 - else: - raise ValueError(f"Unsupported architecture: {self.arch_name}") - self.tilert_weights_alias = DownAllReduceTilertWeightsAlias() - # for device sharding, corresponding to the output of device_sharding - # and input of tilert_forward self.tensor_alias: list[str] = [ "down_weights", "down_scales", ] - # reference tensor aliases self.ref_tensor_alias: list[str] = [ "mlp.down_proj.weight", "mlp.down_proj.weight_scale_inv", @@ -204,7 +164,7 @@ def get_weights_list(self) -> list[torch.Tensor]: def device_sharding( self, weights_dict: dict[str, torch.Tensor], - key_prefix: str, # e.g. model.layers.{layer_id}.mlp + key_prefix: str, ) -> tuple[torch.Tensor, torch.Tensor]: """ Device sharding. @@ -219,7 +179,6 @@ def device_sharding( down_proj_scale_key = f"{key_prefix}.down_proj.weight_scale_inv" down_proj_weight = weights_dict[down_proj_weight_key] down_proj_scale = weights_dict[down_proj_scale_key] - # To match the old convertcode down_proj_weight = down_proj_weight.reshape( self.dim, self.n_experts, self.num_devices, self.moe_inter_dim_per_device ) @@ -254,7 +213,7 @@ def device_sharding( def init_reference_weights( self, state_dict: dict[str, torch.Tensor], - key_prefix: str, # e.g. model.layers.{layer_id}.mlp + key_prefix: str, device_id: int = 0, ) -> None: """ @@ -295,7 +254,6 @@ def init_tilert_vars(self, batch_size: int, seq_len: int, device_id: int = 0) -> batch_size: Batch size. seq_len: Sequence length. 
""" - # tilert vars self.hidden_out = torch.zeros( (batch_size, seq_len, self.dim), dtype=torch.bfloat16, @@ -304,8 +262,10 @@ def init_tilert_vars(self, batch_size: int, seq_len: int, device_id: int = 0) -> self.profile_logs = get_profile_log_tensor(device=f"cuda:{device_id}") self.is_init = True - def init_random_weights(self, device_id: int = 0) -> None: + def init_random_weights(self, device_id: int | None = None) -> None: """Initialize the random weights.""" + if device_id is None: + device_id = self.device_id scale_dtype = torch.float32 if self.arch_name == "glm_5" else torch.bfloat16 down_weights = torch.randn( self.dim, self.inter_dim, dtype=torch.bfloat16, device=f"cuda:{device_id}" @@ -364,9 +324,8 @@ def tilert_forward( x_in: torch.Tensor, flag: int, ) -> torch.Tensor: - assert self.down_allreduce_func is not None assert self.hidden_out is not None - self.down_allreduce_func( + down_allreduce( vec_in, self.tilert_weights, self.tilert_scales, @@ -374,6 +333,8 @@ def tilert_forward( flag, self.hidden_out, self.profile_logs, + self.model_arch, + self.compute_kernel_type, ) return self.hidden_out diff --git a/tilert/models/deepseek_v3_2/ops/eh_proj_allreduce.py b/tilert/models/deepseek_v3_2/ops/eh_proj_allreduce.py new file mode 100644 index 0000000..8a72823 --- /dev/null +++ b/tilert/models/deepseek_v3_2/ops/eh_proj_allreduce.py @@ -0,0 +1,295 @@ +"""EHProjAllReduce operation module.""" + +from dataclasses import dataclass +from enum import Enum + +import torch + +from tilert.models.base import TileRTModule, TilertWeightsConverter +from tilert.models.deepseek_v3_2.model_args import ModelArgs +from tilert.utils import get_profile_log_tensor + +__all__ = [ + "eh_proj_allreduce", + "EHProjAllReduceTilertWeightsAlias", +] + + +def eh_proj_allreduce( + vec_in_enorm: torch.Tensor, + vec_in_hnorm: torch.Tensor, + w_eh: torch.Tensor, + flag: int, + vec_out: torch.Tensor, + profile_logs: torch.Tensor, + model_arch: str, +) -> None: + """ + Fused operation of EHProj 
and allreduce. + + Args: + vec_in_enorm: Input tensor of shape (1, seq_len, 7168). + vec_in_hnorm: Input tensor of shape (1, seq_len, 7168). + w_eh: Input tensor of shape (7168, 1792) or (128, 7, 56, 256). + flag: Input flag. + vec_out: Output tensor of shape (1, seq_len, 7168). + profile_logs: Profile logs tensor (1D). + model_arch: Model architecture string. + """ + compute_kernel_type = "bf16" + torch.ops.tilert.eh_proj_allreduce_op( + vec_in_enorm, + vec_in_hnorm, + w_eh, + flag, + vec_out, + profile_logs, + model_arch, + compute_kernel_type, + torch.empty(0, dtype=torch.int64, device=vec_in_enorm.device), + ) + + +class EHProjAllReduceAlgorithm(Enum): + """EHProjAllReduce algorithm""" + + GENERAL = "general" + + +class EHProjAllReduceWeightsConverter(TilertWeightsConverter): + """EHProj weights converter""" + + def convert_to_general(self, weights_list: list[torch.Tensor]) -> tuple[torch.Tensor]: + """ + Convert the weights to general format. + + Args: + weights_list: List of weights. + + Returns: + Tuple of weights. 
+ """ + args = self.model_args + assert args.arch_name == "deepseek_v3_2" or args.arch_name == "glm_5" + dim = args.dim + num_sms = 128 + dim_per_sm = dim // num_sms + in_dim = dim * 2 + in_dim_per_device = in_dim // self.num_devices + stages = in_dim_per_device // 256 + + with torch.inference_mode(): + (proj_weights,) = weights_list + proj_weights = proj_weights.reshape(num_sms, dim_per_sm, stages, 256) + proj_weights = proj_weights.transpose(1, 2) + return (proj_weights.contiguous(),) + + +@dataclass +class EHProjAllReduceTilertWeightsAlias: + """TileRT weights alias for EHProjAllReduce.""" + + eh_proj_weights = "eh_proj_weights" + + @property + def tilert_tensor_alias(self) -> list[str]: + return [self.eh_proj_weights] + + def __call__(self) -> list[str]: + return self.tilert_tensor_alias + + +class EHProjAllReduce(TileRTModule): + """EHProjAllReduce module""" + + _SUPPORTED_ALGORITHMS = { + "deepseek_v3_2": [EHProjAllReduceAlgorithm.GENERAL], + "glm_5": [EHProjAllReduceAlgorithm.GENERAL], + } + + def __init__( + self, + model_args: ModelArgs, + num_devices: int, + algorithm: EHProjAllReduceAlgorithm = EHProjAllReduceAlgorithm.GENERAL, + ): + super().__init__( + self.__class__.__name__, + model_args=model_args, + num_devices=num_devices, + ) + + self.arch_name = self.model_args.arch_name + self.dim = self.model_args.dim + + self.algorithm = algorithm + + self.ref_proj: torch.Tensor | None = None + + self.tilert_proj: torch.Tensor | None = None + + self.hidden_out: torch.Tensor | None = None + + self.profile_logs: torch.Tensor | None = None + self.is_init = False + + self.tilert_weights_alias = EHProjAllReduceTilertWeightsAlias() + + self.tensor_alias: list[str] = [ + "eh_proj_weights", + ] + + self.ref_tensor_alias: list[str] = [ + "eh_proj.weight", + ] + + @property + def tilert_tensor_alias(self) -> list[str]: + return self.tilert_weights_alias.tilert_tensor_alias + + def get_weights_list(self) -> list[torch.Tensor]: + """ + Get the weights list. 
+ + Returns: + List of weights. + """ + return [self.tilert_proj] + + def device_sharding( + self, + weights_dict: dict[str, torch.Tensor], + key_prefix: str | None = None, + ) -> tuple[torch.Tensor]: + """ + Device sharding. + + Args: + weights_dict: Dictionary of weights. + key_prefix: Key prefix. + Returns: + Tuple of weights. + """ + eh_proj_key = "eh_proj.weight" + if key_prefix is not None: + eh_proj_key = f"{key_prefix}.eh_proj.weight" + + eh_proj_weight = weights_dict[eh_proj_key] + in_dim = eh_proj_weight.shape[1] + out_dim = eh_proj_weight.shape[0] + in_dim_per_device = in_dim // self.num_devices + eh_proj_weight = eh_proj_weight.reshape(out_dim, self.num_devices, in_dim_per_device) + eh_proj_weight = eh_proj_weight.transpose(0, 1) + return (eh_proj_weight.contiguous(),) + + def init_reference_weights( + self, + state_dict: dict[str, torch.Tensor], + key_prefix: str | None = None, + device_id: int = 0, + ) -> None: + """ + Initialize the reference weights. + + Args: + state_dict: State dictionary. + device_id: Device ID. + """ + sharded_list = self.device_sharding(state_dict, key_prefix) + + eh_proj_weight = sharded_list[0][device_id] + + self.ref_proj = eh_proj_weight + + def init_tilert_weights(self, state_dict: dict[str, torch.Tensor]) -> None: + """ + Initialize the tilert weights. + + Args: + state_dict: State dictionary. + """ + assert self.algorithm is not None + (self.tilert_proj,) = EHProjAllReduceWeightsConverter( + self.model_args, self.num_devices + ).dispatch(self.algorithm, [state_dict[alias] for alias in self.tensor_alias]) + + def init_tilert_vars(self, batch_size: int, seq_len: int, device_id: int = 0) -> None: + """ + Initialize the tilert variables. + + Args: + batch_size: Batch size. + seq_len: Sequence length. 
+ """ + self.hidden_out = torch.zeros( + (batch_size, seq_len, self.dim), + dtype=torch.bfloat16, + device=f"cuda:{device_id}", + ) + self.profile_logs = get_profile_log_tensor(device=f"cuda:{device_id}") + self.is_init = True + + def init_random_weights(self, device_id: int | None = None) -> None: + """Initialize the random weights.""" + if device_id is None: + device_id = self.device_id + proj_weights = torch.randn( + self.dim, self.dim * 2, dtype=torch.bfloat16, device=f"cuda:{device_id}" + ) + + tensor_list = [ + proj_weights, + ] + state_dict = dict(zip(self.ref_tensor_alias, tensor_list)) + + self.init_reference_weights(state_dict, None, device_id) + sharded_list = self.device_sharding(state_dict, None) + sharded_state_dict = { + alias: sharded_list[i][device_id] for i, alias in enumerate(self.tensor_alias) + } + self.init_tilert_weights(sharded_state_dict) + + def golden_forward( + self, + vec_in_enorm: torch.Tensor, + vec_in_hnorm: torch.Tensor, + device_id: int = 0, + ) -> torch.Tensor: + """ + Forward pass for the down-project module. + + Args: + vec_in_enorm: Input vector of shape (1, seq_len, 7168). + vec_in_hnorm: Input vector of shape (1, seq_len, 7168). + + Returns: + Output tensor. 
+ """ + assert self.ref_proj is not None + bsz = vec_in_enorm.shape[0] + assert bsz == 1 + + vec_in_concat = torch.cat([vec_in_enorm, vec_in_hnorm], dim=-1) + dim_per_device = (self.dim * 2) // self.num_devices + vec_in_slice = vec_in_concat[ + ..., dim_per_device * device_id : dim_per_device * device_id + dim_per_device + ] + return vec_in_slice @ self.ref_proj.T + + def tilert_forward( + self, + vec_in_enorm: torch.Tensor, + vec_in_hnorm: torch.Tensor, + flag: int, + ) -> torch.Tensor: + assert self.hidden_out is not None + eh_proj_allreduce( + vec_in_enorm, + vec_in_hnorm, + self.tilert_proj, + flag, + self.hidden_out, + self.profile_logs, + model_arch=self.model_args.arch_name, + ) + return self.hidden_out diff --git a/tilert/models/deepseek_v3_2/ops/expert_down_allreduce.py b/tilert/models/deepseek_v3_2/ops/expert_down_allreduce.py new file mode 100644 index 0000000..f7ff7b7 --- /dev/null +++ b/tilert/models/deepseek_v3_2/ops/expert_down_allreduce.py @@ -0,0 +1,500 @@ +"""ExpertDownAllreduce operation module.""" + +from dataclasses import dataclass +from enum import Enum + +import torch + +from tilert.models.base import TileRTModule, TilertWeightsConverter +from tilert.models.common import weight_dequant +from tilert.models.deepseek_v3_2.model_args import ModelArgs +from tilert.utils import get_profile_log_tensor + +__all__ = [ + "expert_down_allreduce", + "ExpertDownAllReduceAlgorithm", + "ExpertDownAllReduce", + "ExpertDownAllReduceTilertWeightsAlias", +] + +VALID_SEQ_LENS = (1, 2, 4) + + +def expert_down_allreduce( + vec_in: torch.Tensor, + mat_in: torch.Tensor, + mat_scale: torch.Tensor, + indices: torch.Tensor, + scores: torch.Tensor, + x_in: torch.Tensor, + flag: int, + vec_out: torch.Tensor, + profile_logs: torch.Tensor, + model_arch: str, + compute_kernel_type: str = "bf16", +) -> None: + """ + Fused expert down + allreduce (unified for DSv32 and GLM5). + + Args: + vec_in: [1, seq_len, n_experts, 256], bfloat16. 
+ mat_in: [n_experts, dim, 256], float8_e4m3fn. + mat_scale: [n_experts, 1024, 2], bfloat16 (DSv32) or float32 (GLM5). + indices: [1, seq_len, 8], int32. + scores: [1, seq_len, 8], float32. + x_in: [1, seq_len, dim], bfloat16. + flag: User flag. + vec_out: [1, seq_len, dim], bfloat16 (output). + profile_logs: 1D tensor for profile logs. + compute_kernel_type: "bf16". + """ + torch.ops.tilert.expert_down_allreduce_op( + vec_in, + mat_in, + mat_scale, + indices, + scores, + x_in, + flag, + vec_out, + profile_logs, + model_arch, + compute_kernel_type, + ) + + +class ExpertDownAllReduceAlgorithm(Enum): + """ExpertDownAllReduce algorithm.""" + + GENERAL = "general" + BF16MMA = "bf16mma" + + +class ExpertDownAllReduceWeightsConverter(TilertWeightsConverter): + """ExpertDownAllReduce weights converter.""" + + @staticmethod + def _swizzle_qmma_16x32(mat_in: torch.Tensor) -> torch.Tensor: + assert mat_in.shape[-2] == 16 and mat_in.shape[-1] == 32 + assert mat_in.dtype == torch.float8_e4m3fn + pre_shape = mat_in.shape[:-2] + mat_in = mat_in.reshape(*pre_shape, 2, 8, 2, 4, 4).transpose(-4, -3).transpose(-5, -4) + return mat_in.reshape(*pre_shape, 2 * 2, 8 * 4, 4).transpose(-3, -2) + + @staticmethod + def _swizzle_qmma_8x32(mat_in: torch.Tensor) -> torch.Tensor: + assert mat_in.shape[-2] == 8 and mat_in.shape[-1] == 32 + pre_shape = mat_in.shape[:-2] + return mat_in.reshape(*pre_shape, 8, 2, 4, 4).transpose(-2, -3).contiguous() + + @staticmethod + def _swizzle_bf16mma_full_16x32(mat_in: torch.Tensor) -> torch.Tensor: + """Swizzle a (16, 32) FP8 sub-block for the BF16 MMA kernel.""" + assert mat_in.shape[-2] == 16 and mat_in.shape[-1] == 32 + assert mat_in.dtype == torch.float8_e4m3fn + pre = mat_in.shape[:-2] + mat = mat_in.reshape(*pre, 2, 8, 2, 2, 4, 2) + n = len(pre) + mat = mat.permute(*range(n), 1 + n, 4 + n, 2 + n, 3 + n, 0 + n, 5 + n) + return mat.reshape(*pre, 32, 16).contiguous() + + @staticmethod + def _swizzle_bf16mma_partial_8x32(mat_in: torch.Tensor) -> 
torch.Tensor: + """Swizzle a (8, 32) FP8 partial sub-block for the BF16 MMA kernel.""" + assert mat_in.shape[-2] == 8 and mat_in.shape[-1] == 32 + assert mat_in.dtype == torch.float8_e4m3fn + pre = mat_in.shape[:-2] + mat = mat_in.reshape(*pre, 8, 2, 2, 4, 2) + n = len(pre) + mat = mat.permute(*range(n), 0 + n, 3 + n, 1 + n, 2 + n, 4 + n) + return mat.reshape(*pre, 32, 8).contiguous() + + def convert_to_general( + self, weights_list: list[torch.Tensor] + ) -> tuple[torch.Tensor, torch.Tensor]: + """Convert weights to general (tilert) format.""" + args = self.model_args + assert args.arch_name in ("deepseek_v3_2", "glm_5") + arch_name = args.arch_name + dim = args.dim + num_sms = 128 + dim_per_sm = dim // num_sms + dim_scale_dim = dim // args.block_size + expert_dim = args.moe_inter_dim // 8 + k_chunks = expert_dim // 32 + scale_cols = expert_dim // args.block_size + + with torch.inference_mode(): + mat_in, scale_in = weights_list + exp_num = mat_in.shape[0] + mat_in_s = mat_in.reshape(exp_num, num_sms, dim_per_sm, expert_dim) + mat_in_0 = ( + mat_in_s[:, :, :16].reshape(exp_num, num_sms, 16, k_chunks, 32).transpose(2, 3) + ) + mat_in_0 = self._swizzle_qmma_16x32(mat_in_0).reshape(exp_num, 128, -1) + mat_in_1 = ( + mat_in_s[:, :, 16:32].reshape(exp_num, num_sms, 16, k_chunks, 32).transpose(2, 3) + ) + mat_in_1 = self._swizzle_qmma_16x32(mat_in_1).reshape(exp_num, 128, -1) + mat_in_2 = ( + mat_in_s[:, :, 32:48].reshape(exp_num, num_sms, 16, k_chunks, 32).transpose(2, 3) + ) + mat_in_2 = self._swizzle_qmma_16x32(mat_in_2).reshape(exp_num, 128, -1) + mats_to_cat = [mat_in_0, mat_in_1, mat_in_2] + if arch_name == "deepseek_v3_2": + mat_in_3 = ( + mat_in_s[:, :, 48:56].reshape(exp_num, num_sms, 8, k_chunks, 32).transpose(2, 3) + ) + mat_in_3 = self._swizzle_qmma_8x32(mat_in_3).reshape(exp_num, 128, -1) + mats_to_cat.append(mat_in_3) + mat_in_swizzled = torch.cat(mats_to_cat, dim=2) + mat_in_swizzled = mat_in_swizzled.reshape(exp_num, dim, expert_dim) + + mat_scale_tilert 
= ( + scale_in.reshape(exp_num, dim_scale_dim, 1, scale_cols) + .repeat(1, 1, 16, 1) + .reshape(exp_num, num_sms, -1) + ) + target_cols_per_sm = 1024 * scale_cols // num_sms + pad_amount = target_cols_per_sm - mat_scale_tilert.shape[-1] + if pad_amount > 0: + padding_zeros = torch.zeros( + (exp_num, num_sms, pad_amount), + dtype=scale_in.dtype, + device=scale_in.device, + ) + mat_scale_tilert = torch.cat([mat_scale_tilert, padding_zeros], dim=2) + mat_scale_tilert = mat_scale_tilert.reshape(exp_num, 1024, scale_cols) + if arch_name == "glm_5": + if mat_scale_tilert.dtype != torch.float32: + print( + "Warning: ExpertDownAllReduceWeightsConverter: " + + f"mat_scale_tilert.dtype: {mat_scale_tilert.dtype} " + + "is not float32, convert to float32." + ) + mat_scale_tilert = mat_scale_tilert.to(torch.float32) + else: + mat_scale_tilert = mat_scale_tilert.to(torch.bfloat16) + return mat_in_swizzled.contiguous(), mat_scale_tilert.contiguous() + + def convert_to_bf16mma( + self, weights_list: list[torch.Tensor] + ) -> tuple[torch.Tensor, torch.Tensor]: + """Pack FP8 weights for the BF16 MMA kernel (DSv32 only).""" + args = self.model_args + assert args.arch_name == "deepseek_v3_2", "BF16 MMA only wired for DSv32." + dim = args.dim + num_sms = 128 + dim_per_sm = dim // num_sms + expert_dim = args.moe_inter_dim // 8 + k_chunks = expert_dim // 32 + scale_cols = expert_dim // args.block_size + assert dim_per_sm == 56, "BF16 MMA layout currently assumes dim_per_sm=56 (DSv32)." 
+ + with torch.inference_mode(): + mat_in, scale_in = weights_list + exp_num = mat_in.shape[0] + mat_per_cta = mat_in.reshape(exp_num, num_sms, dim_per_sm, expert_dim) + + full_part = mat_per_cta[:, :, :48, :] + partial_part = mat_per_cta[:, :, 48:, :] + + full_tiles = full_part.reshape(exp_num, num_sms, 3, 16, k_chunks, 32) + full_tiles = full_tiles.transpose(3, 4) + full_swizzled = self._swizzle_bf16mma_full_16x32(full_tiles) + full_swizzled = full_swizzled.reshape(exp_num, num_sms, 3 * k_chunks * 32 * 16) + + partial_tiles = partial_part.reshape(exp_num, num_sms, 1, 8, k_chunks, 32).transpose( + 3, 4 + ) + partial_swizzled = self._swizzle_bf16mma_partial_8x32(partial_tiles) + partial_swizzled = partial_swizzled.reshape(exp_num, num_sms, k_chunks * 32 * 8) + + mat_swizzled = torch.cat([full_swizzled, partial_swizzled], dim=2) + mat_swizzled = mat_swizzled.reshape(exp_num, dim, expert_dim) + + mat_scale_tilert = ( + scale_in.reshape(exp_num, dim // args.block_size, 1, scale_cols) + .repeat(1, 1, 16, 1) + .reshape(exp_num, num_sms, -1) + ) + target_cols_per_sm = 1024 * scale_cols // num_sms + pad_amount = target_cols_per_sm - mat_scale_tilert.shape[-1] + if pad_amount > 0: + padding_zeros = torch.zeros( + (exp_num, num_sms, pad_amount), + dtype=scale_in.dtype, + device=scale_in.device, + ) + mat_scale_tilert = torch.cat([mat_scale_tilert, padding_zeros], dim=2) + mat_scale_tilert = mat_scale_tilert.reshape(exp_num, 1024, scale_cols) + mat_scale_tilert = mat_scale_tilert.to(torch.bfloat16) + + return mat_swizzled.contiguous(), mat_scale_tilert.contiguous() + + +@dataclass +class ExpertDownAllReduceTilertWeightsAlias: + """TileRT weights alias for ExpertDownAllReduce.""" + + exp_down_weights = "exp_down_weights" + exp_down_scales = "exp_down_scales" + + @property + def tilert_tensor_alias(self) -> list[str]: + return [self.exp_down_weights, self.exp_down_scales] + + def __call__(self) -> list[str]: + return self.tilert_tensor_alias + + +class 
ExpertDownAllReduce(TileRTModule): + """ExpertDownAllReduce module.""" + + _SUPPORTED_ALGORITHMS = { + "deepseek_v3_2": [ + ExpertDownAllReduceAlgorithm.GENERAL, + ExpertDownAllReduceAlgorithm.BF16MMA, + ], + "glm_5": [ExpertDownAllReduceAlgorithm.GENERAL], + } + + def __init__( + self, + model_args: ModelArgs, + device_id: int, + num_devices: int, + algorithm: ExpertDownAllReduceAlgorithm = ExpertDownAllReduceAlgorithm.GENERAL, + ): + super().__init__( + self.__class__.__name__, + model_args=model_args, + device_id=device_id, + num_devices=num_devices, + ) + self.arch_name = self.model_args.arch_name + self.dim = self.model_args.dim + self.n_activated_experts: int = self.model_args.n_activated_experts + self.n_routed_experts: int = self.model_args.n_routed_experts + self.n_shared_experts: int = self.model_args.n_shared_experts + self.moe_inter_dim = self.model_args.moe_inter_dim + self.block_size = self.model_args.block_size + self.algorithm = algorithm + + self.ref_down: torch.Tensor | None = None + self.tilert_weights: torch.Tensor | None = None + self.tilert_scales: torch.Tensor | None = None + self.hidden_out: torch.Tensor | None = None + self.profile_logs: torch.Tensor | None = None + self.is_init = False + + if self.arch_name in ("deepseek_v3_2", "glm_5"): + self.compute_kernel_type = "bf16" + if algorithm == ExpertDownAllReduceAlgorithm.BF16MMA: + self.compute_kernel_type = "bf16mma" + else: + raise ValueError(f"Unsupported architecture: {self.arch_name}") + + self.model_arch = self.arch_name + + self.tilert_weights_alias = ExpertDownAllReduceTilertWeightsAlias() + self.tensor_alias = ["exp_down_weights", "exp_down_scales"] + self.ref_tensor_alias = ( + ["mlp.shared_experts.down_proj.weight"] + + [f"mlp.experts.{i}.down_proj.weight" for i in range(self.n_routed_experts)] + + ["mlp.shared_experts.down_proj.weight_scale_inv"] + + [f"mlp.experts.{i}.down_proj.weight_scale_inv" for i in range(self.n_routed_experts)] + ) + + @property + def 
tilert_tensor_alias(self) -> list[str]: + return self.tilert_weights_alias.tilert_tensor_alias + + def set_algorithm(self, algorithm: Enum) -> None: + """Set algorithm and sync compute_kernel_type for BF16MMA dispatch.""" + super().set_algorithm(algorithm) + if algorithm == ExpertDownAllReduceAlgorithm.BF16MMA: + self.compute_kernel_type = "bf16mma" + elif algorithm == ExpertDownAllReduceAlgorithm.GENERAL: + self.compute_kernel_type = "bf16" + + def get_weights_list(self) -> list[torch.Tensor]: + return [self.tilert_weights, self.tilert_scales] + + @staticmethod + def process_down_weights( + key_prefix: str, + weights_hf: dict[str, torch.Tensor], + num_devices: int, + ) -> tuple[torch.Tensor, torch.Tensor]: + down_proj_weight_key = f"{key_prefix}.down_proj.weight" + down_proj_scale_key = f"{key_prefix}.down_proj.weight_scale_inv" + down_proj_weight = weights_hf[down_proj_weight_key] + down_proj_scale = weights_hf[down_proj_scale_key] + + dim = down_proj_weight.shape[-2] + dim_scale_dim = down_proj_scale.shape[-2] + moe_inter_dim = down_proj_weight.shape[-1] + in_scale_dim = down_proj_scale.shape[-1] + moe_inter_dim_per_device = moe_inter_dim // num_devices + in_scale_dim_per_device = in_scale_dim // num_devices + + down_proj_weight = down_proj_weight.reshape(dim, num_devices, moe_inter_dim_per_device) + down_proj_weight = down_proj_weight.transpose(0, 1).reshape( + num_devices, 1, dim, moe_inter_dim_per_device + ) + down_proj_scale = down_proj_scale.reshape( + dim_scale_dim, num_devices, in_scale_dim_per_device + ) + down_proj_scale = down_proj_scale.transpose(0, 1).reshape( + num_devices, 1, dim_scale_dim, in_scale_dim_per_device + ) + return down_proj_weight, down_proj_scale + + def device_sharding( + self, + weights_dict: dict[str, torch.Tensor], + key_prefix: str, + ) -> tuple[torch.Tensor, torch.Tensor]: + assert self.n_shared_experts == 1, "Only one shared expert is supported" + down_weights_list = [] + down_scales_list = [] + exp_prefix = 
f"{key_prefix}.shared_experts" + down_weights, down_scales = self.process_down_weights( + exp_prefix, weights_dict, self.num_devices + ) + down_weights_list.append(down_weights) + down_scales_list.append(down_scales) + for exp_id in range(self.n_routed_experts): + exp_prefix = f"{key_prefix}.experts.{exp_id}" + down_weights, down_scales = self.process_down_weights( + exp_prefix, weights_dict, self.num_devices + ) + down_weights_list.append(down_weights) + down_scales_list.append(down_scales) + down_weights = torch.cat(down_weights_list, dim=1) + down_scales = torch.cat(down_scales_list, dim=1) + return down_weights.contiguous(), down_scales.contiguous() + + def init_reference_weights( + self, + state_dict: dict[str, torch.Tensor], + key_prefix: str, + device_id: int = 0, + ) -> None: + sharded_list = self.device_sharding(state_dict, key_prefix) + down_weights = sharded_list[0][device_id] + down_scales = sharded_list[1][device_id] + + down_list = [ + weight_dequant(down_weight, down_scale) + for down_weight, down_scale in zip(down_weights, down_scales) + ] + self.ref_down = torch.stack([t.to(torch.bfloat16) for t in down_list], dim=0) + + def get_tilert_weights_alias(self) -> list[str]: + """Return the alias list keyed into ``state_dict`` for this op.""" + return list(self.tilert_weights_alias()) + + def init_tilert_weights(self, state_dict: dict[str, torch.Tensor]) -> None: + assert self.algorithm is not None, "Algorithm is not set" + self.tilert_weights, self.tilert_scales = ExpertDownAllReduceWeightsConverter( + self.model_args, self.num_devices + ).dispatch(self.algorithm, [state_dict[alias] for alias in self.tensor_alias]) + + def init_tilert_vars(self, batch_size: int, seq_len: int, device_id: int = 0) -> None: + self.hidden_out = torch.zeros( + (batch_size, seq_len, self.dim), + dtype=torch.bfloat16, + device=f"cuda:{device_id}", + ) + self.profile_logs = get_profile_log_tensor(device=f"cuda:{device_id}") + self.is_init = True + + def 
init_random_weights(self, device_id: int | None = None) -> None: + if device_id is None: + device_id = self.device_id + n = self.n_routed_experts + 1 + dev = f"cuda:{device_id}" + down_weights = list( + torch.randn(n, self.dim, self.moe_inter_dim, dtype=torch.bfloat16, device=dev) + .to(torch.float8_e4m3fn) + .unbind(0) + ) + dim_scale_dim = self.dim // self.block_size + moe_inter_dim_scale_dim = self.moe_inter_dim // self.block_size + scale_dtype = torch.float32 if self.arch_name == "glm_5" else torch.bfloat16 + down_scales = list( + torch.randn( + n, dim_scale_dim, moe_inter_dim_scale_dim, dtype=scale_dtype, device=dev + ).unbind(0) + ) + state_dict = dict( + zip( + self.ref_tensor_alias, + [*down_weights, *down_scales], + ) + ) + self.init_reference_weights(state_dict, "mlp", device_id) + sharded_list = self.device_sharding(state_dict, "mlp") + sharded_state_dict = { + alias: sharded_list[i][device_id] for i, alias in enumerate(self.tensor_alias) + } + self.init_tilert_weights(sharded_state_dict) + + def golden_forward( + self, + vec_in: torch.Tensor, + indices: torch.Tensor, + scores: torch.Tensor, + ) -> torch.Tensor: + assert self.ref_down is not None + assert vec_in.dim() == 4 and vec_in.size(0) == 1 + seq_len = vec_in.shape[1] + hidden_out_list = [] + for s in range(seq_len): + hidden_out_w2_list = [] + hidden_out_w2_shared = vec_in[0, s, 0].float() @ self.ref_down[0].float().T + hidden_out_w2_list.append(hidden_out_w2_shared) + ref_down_sel = self.ref_down[1:][indices[0, s]] + for i in range(self.n_activated_experts): + hidden_out_w2_sel = vec_in[0, s, i + 1].float() @ ref_down_sel[i].float().T + hidden_out_w2_list.append(hidden_out_w2_sel * scores[0, s, i]) + hidden_out_w2 = torch.stack(hidden_out_w2_list, dim=0).to(torch.bfloat16) + hidden_out_w2 = torch.sum(hidden_out_w2, dim=0) + + hidden_out_list.append(hidden_out_w2) + hidden_out = torch.stack(hidden_out_list, dim=0) + return hidden_out[None, ...] 
+ + def tilert_forward( + self, + vec_in: torch.Tensor, + indices: torch.Tensor, + scores: torch.Tensor, + x_in: torch.Tensor, + flag: int, + ) -> torch.Tensor: + assert self.hidden_out is not None + expert_down_allreduce( + vec_in, + self.tilert_weights, + self.tilert_scales, + indices, + scores, + x_in, + flag, + self.hidden_out, + self.profile_logs, + self.model_arch, + self.compute_kernel_type, + ) + return self.hidden_out + + def __call__( + self, + x_in: torch.Tensor, + indices: torch.Tensor, + scores: torch.Tensor, + ) -> torch.Tensor: + return self.golden_forward(x_in, indices, scores) diff --git a/tilert/models/deepseek_v3_2/ops/expert_sel_up_gate_silu.py b/tilert/models/deepseek_v3_2/ops/expert_sel_up_gate_silu.py new file mode 100644 index 0000000..3e663bf --- /dev/null +++ b/tilert/models/deepseek_v3_2/ops/expert_sel_up_gate_silu.py @@ -0,0 +1,713 @@ +"""ExpertSelectUpGateSiLU operation module.""" + +from dataclasses import dataclass +from enum import Enum + +import torch +import torch.nn.functional as F + +from tilert.models.base import TileRTModule, TilertWeightsConverter +from tilert.models.common import weight_dequant +from tilert.models.deepseek_v3_2.model_args import ModelArgs +from tilert.utils import get_profile_log_tensor + +__all__ = [ + "ExpertSelectUpGateSiLUAlgorithm", + "ExpertSelectUpGateSiLU", + "ExpertSelectUpGateSiLURefWeightsAlias", + "ExpertSelectUpGateSiLUTilertWeightsAlias", + "expert_select_up_gate_silu", +] + + +def expert_select_up_gate_silu( + hidden_in: torch.Tensor, + scores_in: torch.Tensor, + bias_in: torch.Tensor, + experts_weights_in: torch.Tensor, + hidden_out: torch.Tensor, + expert_probs_out: torch.Tensor, + expert_indices_out: torch.Tensor, + profile_logs: torch.Tensor, + algorithm: str = "fp8mma", + *, + model_arch: str, +) -> None: + """Expert SelectUpGateSiLU operation.""" + torch.ops.tilert.expert_select_up_gate_silu_op( + hidden_in, + scores_in, + bias_in, + experts_weights_in, + hidden_out, + expert_probs_out, + 
expert_indices_out, + profile_logs, + model_arch, + algorithm, + ) + + +@dataclass +class ExpertSelectUpGateSiLURefWeightsAlias: + """Reference weights alias for ExpertSelectUpGateSiLU.""" + + key_prefix: str = "mlp" + n_routed_experts: int = 256 + + @property + def ref_tensor_alias(self) -> list[str]: + n = self.n_routed_experts + return ( + [f"{self.key_prefix}.gate.e_score_correction_bias"] + + [f"{self.key_prefix}.shared_experts.gate_proj.weight"] + + [f"{self.key_prefix}.experts.{i}.gate_proj.weight" for i in range(n)] + + [f"{self.key_prefix}.shared_experts.up_proj.weight"] + + [f"{self.key_prefix}.experts.{i}.up_proj.weight" for i in range(n)] + + [f"{self.key_prefix}.shared_experts.gate_proj.weight_scale_inv"] + + [f"{self.key_prefix}.experts.{i}.gate_proj.weight_scale_inv" for i in range(n)] + + [f"{self.key_prefix}.shared_experts.up_proj.weight_scale_inv"] + + [f"{self.key_prefix}.experts.{i}.up_proj.weight_scale_inv" for i in range(n)] + ) + + def __call__(self) -> list[str]: + return self.ref_tensor_alias + + +@dataclass +class ExpertSelectUpGateSiLUTilertWeightsAlias: + """TileRT weights alias for ExpertSelectUpGateSiLU.""" + + exp_bias = "exp_bias" + exp_gate_weights = "exp_gate_weights" + exp_gate_scales = "exp_gate_scales" + exp_up_weights = "exp_up_weights" + exp_up_scales = "exp_up_scales" + + @property + def tilert_tensor_alias(self) -> list[str]: + return [ + self.exp_bias, + self.exp_gate_weights, + self.exp_gate_scales, + self.exp_up_weights, + self.exp_up_scales, + ] + + def __call__(self) -> list[str]: + return self.tilert_tensor_alias + + +class ExpertSelectUpGateSiLUAlgorithm(Enum): + """ExpertSelectUpGateSiLU algorithm""" + + FP8MMA = "fp8mma" + FP16MMA = "fp16mma" + BF16MMA = "bf16mma" + + +class ExpertSelectUpGateSiLUWeightsConverter(TilertWeightsConverter): + """ExpertSelectUpGateSiLU weights converter""" + + @staticmethod + def _swizzle_qmma_16x32(mat_in: torch.Tensor) -> torch.Tensor: + assert mat_in.shape[-2] == 16 and 
mat_in.shape[-1] == 32 + assert mat_in.dtype == torch.float8_e4m3fn + pre_shape = mat_in.shape[:-2] + mat_in = mat_in.reshape(*pre_shape, 2, 8, 2, 4, 4).transpose(-4, -3).transpose(-5, -4) + return mat_in.reshape(*pre_shape, 2 * 2, 8 * 4, 4).transpose(-3, -2) + + @staticmethod + def _swizzle_mma_16x32(mat_in: torch.Tensor) -> torch.Tensor: + assert mat_in.shape[-2] == 16 and mat_in.shape[-1] == 32 + pre_shape = mat_in.shape[:-2] + mat_in = mat_in.reshape(*pre_shape, 2, 8, 2, 4, 4).transpose(-4, -3).transpose(-5, -4) + return mat_in.reshape(*pre_shape, 2 * 2, 8 * 4, 4).transpose(-3, -2) + + @staticmethod + def _swizzle_mma_16x16(mat_in: torch.Tensor) -> torch.Tensor: + assert mat_in.shape[-2] == 16 and mat_in.shape[-1] == 16 + pre_shape = mat_in.shape[:-2] + mat_in = mat_in.reshape(*pre_shape, 2, 8, 2, 4, 2).transpose(-4, -3).transpose(-5, -4) + return mat_in.reshape(*pre_shape, 2 * 2, 8 * 4, 2).transpose(-3, -2) + + @staticmethod + def tilert_to_tilert_144sm( + mat_in: torch.Tensor, mat_scale_in: torch.Tensor, mma_type: str | None = None + ) -> torch.Tensor: + """ + Convert tilert weights and scales to tilert_144sm input format. 
+ + Args: + mat_in: tilert weights + mat_scale_in: tilert scales + mma_type: MMA type, None,"16x32" or "16x16" + Returns: + tilert_144sm weights and scales + """ + exp_num = mat_in.shape[0] + assert mat_in.shape == (exp_num, 512, 7168) + assert mat_scale_in.shape == (exp_num, 4, 64) + weights_trt = mat_in.reshape(exp_num, 128, 4, 7168) + weights_w1 = weights_trt[:, :, :2].reshape(exp_num, 256, 7168) + weights_w3 = weights_trt[:, :, 2:].reshape(exp_num, 256, 7168) + weights_w1 = weights_w1.reshape(exp_num, 16, 16, 7, 1024).transpose(2, 3) + weights_w3 = weights_w3.reshape(exp_num, 16, 16, 7, 1024).transpose(2, 3) + if mma_type == "16x32": + weights_w1 = weights_w1.reshape(exp_num, 16, 7, 16, 32, 32).transpose(3, 4) + weights_w1 = ExpertSelectUpGateSiLUWeightsConverter._swizzle_mma_16x32(weights_w1) + weights_w1 = weights_w1.reshape(exp_num, 16, 7, 16, 1024) + weights_w3 = weights_w3.reshape(exp_num, 16, 7, 16, 32, 32).transpose(3, 4) + weights_w3 = ExpertSelectUpGateSiLUWeightsConverter._swizzle_mma_16x32(weights_w3) + weights_w3 = weights_w3.reshape(exp_num, 16, 7, 16, 1024) + elif mma_type == "16x16": + weights_w1 = weights_w1.reshape(exp_num, 16, 7, 16, 64, 16).transpose(3, 4) + weights_w1 = ExpertSelectUpGateSiLUWeightsConverter._swizzle_mma_16x16(weights_w1) + weights_w1 = weights_w1.reshape(exp_num, 16, 7, 16, 1024) + weights_w3 = weights_w3.reshape(exp_num, 16, 7, 16, 64, 16).transpose(3, 4) + weights_w3 = ExpertSelectUpGateSiLUWeightsConverter._swizzle_mma_16x16(weights_w3) + weights_w3 = weights_w3.reshape(exp_num, 16, 7, 16, 1024) + + weights = torch.cat([weights_w1, weights_w3], dim=3) + assert weights.shape == (exp_num, 16, 7, 32, 1024) + weights = weights.reshape(exp_num, 16, 7, 32 * 1024) + + scales_unswizzled = torch.zeros(exp_num, 4, 56) + for i in range(64): + if ((i % 8) * 8 + i // 8) < 56: + scales_unswizzled[..., ((i % 8) * 8 + i // 8)] = mat_scale_in[..., i] + scales_unswizzled = scales_unswizzled.reshape(exp_num, 2, 2, 56) + + scales_w1 = 
scales_unswizzled[:, :, :1].repeat(1, 1, 8, 1).reshape(exp_num, 16, 1, 7, 8) + scales_w1 = scales_w1.transpose(2, 3) + scales_w3 = scales_unswizzled[:, :, 1:].repeat(1, 1, 8, 1).reshape(exp_num, 16, 1, 7, 8) + scales_w3 = scales_w3.transpose(2, 3) + scales = torch.cat([scales_w1, scales_w3], dim=3) + assert scales.shape == (exp_num, 16, 7, 2, 8) + scales = ( + scales.reshape(exp_num, 16, 7, 2 * 8).to(torch.bfloat16).view(dtype=torch.float8_e4m3fn) + ) + weights_and_scales = torch.zeros( + exp_num, 16, 7, 32 * 1024 + 128, dtype=torch.float8_e4m3fn, device=mat_in.device + ) + weights_and_scales[:, :, :, : 32 * 1024].copy_(weights) + weights_and_scales[:, :, :, 32 * 1024 : 32 * 1024 + 32].copy_(scales) + return weights_and_scales + + @staticmethod + def tilert_to_tilert_144sm_mma( + mat_in: torch.Tensor, mat_scale_in: torch.Tensor, mma_type: str = "16x32" + ) -> torch.Tensor: + """ + Convert tilert weights and scales to tilert_144sm_mma input format. + + Args: + mat_in: tilert weights + mat_scale_in: tilert scales + Returns: + tilert_144sm weights and scales + """ + return ExpertSelectUpGateSiLUWeightsConverter.tilert_to_tilert_144sm( + mat_in, mat_scale_in, mma_type + ) + + def convert_to_mma( + self, weights_list: list[torch.Tensor], algorithm: str = "fp8mma" + ) -> tuple[torch.Tensor, torch.Tensor]: + """Convert the weights to mma format.""" + args = self.model_args + dim = args.dim + pages = dim // 1024 + dim_scale_dim = dim // args.block_size + with torch.inference_mode(): + bias_or_gamma, weights_w1, scales_w1, weights_w3, scales_w3 = weights_list + exp_num = weights_w1.shape[0] + moe_rows = weights_w1.shape[1] + n_row_groups = moe_rows // 16 + scale_m_dim = moe_rows // args.block_size + weights_w1 = weights_w1.reshape(exp_num, n_row_groups, 16, pages, 1024).transpose(2, 3) + weights_w3 = weights_w3.reshape(exp_num, n_row_groups, 16, pages, 1024).transpose(2, 3) + if algorithm == "fp8mma": + weights_w1 = weights_w1.reshape(exp_num, n_row_groups, pages, 16, 32, 
32).transpose( + 3, 4 + ) + weights_w1 = self._swizzle_qmma_16x32(weights_w1) + weights_w1 = weights_w1.reshape(exp_num, n_row_groups, pages, 16, 1024) + weights_w3 = weights_w3.reshape(exp_num, n_row_groups, pages, 16, 32, 32).transpose( + 3, 4 + ) + weights_w3 = self._swizzle_qmma_16x32(weights_w3) + weights_w3 = weights_w3.reshape(exp_num, n_row_groups, pages, 16, 1024) + elif algorithm == "fp16mma": + weights_w1 = weights_w1.reshape(exp_num, n_row_groups, pages, 16, 64, 16).transpose( + 3, 4 + ) + weights_w1 = self._swizzle_mma_16x16(weights_w1) + weights_w1 = weights_w1.reshape(exp_num, n_row_groups, pages, 16, 1024) + weights_w3 = weights_w3.reshape(exp_num, n_row_groups, pages, 16, 64, 16).transpose( + 3, 4 + ) + weights_w3 = self._swizzle_mma_16x16(weights_w3) + weights_w3 = weights_w3.reshape(exp_num, n_row_groups, pages, 16, 1024) + else: + raise ValueError(f"Unsupported algorithm: {algorithm}") + weights: torch.Tensor = torch.cat([weights_w1, weights_w3], dim=3) + assert weights.shape == (exp_num, n_row_groups, pages, 32, 1024) + weights = weights.reshape(exp_num, n_row_groups, pages, 32 * 1024) + + scales_per_page = 1024 // args.block_size + repeat_factor = n_row_groups // scale_m_dim + scales_w1 = ( + scales_w1.reshape(exp_num, scale_m_dim, 1, dim_scale_dim) + .repeat(1, 1, repeat_factor, 1) + .reshape(exp_num, n_row_groups, 1, pages, scales_per_page) + ) + scales_w1 = scales_w1.transpose(2, 3) + scales_w3 = ( + scales_w3.reshape(exp_num, scale_m_dim, 1, dim_scale_dim) + .repeat(1, 1, repeat_factor, 1) + .reshape(exp_num, n_row_groups, 1, pages, scales_per_page) + ) + scales_w3 = scales_w3.transpose(2, 3) + scales = torch.cat([scales_w1, scales_w3], dim=3) + assert scales.shape == (exp_num, n_row_groups, pages, 2, scales_per_page) + + if self.model_args.arch_name == "glm_5": + if scales.dtype != torch.float32: + print( + "Warning: ExpertSelectUpGateSiLUWeightsConverter: " + + f"scales.dtype: {scales.dtype} " + + "is not float32, convert to float32." 
+ ) + scales = scales.to(torch.float32) + else: + scales = scales.to(torch.bfloat16) + + scales = scales.reshape(exp_num, n_row_groups, pages, 2 * scales_per_page).view( + dtype=torch.float8_e4m3fn + ) + + weights_and_scales = torch.zeros( + exp_num, + n_row_groups, + pages, + 32 * 1024 + 128, + dtype=torch.float8_e4m3fn, + device=weights_w1.device, + ) + weights_and_scales[:, :, :, : 32 * 1024].copy_(weights) + weights_and_scales[:, :, :, 32 * 1024 : 32 * 1024 + scales.shape[-1]].copy_(scales) + + return bias_or_gamma.float(), weights_and_scales.contiguous() + + def convert_to_fp8mma( + self, weights_list: list[torch.Tensor] + ) -> tuple[torch.Tensor, torch.Tensor]: + """ + Convert the weights to fp8mma format. + + Args: + weights: List of weights. + + Returns: + Tuple of weights. + """ + return self.convert_to_mma(weights_list, "fp8mma") + + def convert_to_fp16mma( + self, weights_list: list[torch.Tensor] + ) -> tuple[torch.Tensor, torch.Tensor]: + """ + Convert the weights to fp16mma format. + + Args: + weights: List of weights. + + Returns: + Tuple of weights. 
+ """ + return self.convert_to_mma(weights_list, "fp16mma") + + def convert_to_bf16mma( + self, weights_list: list[torch.Tensor] + ) -> tuple[torch.Tensor, torch.Tensor]: + """Convert the weights to bf16mma format.""" + return self.convert_to_mma(weights_list, "fp16mma") + + +class ExpertSelectUpGateSiLU(TileRTModule): + """ExpertSelectUpGateSiLU module""" + + _SUPPORTED_ALGORITHMS = { + "deepseek_v3_2": [ + ExpertSelectUpGateSiLUAlgorithm.FP8MMA, + ExpertSelectUpGateSiLUAlgorithm.FP16MMA, + ExpertSelectUpGateSiLUAlgorithm.BF16MMA, + ], + "glm_5": [ + ExpertSelectUpGateSiLUAlgorithm.FP8MMA, + ExpertSelectUpGateSiLUAlgorithm.FP16MMA, + ], + } + + def __init__( + self, + model_args: ModelArgs, + num_devices: int, + device_id: int = 0, + ref_weights_alias: ExpertSelectUpGateSiLURefWeightsAlias | None = None, + tilert_weights_alias: ExpertSelectUpGateSiLUTilertWeightsAlias | None = None, + algorithm: ExpertSelectUpGateSiLUAlgorithm = ExpertSelectUpGateSiLUAlgorithm.FP8MMA, + ): + super().__init__( + self.__class__.__name__, + model_args=model_args, + num_devices=num_devices, + device_id=device_id, + ) + + self.arch_name = self.model_args.arch_name + self.dim = self.model_args.dim + + self.n_activated_experts = self.model_args.n_activated_experts + self.n_routed_experts = self.model_args.n_routed_experts + self.n_shared_experts = self.model_args.n_shared_experts + self.moe_inter_dim = self.model_args.moe_inter_dim + self.n_expert_groups = self.model_args.n_expert_groups + self.n_limited_groups = self.model_args.n_limited_groups + self.route_scale = self.model_args.route_scale + self.block_size = self.model_args.block_size + self.algorithm = algorithm + + self.tilert_weights_alias = ( + tilert_weights_alias + if tilert_weights_alias is not None + else ExpertSelectUpGateSiLUTilertWeightsAlias() + ) + self.ref_weights_alias = ( + ref_weights_alias + if ref_weights_alias is not None + else ExpertSelectUpGateSiLURefWeightsAlias( + key_prefix="mlp", 
n_routed_experts=self.n_routed_experts + ) + ) + + self.ref_bias: torch.Tensor | None = None + self.ref_gate: torch.Tensor | None = None + self.ref_up: torch.Tensor | None = None + + self.tilert_bias: torch.Tensor | None = None + self.tilert_weights: torch.Tensor | None = None + self.tilert_scales = ( + torch.zeros(1, dtype=torch.bfloat16, device=torch.device("cuda")) + if torch.cuda.is_available() + else None + ) + + self.hidden_out: torch.Tensor | None = None + self.expert_probs: torch.Tensor | None = None + self.expert_indices: torch.Tensor | None = None + + self.profile_logs: torch.Tensor | None = None + self.is_init = False + + self._tensor_alias = self.tilert_weights_alias() + self._tilert_tensor_alias = [ + self.tilert_weights_alias.exp_bias, + "exp_upgate_weights", + "exp_upgate_scales", + ] + + @property + def tensor_alias(self) -> list[str]: + return self._tensor_alias + + @property + def tilert_tensor_alias(self) -> list[str]: + """Output weight names for get_weights_list (backward compat).""" + return self._tilert_tensor_alias + + def get_weights_list(self) -> list[torch.Tensor]: + """ + Get the weights list. + + Returns: + List of weights. 
+ """ + return [self.tilert_bias, self.tilert_weights, self.tilert_scales] + + @staticmethod + def process_gate_up_weights( + key_prefix: str, + weights_hf: dict[str, torch.Tensor], + num_devices: int, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + gate_proj_weight_key = f"{key_prefix}.gate_proj.weight" + gate_proj_scale_key = f"{key_prefix}.gate_proj.weight_scale_inv" + up_proj_weight_key = f"{key_prefix}.up_proj.weight" + up_proj_scale_key = f"{key_prefix}.up_proj.weight_scale_inv" + + gate_proj_weight = weights_hf[gate_proj_weight_key] + gate_proj_scale = weights_hf[gate_proj_scale_key] + up_proj_weight = weights_hf[up_proj_weight_key] + up_proj_scale = weights_hf[up_proj_scale_key] + dim = gate_proj_weight.shape[-1] + in_dim = gate_proj_weight.shape[-2] + scale_dim = gate_proj_scale.shape[-1] + in_scale_dim = gate_proj_scale.shape[-2] + in_dim_per_device = in_dim // num_devices + in_scale_dim_per_device = in_scale_dim // num_devices + gate_proj_weight = gate_proj_weight.reshape(num_devices, 1, in_dim_per_device, dim) + gate_proj_scale = gate_proj_scale.reshape( + num_devices, 1, in_scale_dim_per_device, scale_dim + ) + up_proj_weight = up_proj_weight.reshape(num_devices, 1, in_dim_per_device, dim) + up_proj_scale = up_proj_scale.reshape(num_devices, 1, in_scale_dim_per_device, scale_dim) + return gate_proj_weight, gate_proj_scale, up_proj_weight, up_proj_scale + + def device_sharding(self, weights_map: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]: + """ + Device sharding: ref state dict -> tilert sharded tensors (num_devices, ...). + + Args: + weights_map: State dict keyed by ref_weights_alias(). + + Returns: + Dict keyed by tilert_weights_alias() with (num_devices, ...) tensors. 
+ """ + ref_alias = self.ref_weights_alias + key_prefix = ref_alias.key_prefix + + bias_key = f"{key_prefix}.gate.e_score_correction_bias" + bias = weights_map[bias_key] + bias = bias[None, :].repeat(self.num_devices, 1) + + gate_weights_list = [] + gate_scales_list = [] + up_weights_list = [] + up_scales_list = [] + assert self.n_shared_experts == 1, "Only one shared expert is supported" + exp_prefix = f"{key_prefix}.shared_experts" + gate_weights, gate_scales, up_weights, up_scales = self.process_gate_up_weights( + exp_prefix, weights_map, self.num_devices + ) + gate_weights_list.append(gate_weights) + gate_scales_list.append(gate_scales) + up_weights_list.append(up_weights) + up_scales_list.append(up_scales) + + for exp_id in range(self.n_routed_experts): + exp_prefix = f"{key_prefix}.experts.{exp_id}" + gate_weights, gate_scales, up_weights, up_scales = self.process_gate_up_weights( + exp_prefix, weights_map, self.num_devices + ) + gate_weights_list.append(gate_weights) + gate_scales_list.append(gate_scales) + up_weights_list.append(up_weights) + up_scales_list.append(up_scales) + + gate_weights = torch.cat(gate_weights_list, dim=1) + gate_scales = torch.cat(gate_scales_list, dim=1) + up_weights = torch.cat(up_weights_list, dim=1) + up_scales = torch.cat(up_scales_list, dim=1) + tilert_alias = self.tilert_weights_alias + return { + tilert_alias.exp_bias: bias, + tilert_alias.exp_gate_weights: gate_weights, + tilert_alias.exp_gate_scales: gate_scales, + tilert_alias.exp_up_weights: up_weights, + tilert_alias.exp_up_scales: up_scales, + } + + def init_reference_weights( + self, + state_dict: dict[str, torch.Tensor], + device_id: int | None = None, + ) -> None: + """ + Initialize the reference weights. + + Args: + state_dict: State dict keyed by ref_weights_alias(). + device_id: Device ID; defaults to self.device_id. 
+ """ + did = self.device_id if device_id is None else device_id + sharded = self.device_sharding(state_dict) + + tilert_alias = self.tilert_weights_alias + bias = sharded[tilert_alias.exp_bias][did] + gate_weights = sharded[tilert_alias.exp_gate_weights][did] + gate_scales = sharded[tilert_alias.exp_gate_scales][did] + up_weights = sharded[tilert_alias.exp_up_weights][did] + up_scales = sharded[tilert_alias.exp_up_scales][did] + + self.ref_bias = bias + ref_gate_list = [ + weight_dequant(gate_weights[i], gate_scales[i]) for i in range(gate_weights.shape[0]) + ] + ref_up_list = [ + weight_dequant(up_weights[i], up_scales[i]) for i in range(up_weights.shape[0]) + ] + self.ref_gate = torch.stack([t.to(torch.bfloat16) for t in ref_gate_list], dim=0) + self.ref_up = torch.stack([t.to(torch.bfloat16) for t in ref_up_list], dim=0) + + def get_tilert_weights_alias(self) -> list[str]: + """Return the alias list keyed into ``state_dict`` for this op.""" + return list(self.tilert_weights_alias()) + + def init_tilert_weights(self, state_dict: dict[str, torch.Tensor]) -> None: + """Initialize the tilert weights.""" + assert self.algorithm is not None, "Algorithm is not set" + weights_list = [state_dict[alias] for alias in self.tilert_weights_alias()] + converter = ExpertSelectUpGateSiLUWeightsConverter(self.model_args, self.num_devices) + self.tilert_bias, self.tilert_weights = converter.dispatch(self.algorithm, weights_list) + + def init_tilert_vars(self, batch_size: int, seq_len: int, device: str = "cuda") -> None: + """ + Initialize the tilert variables. + + Args: + batch_size: Batch size. + seq_len: Sequence length. 
+ """ + self.hidden_out = torch.zeros( + ( + batch_size, + seq_len, + self.n_activated_experts + self.n_shared_experts, + self.moe_inter_dim // self.num_devices, + ), + dtype=torch.bfloat16, + device=device, + ) + self.expert_probs = torch.zeros( + (batch_size, seq_len, self.n_activated_experts), + dtype=torch.float32, + device=device, + ) + self.expert_indices = torch.zeros( + (batch_size, seq_len, self.n_activated_experts), + dtype=torch.int32, + device=device, + ) + + self.profile_logs = get_profile_log_tensor(device=device) + self.is_init = True + + def init_random_weights(self, device: str = "cuda") -> None: + """ + Initialize the random weights. + + Returns: + None + """ + n = self.n_routed_experts + 1 + bias = torch.randn(self.n_routed_experts, dtype=torch.float32, device=device) + gate_weights = list( + torch.randn(n, self.moe_inter_dim, self.dim, dtype=torch.bfloat16, device=device) + .to(torch.float8_e4m3fn) + .unbind(0) + ) + up_weights = list( + torch.randn(n, self.moe_inter_dim, self.dim, dtype=torch.bfloat16, device=device) + .to(torch.float8_e4m3fn) + .unbind(0) + ) + moe_inter_dim_scale_dim = self.moe_inter_dim // self.block_size + dim_scale_dim = self.dim // self.block_size + scale_dtype = torch.float32 if self.arch_name == "glm_5" else torch.bfloat16 + gate_scales = list( + torch.randn( + n, moe_inter_dim_scale_dim, dim_scale_dim, dtype=scale_dtype, device=device + ).unbind(0) + ) + up_scales = list( + torch.randn( + n, moe_inter_dim_scale_dim, dim_scale_dim, dtype=scale_dtype, device=device + ).unbind(0) + ) + tensor_list = [ + bias, + *gate_weights, + *up_weights, + *gate_scales, + *up_scales, + ] + ref_state_dict = dict(zip(self.ref_weights_alias(), tensor_list)) + self.init_reference_weights(ref_state_dict) + sharded = self.device_sharding(ref_state_dict) + per_device_state = {k: v[self.device_id] for k, v in sharded.items()} + self.init_tilert_weights(per_device_state) + + def _ref_expert_select_glm5(self, scores: torch.Tensor) -> 
tuple[torch.Tensor, torch.Tensor]: + scores = scores.sigmoid() + original_scores = scores + if self.ref_bias is not None: + scores = scores + self.ref_bias + indices = torch.topk(scores, self.n_activated_experts, dim=-1)[1] + indices = indices.view(*original_scores.shape[:-1], self.n_activated_experts) + weights = original_scores.gather(-1, indices) + weights /= weights.sum(dim=-1, keepdim=True) + weights *= self.route_scale + return weights, indices + + def golden_forward( + self, + x_in: torch.Tensor, + scores: torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + assert self.ref_gate is not None + assert self.ref_up is not None + bsz = x_in.shape[0] + seq_len = x_in.shape[1] + assert bsz == 1 + if self.arch_name == "deepseek_v3_2": + weights, indices = self._ref_expert_select_ds(scores) + elif self.arch_name == "glm_5": + weights, indices = self._ref_expert_select_glm5(scores) + else: + raise ValueError(f"Unsupported architecture: {self.arch_name}") + hidden_out_list = [] + for s in range(seq_len): + hidden_out_w1_list = [] + hidden_out_w3_list = [] + hidden_out_w1_shared = x_in[0, s].float() @ self.ref_gate[0].float().T + hidden_out_w3_shared = x_in[0, s].float() @ self.ref_up[0].float().T + hidden_out_w1_list.append(hidden_out_w1_shared) + hidden_out_w3_list.append(hidden_out_w3_shared) + ref_gate_sel = self.ref_gate[1:][indices[0, s]] + ref_up_sel = self.ref_up[1:][indices[0, s]] + for i in range(self.n_activated_experts): + hidden_out_w1_sel = x_in[0, s].float() @ ref_gate_sel[i].float().T + hidden_out_w3_sel = x_in[0, s].float() @ ref_up_sel[i].float().T + hidden_out_w1_list.append(hidden_out_w1_sel) + hidden_out_w3_list.append(hidden_out_w3_sel) + hidden_out_w1 = torch.stack(hidden_out_w1_list, dim=0) + hidden_out_w3 = torch.stack(hidden_out_w3_list, dim=0) + hidden_out = F.silu(hidden_out_w1.float()) * hidden_out_w3.float() + hidden_out = hidden_out.to(torch.bfloat16) + hidden_out_list.append(hidden_out) + hidden_out = 
torch.stack(hidden_out_list, dim=0) + hidden_out = hidden_out[None, ...] + return hidden_out, weights, indices + + def tilert_forward( + self, + x_in: torch.Tensor, + scores: torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Run the kernel.""" + assert self.algorithm is not None, "Algorithm is not set" + expert_select_up_gate_silu( + x_in, + scores, + self.tilert_bias, + self.tilert_weights, + self.hidden_out, + self.expert_probs, + self.expert_indices, + self.profile_logs, + self.algorithm.value, + model_arch=self.model_args.arch_name, + ) + return self.hidden_out, self.expert_probs, self.expert_indices diff --git a/python/models/deepseek_v3_2/ops/flash_sparse_mla.py b/tilert/models/deepseek_v3_2/ops/flash_sparse_mla.py similarity index 88% rename from python/models/deepseek_v3_2/ops/flash_sparse_mla.py rename to tilert/models/deepseek_v3_2/ops/flash_sparse_mla.py index deebddc..4513b5f 100644 --- a/python/models/deepseek_v3_2/ops/flash_sparse_mla.py +++ b/tilert/models/deepseek_v3_2/ops/flash_sparse_mla.py @@ -1,12 +1,12 @@ """Flash Sparse MLA operation module.""" import math +from enum import Enum import torch from tilert.models.base import TileRTModule from tilert.models.deepseek_v3_2.model_args import ModelArgs -from tilert.profiler.utils import parse_profile_log_tensor from tilert.utils import get_profile_log_tensor __all__ = [ @@ -25,6 +25,9 @@ def flash_sparse_mla( output: torch.Tensor, profile_logs: torch.Tensor, split_size: int = 64, + compute_kernel_type: str = "bf16mma", + *, + model_arch: str, ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """Flash Sparse MLA operation for GLM5. 
@@ -60,7 +63,7 @@ def flash_sparse_mla( acc_type = torch.float32 dim = key_value.shape[-1] - max_num_splits = 32 # topk / split_size = 2048/64 + max_num_splits = 32 lse = torch.empty((batch, seqlen, heads), device=device, dtype=acc_type) lse_acc = torch.empty((batch, seqlen, heads, max_num_splits), device=device, dtype=acc_type) @@ -68,44 +71,42 @@ def flash_sparse_mla( batch, seqlen, heads, max_num_splits, dim, device=device, dtype=acc_type ) - if heads == 16: - torch.ops.tilert.flash_sparse_mla_op( - query, - query_pe, - key_value, - key_pe, - indices, - cur_pos, - output, - output_acc, - lse, - lse_acc, - profile_logs, - split_size, - ) - elif heads == 8: - torch.ops.tilert.flash_sparse_mla_glm5_op( - query, - query_pe, - key_value, - key_pe, - indices, - cur_pos, - output, - output_acc, - lse, - lse_acc, - profile_logs, - split_size, - ) - else: + if heads not in (8, 10, 16, 20): raise ValueError(f"Unsupported heads: {heads}") + torch.ops.tilert.flash_sparse_mla_op( + query, + query_pe, + key_value, + key_pe, + indices, + cur_pos, + output, + output_acc, + lse, + lse_acc, + split_size, + model_arch, + compute_kernel_type, + profile_logs, + torch.empty(0, dtype=torch.int64, device=query.device), + ) return lse, lse_acc, output_acc +class FlashSparseMLACombineAlgorithm(Enum): + """FlashSparseMLACombine algorithm.""" + + BF16MMA = "bf16mma" + + class FlashSparseMLACombine(TileRTModule): """Flash Sparse MLA combine module; no weights, uses model_args for scale and config.""" + _SUPPORTED_ALGORITHMS = { + "deepseek_v3_2": [FlashSparseMLACombineAlgorithm.BF16MMA], + "glm_5": [FlashSparseMLACombineAlgorithm.BF16MMA], + } + def __init__( self, model_args: ModelArgs, @@ -239,13 +240,8 @@ def tilert_forward( cur_pos, output, self.profile_logs, + model_arch=self.model_args.arch_name, ) - if self.flag_enable_profiling_log: - # TODO: bug fix for this - torch.cuda.synchronize() - parse_profile_log_tensor( - self.profile_logs, self.get_profile_log_path(), [(self.op_name, 
0.0)] - ) return output def to_tilert_weights(self) -> None: diff --git a/python/models/deepseek_v3_2/ops/layernorm_rope_rotate.py b/tilert/models/deepseek_v3_2/ops/layernorm_rope_rotate.py similarity index 92% rename from python/models/deepseek_v3_2/ops/layernorm_rope_rotate.py rename to tilert/models/deepseek_v3_2/ops/layernorm_rope_rotate.py index b1bd0a7..ae6e6c1 100644 --- a/python/models/deepseek_v3_2/ops/layernorm_rope_rotate.py +++ b/tilert/models/deepseek_v3_2/ops/layernorm_rope_rotate.py @@ -1,6 +1,7 @@ """Layernorm_rope_rotate operation module.""" from dataclasses import dataclass +from enum import Enum import torch import torch.nn.functional as F @@ -9,7 +10,6 @@ from tilert.models.deepseek_v3_2.model_args import ModelArgs from tilert.models.deepseek_v3_2.ops.rotate import rotate_activation from tilert.models.utils import apply_rotary_emb -from tilert.profiler.utils import parse_profile_log_tensor from tilert.utils import get_profile_log_tensor __all__ = [ @@ -28,6 +28,8 @@ def layernorm_rope_rotate( bias: torch.Tensor, freqs_cis: torch.Tensor, profile_logs: torch.Tensor, + model_arch: str, + compute_kernel_type: str = "general", ) -> None: """ Layernorm_rope_rotate operation. 
@@ -69,7 +71,15 @@ def layernorm_rope_rotate( raise ValueError("batch must be 1 in this version") torch.ops.tilert.layernorm_rope_rotate_op( - input_raw, cur_pos, k_cache_raw, weight, bias, freqs_cis, profile_logs + input_raw, + cur_pos, + k_cache_raw, + weight, + bias, + freqs_cis, + model_arch, + compute_kernel_type, + profile_logs, ) @@ -103,9 +113,20 @@ def __call__(self) -> list[str]: return self.tilert_tensor_alias +class LayerNormRoPERotateAlgorithm(Enum): + """LayerNormRoPERotate algorithm.""" + + GENERAL = "general" + + class LayerNormRoPERotate(TileRTModule): """LayerNormRoPERotate module: LayerNorm + RoPE + rotate on K indexer output.""" + _SUPPORTED_ALGORITHMS = { + "deepseek_v3_2": [LayerNormRoPERotateAlgorithm.GENERAL], + "glm_5": [LayerNormRoPERotateAlgorithm.GENERAL], + } + def __init__( self, model_args: ModelArgs, @@ -194,7 +215,7 @@ def golden_forward(self, idx_k: torch.Tensor, freqs_cis: torch.Tensor) -> torch. k_pe, k_nope = torch.split( k, [self.rope_head_dim, self.head_dim - self.rope_head_dim], dim=-1 ) - k_pe = apply_rotary_emb(k_pe.unsqueeze(2), freqs_cis).squeeze(2) + k_pe = apply_rotary_emb(k_pe.unsqueeze(2), freqs_cis, interleaved=False).squeeze(2) k = torch.cat([k_pe, k_nope], dim=-1) return rotate_activation(k) @@ -212,11 +233,8 @@ def tilert_forward(self, idx_k: torch.Tensor, freqs_cis: torch.Tensor) -> torch. 
self.tilert_bias, rope_freqs, self.profile_logs, + model_arch=self.model_args.arch_name, ) - if self.flag_enable_profiling_log: - parse_profile_log_tensor( - self.profile_logs, self.get_profile_log_path(), [(self.op_name, 0.0)] - ) return self.output def __call__(self, idx_k: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor: diff --git a/tilert/models/deepseek_v3_2/ops/padded_allreduce_add.py b/tilert/models/deepseek_v3_2/ops/padded_allreduce_add.py new file mode 100644 index 0000000..0ea3221 --- /dev/null +++ b/tilert/models/deepseek_v3_2/ops/padded_allreduce_add.py @@ -0,0 +1,147 @@ +"""PaddedAllReduceAdd operation module.""" + +from enum import Enum + +import torch + +from tilert.models.base import TileRTModule +from tilert.models.deepseek_v3_2.model_args import ModelArgs +from tilert.utils import get_profile_log_tensor + +__all__ = [ + "padded_allreduce_add", + "PaddedAllReduceAdd", +] + + +def padded_allreduce_add( + partial_buf: torch.Tensor, + x_in: torch.Tensor, + flag: int, + vec_out: torch.Tensor, + profile_logs: torch.Tensor, + model_arch: str, + compute_kernel_type: str = "bf16", +) -> None: + """Padded AllReduce + residual add for Device Group A (GPU 0). + + GPU 0 contributes zeros to the 8-GPU AllReduce, then adds the residual. + + Args: + partial_buf: Zero-filled partial buffer [1, L, hidden_dim] bf16. + x_in: Residual input [1, L, hidden_dim] bf16. + flag: AllReduce sync flag. + vec_out: Output tensor [1, L, hidden_dim] bf16. + profile_logs: Profile logs tensor. + model_arch: Model architecture ("deepseek_v3_2" or "glm_5"). + compute_kernel_type: Compute kernel type ("bf16"). 
+ """ + torch.ops.tilert.padded_allreduce_add_op( + partial_buf, x_in, flag, vec_out, profile_logs, model_arch, compute_kernel_type + ) + + +class PaddedAllReduceAddAlgorithm(Enum): + """PaddedAllReduceAdd algorithm.""" + + BF16 = "bf16" + + +class PaddedAllReduceAdd(TileRTModule): + """PaddedAllReduceAdd module — zero-partial AllReduce + residual add.""" + + _SUPPORTED_ALGORITHMS = { + "deepseek_v3_2": [PaddedAllReduceAddAlgorithm.BF16], + "glm_5": [PaddedAllReduceAddAlgorithm.BF16], + } + + def __init__( + self, + model_args: ModelArgs, + num_devices: int, + device_id: int = 0, + ): + super().__init__( + self.__class__.__name__, + model_args=model_args, + num_devices=num_devices, + device_id=device_id, + ) + + self.dim = self.model_args.dim + + self.partial_buf: torch.Tensor | None = None + + self.hidden_out: torch.Tensor | None = None + + self.profile_logs: torch.Tensor | None = None + self.is_var_init = False + + def init_tilert_vars(self, batch_size: int, seq_len: int) -> None: + """Allocate output buffer and persistent zero-filled partial buffer. + + Args: + batch_size: Batch size. + seq_len: Sequence length. + """ + self.hidden_out = torch.zeros( + (batch_size, seq_len, self.dim), + dtype=torch.bfloat16, + device=f"cuda:{self.device_id}", + ) + self.partial_buf = torch.zeros( + (batch_size, seq_len, self.dim), + dtype=torch.bfloat16, + device=f"cuda:{self.device_id}", + ) + self.profile_logs = get_profile_log_tensor(device=f"cuda:{self.device_id}") + self.is_var_init = True + + def golden_forward( + self, + x_in: torch.Tensor, + ) -> torch.Tensor: + """Golden reference: allreduce(zeros) + x_in = x_in (single-GPU). + + On a single GPU, allreduce of zeros returns zeros, so output = x_in. + + Args: + x_in: Residual input [1, L, hidden_dim]. + + Returns: + Output tensor (copy of x_in). + """ + return x_in.clone() + + def tilert_forward( + self, + x_in: torch.Tensor, + flag: int, + ) -> torch.Tensor: + """Run TileRT kernel forward. 
+ + Args: + x_in: Residual input [1, L, hidden_dim]. + flag: AllReduce sync flag. + + Returns: + Output tensor [1, L, hidden_dim]. + """ + assert self.hidden_out is not None + assert self.partial_buf is not None + assert self.profile_logs is not None + padded_allreduce_add( + self.partial_buf, + x_in, + flag, + self.hidden_out, + self.profile_logs, + model_arch=self.model_args.arch_name, + ) + return self.hidden_out + + def __call__( + self, + x_in: torch.Tensor, + ) -> torch.Tensor: + return self.golden_forward(x_in) diff --git a/tilert/models/deepseek_v3_2/ops/projo_wkvb.py b/tilert/models/deepseek_v3_2/ops/projo_wkvb.py new file mode 100644 index 0000000..845bd60 --- /dev/null +++ b/tilert/models/deepseek_v3_2/ops/projo_wkvb.py @@ -0,0 +1,483 @@ +"""ProjOWkvb operation module.""" + +import math +from dataclasses import dataclass +from enum import Enum + +import torch + +from tilert.models.base import TileRTModule, TilertWeightsConverter +from tilert.models.common import init_func, weight_dequant +from tilert.models.deepseek_v3_2.model_args import ModelArgs +from tilert.utils import get_profile_log_tensor + +__all__ = [ + "projo_wkvb", + "ProjoWKVb", + "ProjoWKVbAlgorithm", + "ProjoWKVbWeightsConverter", + "ProjoWKVbRefWeightsAlias", + "ProjoWKVbTilertWeightsAlias", +] + + +def projo_wkvb( + o_in: torch.Tensor, + wkv_b_b: torch.Tensor, + wkv_b_scales: torch.Tensor, + output: torch.Tensor, + profile_logs: torch.Tensor, + model_arch: str, + compute_kernel_type: str = "fp16mma", +) -> None: + """ + Define the ProjOWkvb operation. + + Args: + o_in: Input tensor. + wkv_b_b: Weight tensor. + wkv_b_scales: Scale tensor. + output: Output tensor. + profile_logs: Profile logs tensor. + model_arch: Model architecture ("deepseek_v3_2" or "glm_5"). + compute_kernel_type: Kernel type ("fp16mma" for both DSv32 and GLM5). 
+ """ + torch.ops.tilert.projo_wkvb_op( + o_in, + wkv_b_b, + wkv_b_scales, + output, + model_arch, + compute_kernel_type, + profile_logs, + torch.empty(0, dtype=torch.int64, device=o_in.device), + ) + + +class ProjoWKVbAlgorithm(Enum): + """ProjoWKVb algorithm""" + + GENERAL = "general" + FP16MMA = "fp16mma" + BF16MMA = "bf16mma" + + +class ProjoWKVbWeightsConverter(TilertWeightsConverter): + def __init__(self, model_args: ModelArgs, num_devices: int): + super().__init__(model_args, num_devices) + + @staticmethod + def _swizzle_mma_16x16(mat_in: torch.Tensor) -> torch.Tensor: + """Swizzle a [*, 16, 16] sub-block for the MMA kernel.""" + assert mat_in.shape[-2] == 16 and mat_in.shape[-1] == 16 + pre_shape = mat_in.shape[:-2] + mat_in = mat_in.reshape(*pre_shape, 2, 8, 2, 4, 2).transpose(-4, -3).transpose(-5, -4) + return mat_in.reshape(*pre_shape, 2 * 2, 8 * 4, 2).transpose(-3, -2) + + @staticmethod + def _swizzle_mma_16x16_for_pages(mat_in: torch.Tensor, k_dim: int, pages: int) -> torch.Tensor: + """Swizzle [*, 16, K] matrix for paged MMA layout.""" + assert mat_in.shape[-2] == 16 and mat_in.shape[-1] == k_dim + pre_shape = mat_in.shape[:-2] + k_per_page = k_dim // pages + n_k_tiles = k_per_page // 16 + mat_in = mat_in.reshape(*pre_shape, 16, pages, k_per_page).transpose(-3, -2) + mat_in = mat_in.reshape(*pre_shape, pages, 16, n_k_tiles, 16).transpose(-3, -2) + mat_in = ProjoWKVbWeightsConverter._swizzle_mma_16x16(mat_in) + return mat_in.contiguous() + + def convert_to_fp16mma(self, weights: list[torch.Tensor]) -> torch.Tensor: + """Convert weights to the FP16 MMA packed format.""" + with torch.inference_mode(): + wkv_b_b, wkv_b_b_scales = self.convert_to_general(weights) + + n_heads = wkv_b_b.size(0) + v_head_dim = wkv_b_b.size(1) + kv_lora_rank = wkv_b_b.size(2) + num_ctas = 80 + rows_per_cta = (n_heads * v_head_dim) // num_ctas + + is_glm5 = self.model_args.arch_name == "glm_5" + + w_flat = wkv_b_b.reshape(num_ctas, rows_per_cta // 16, 16, kv_lora_rank) + 
w_swizzled = ProjoWKVbWeightsConverter._swizzle_mma_16x16_for_pages( + w_flat, kv_lora_rank, pages=1 + ) + w_bytes = w_swizzled.reshape(num_ctas, -1) + + scale_k_block = 128 + n_scale_k = kv_lora_rank // scale_k_block + ctas_per_head = num_ctas // n_heads + + if is_glm5: + ctas_per_scale_row = 64 // rows_per_cta + scales_per_cta = wkv_b_b_scales.repeat_interleave(ctas_per_scale_row, dim=1) + scales_per_cta = scales_per_cta.reshape(num_ctas, n_scale_k) + else: + scales_per_cta = wkv_b_b_scales.squeeze(1).repeat_interleave(ctas_per_head, dim=0) + + scale_dtype = torch.float32 + scales_per_cta = scales_per_cta.to(scale_dtype) + + mat_bytes = rows_per_cta * kv_lora_rank + scale_bytes = n_scale_k * 4 + page_size = (mat_bytes + scale_bytes + 127) // 128 * 128 + + scales_raw = scales_per_cta.contiguous().view(torch.float8_e4m3fn) + padding_size = page_size - mat_bytes - scales_raw.shape[-1] + padding = torch.zeros( + num_ctas, padding_size, dtype=torch.float8_e4m3fn, device=wkv_b_b.device + ) + return torch.cat([w_bytes, scales_raw, padding], dim=-1).contiguous() + + def convert_to_bf16mma(self, weights: list[torch.Tensor]) -> torch.Tensor: + """Convert weights to the BF16 MMA packed format.""" + with torch.inference_mode(): + tilert_wkv_b_weights, tilert_wkv_b_scales = weights + + wkvb_head_dim = self.model_args.qk_nope_head_dim + self.model_args.v_head_dim + left_head_dim = wkvb_head_dim % self.model_args.block_size + hd_block = left_head_dim if left_head_dim != 0 else self.model_args.block_size + + if self.model_args.n_heads % self.num_devices == 0: + n_local_heads = self.model_args.n_heads // self.num_devices + else: + n_local_heads = math.ceil(self.model_args.n_heads / self.num_devices) + if n_local_heads % 2 != 0: + n_local_heads += 1 + + v_head_dim = self.model_args.v_head_dim + kv_lora_rank = self.model_args.kv_lora_rank + n_block = self.model_args.block_size + + w = tilert_wkv_b_weights + s = tilert_wkv_b_scales + if self.model_args.n_heads % self.num_devices != 
0: + n_current = w.size(0) + if n_current < n_local_heads: + pad_w = torch.zeros( + n_local_heads - n_current, *w.shape[1:], dtype=w.dtype, device=w.device + ) + w = torch.cat([w, pad_w], dim=0) + pad_s = torch.zeros( + n_local_heads - n_current, *s.shape[1:], dtype=s.dtype, device=s.device + ) + s = torch.cat([s, pad_s], dim=0) + + s = s.float() + s = s.repeat_interleave(hd_block, dim=1).repeat_interleave(n_block, dim=2) + wkv_bf16 = (w.float() * s).to(torch.bfloat16) + n_heads = n_local_heads + + num_ctas = 80 + rows_per_cta = (n_heads * v_head_dim) // num_ctas + + w_flat = wkv_bf16.reshape(num_ctas, rows_per_cta // 16, 16, kv_lora_rank) + w_swizzled = ProjoWKVbWeightsConverter._swizzle_mma_16x16_for_pages( + w_flat, kv_lora_rank, pages=1 + ) + w_bytes = w_swizzled.reshape(num_ctas, -1).contiguous().view(torch.float8_e4m3fn) + + mat_bytes = rows_per_cta * kv_lora_rank * 2 + page_size = (mat_bytes + 127) // 128 * 128 + padding_size = page_size - w_bytes.shape[-1] + + if padding_size > 0: + padding = torch.zeros( + num_ctas, padding_size, dtype=torch.float8_e4m3fn, device=wkv_bf16.device + ) + return torch.cat([w_bytes, padding], dim=-1).contiguous() + return w_bytes.contiguous() + + def convert_to_general(self, weights: list[torch.Tensor]) -> tuple[torch.Tensor, torch.Tensor]: + with torch.inference_mode(): + tilert_wkv_b_weights, tilert_wkv_b_scales = weights + + wkv_b_b = tilert_wkv_b_weights.contiguous() + wkv_b_b_scales = tilert_wkv_b_scales.contiguous() + if self.model_args.arch_name == "glm_5": + if wkv_b_b_scales.dtype != torch.float32: + print( + "Warning: ProjoWKVbWeightsConverter: " + + f"wkv_b_b_scales.dtype: {wkv_b_b_scales.dtype} " + + "is not float32, convert to float32." 
+ ) + wkv_b_b_scales = wkv_b_b_scales.to(torch.float32) + else: + wkv_b_b_scales = wkv_b_b_scales.to(torch.bfloat16) + + wkv_b_b = wkv_b_b.detach() + wkv_b_b_scales = wkv_b_b_scales.detach() + + if self.model_args.n_heads % self.num_devices != 0: + n_target = math.ceil(self.model_args.n_heads / self.num_devices) + if n_target % 2 != 0: + n_target += 1 + n_current = wkv_b_b.size(0) + if n_current < n_target: + pad_b = torch.zeros( + n_target - n_current, + *wkv_b_b.shape[1:], + dtype=wkv_b_b.dtype, + device=wkv_b_b.device, + ) + wkv_b_b = torch.cat([wkv_b_b, pad_b], dim=0) + pad_s = torch.zeros( + n_target - n_current, + *wkv_b_b_scales.shape[1:], + dtype=wkv_b_b_scales.dtype, + device=wkv_b_b_scales.device, + ) + wkv_b_b_scales = torch.cat([wkv_b_b_scales, pad_s], dim=0) + wkv_b_b = wkv_b_b.contiguous() + wkv_b_b_scales = wkv_b_b_scales.contiguous() + + return wkv_b_b, wkv_b_b_scales + + +@dataclass +class ProjoWKVbRefWeightsAlias: + """Reference weights alias for ProjoWKVb.""" + + wkv_b_weights = "self_attn.kv_b_proj.weight" + wkv_b_scales = "self_attn.kv_b_proj.weight_scale_inv" + + @property + def ref_tensor_alias(self) -> list[str]: + return [self.wkv_b_weights, self.wkv_b_scales] + + def __call__(self) -> list[str]: + return self.ref_tensor_alias + + +@dataclass +class ProjoWKVbTilertWeightsAlias: + """TileRT weights alias for ProjoWKVb.""" + + wkv_b_weights = "wkv_b2_weights" + wkv_b_scales = "wkv_b2_scales" + + @property + def tilert_tensor_alias(self) -> list[str]: + return [self.wkv_b_weights, self.wkv_b_scales] + + def __call__(self) -> list[str]: + return self.tilert_tensor_alias + + +class ProjoWKVb(TileRTModule): + """ProjoWKVb module: O projection (wkv_b) for output.""" + + _SUPPORTED_ALGORITHMS = { + "deepseek_v3_2": [ProjoWKVbAlgorithm.FP16MMA], + "glm_5": [ProjoWKVbAlgorithm.FP16MMA], + } + + def __init__( + self, + model_args: ModelArgs, + num_devices: int, + device_id: int = 0, + ref_weights_alias: ProjoWKVbRefWeightsAlias | None = None, + ): + 
super().__init__( + self.__class__.__name__, + model_args=model_args, + num_devices=num_devices, + device_id=device_id, + ) + + self.tilert_weights_alias = ProjoWKVbTilertWeightsAlias() + self.ref_weights_alias = ( + ref_weights_alias if ref_weights_alias is not None else ProjoWKVbRefWeightsAlias() + ) + + self.ref_wkv_b: torch.Tensor | None = None + self.tilert_wkv_b_b: torch.Tensor | None = None + self.tilert_wkv_b_b_scales: torch.Tensor | None = None + self.output: torch.Tensor | None = None + self.profile_logs: torch.Tensor | None = None + + if self.model_args.n_heads % self.num_devices == 0: + self.num_local_heads = self.model_args.n_heads // self.num_devices + else: + n_local = math.ceil(self.model_args.n_heads / self.num_devices) + if n_local % 2 != 0: + n_local += 1 + self.num_local_heads = n_local + + self.wkvb_lora_rank = self.model_args.kv_lora_rank + self.wkvb_lora_rank_qsize = self.wkvb_lora_rank // self.model_args.block_size + + self.wkvb_head_dim = self.model_args.qk_nope_head_dim + self.model_args.v_head_dim + self.wkvb_v_head_dim = self.model_args.v_head_dim + left_head_dim = self.wkvb_head_dim % self.model_args.block_size + if left_head_dim != 0: + assert self.model_args.block_size % left_head_dim == 0 + self.head_dim_block_size = left_head_dim + self.head_dim_scale_repeat = self.model_args.block_size // self.head_dim_block_size + else: + self.head_dim_scale_repeat = 1 + self.head_dim_block_size = self.model_args.block_size + self.wkvb_head_qsize = self.wkvb_head_dim // self.head_dim_block_size + self.wkvb_v_head_qsize = self.wkvb_v_head_dim // self.head_dim_block_size + + self.compute_kernel_type = "fp16mma" + + def get_weights_list(self) -> list[torch.Tensor]: + return [self.tilert_wkv_b_b, self.tilert_wkv_b_b_scales] + + def device_sharding(self, weights_map: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]: + """ + Device sharding: split weights and scales per device. + + Args: + weights_map: Map from ref weight alias to tensor. 
+ + Returns: + Map from tilert weight alias to (num_devices, ...) tensors. + """ + kv_b_proj_weight = weights_map[self.ref_weights_alias.wkv_b_weights] + kv_b_proj_weight_scale = weights_map[self.ref_weights_alias.wkv_b_scales] + + if self.model_args.n_heads % self.num_devices == 0: + dev_weights = kv_b_proj_weight.view( + self.num_devices, self.num_local_heads, self.wkvb_head_dim, self.wkvb_lora_rank + ) + dev_scale_rows = self.num_local_heads * self.wkvb_head_dim // self.model_args.block_size + dev_scales = kv_b_proj_weight_scale.view( + self.num_devices, dev_scale_rows, 1, self.wkvb_lora_rank_qsize + ) + else: + from tilert.models.deepseek_v3_2.ops.rmsnorm_projq_wqb import ( + RmsnormProjqWqbWeightsConverter, + ) + + wq_b_list, scale_list = RmsnormProjqWqbWeightsConverter._redistribute_heads( + kv_b_proj_weight, + kv_b_proj_weight_scale, + n_total_heads=self.model_args.n_heads, + n_local_heads=self.num_local_heads, + num_devices=self.num_devices, + qk_head_dim=self.wkvb_head_dim, + block_size=self.model_args.block_size, + ) + dev_weights = torch.stack(wq_b_list, dim=0).view( + self.num_devices, self.num_local_heads, self.wkvb_head_dim, self.wkvb_lora_rank + ) + dev_scale_rows = self.num_local_heads * self.wkvb_head_dim // self.model_args.block_size + dev_scales = torch.stack(scale_list, dim=0).view( + self.num_devices, dev_scale_rows, 1, self.wkvb_lora_rank_qsize + ) + + wkvb = dev_weights[:, :, -self.wkvb_v_head_dim :] + wkvb_scales = ( + dev_scales.contiguous() + .repeat(1, 1, self.head_dim_scale_repeat, 1) + .view( + self.num_devices, + self.num_local_heads, + self.wkvb_head_qsize, + self.wkvb_lora_rank_qsize, + ) + .contiguous()[:, :, -self.wkvb_v_head_qsize :] + ) + return { + self.tilert_weights_alias.wkv_b_weights: wkvb.contiguous(), + self.tilert_weights_alias.wkv_b_scales: wkvb_scales.contiguous(), + } + + def init_reference_weights(self, state_dict: dict[str, torch.Tensor]) -> None: + sharding_size = self.num_local_heads * self.wkvb_head_dim + 
sharding_start = self.device_id * sharding_size + sharding_end = sharding_start + sharding_size + wkv_b = weight_dequant( + state_dict[self.ref_weights_alias.wkv_b_weights], + state_dict[self.ref_weights_alias.wkv_b_scales], + ) + wkv_b = wkv_b[sharding_start:sharding_end, :] + wkv_b = wkv_b.view(self.num_local_heads, self.wkvb_head_dim, self.wkvb_lora_rank) + self.ref_wkv_b = wkv_b[:, -self.wkvb_v_head_dim :] + + def init_tilert_weights(self, state_dict: dict[str, torch.Tensor]) -> None: + self.init_tilert_weights_hmma(state_dict) + + def init_tilert_weights_hmma(self, state_dict: dict[str, torch.Tensor]) -> None: + """Initialize with HMMA-packed weights.""" + packed = ProjoWKVbWeightsConverter(self.model_args, self.num_devices).dispatch( + ProjoWKVbAlgorithm.FP16MMA, + [ + state_dict[self.tilert_weights_alias.wkv_b_weights], + state_dict[self.tilert_weights_alias.wkv_b_scales], + ], + ) + self.tilert_wkv_b_b = packed + self.tilert_wkv_b_b_scales = torch.empty(1, dtype=torch.float8_e4m3fn, device=packed.device) + self.compute_kernel_type = "fp16mma" + + def init_tilert_weights_hmma_bf16(self, state_dict: dict[str, torch.Tensor]) -> None: + """Initialize with BF16 HMMA-packed weights (dequantized, no scales).""" + packed = ProjoWKVbWeightsConverter(self.model_args, self.num_devices).dispatch( + ProjoWKVbAlgorithm.BF16MMA, + [ + state_dict[self.tilert_weights_alias.wkv_b_weights], + state_dict[self.tilert_weights_alias.wkv_b_scales], + ], + ) + self.tilert_wkv_b_b = packed + self.tilert_wkv_b_b_scales = torch.empty(1, dtype=torch.float8_e4m3fn, device=packed.device) + self.compute_kernel_type = "bf16mma" + + def init_random_weights(self) -> None: + padded_total_heads = self.num_local_heads * self.num_devices + wkv_b = init_func( + torch.empty( + padded_total_heads * self.wkvb_head_dim, + self.wkvb_lora_rank, + dtype=torch.float8_e4m3fn, + ) + ) + wkv_b_scales = init_func( + torch.empty( + padded_total_heads * self.wkvb_head_dim // self.model_args.block_size, + 
self.wkvb_lora_rank_qsize, + dtype=torch.float32, + ) + ) + ref_state_dict = dict( + zip( + self.ref_weights_alias(), + [wkv_b, wkv_b_scales], + ) + ) + self.init_reference_weights(ref_state_dict) + sharded = self.device_sharding(ref_state_dict) + self.init_tilert_weights({k: v[self.device_id] for k, v in sharded.items()}) + + def init_tilert_vars(self, batch_size: int, seq_len: int) -> None: + self.output = torch.zeros( + (batch_size, seq_len, self.num_local_heads, self.wkvb_v_head_dim), + dtype=torch.bfloat16, + ) + self.profile_logs = get_profile_log_tensor() + self.is_var_init = True + + def golden_forward(self, x_out: torch.Tensor) -> torch.Tensor: + assert self.ref_wkv_b is not None + return torch.einsum("bshc,hdc->bshd", x_out, self.ref_wkv_b) + + def tilert_forward(self, x_out: torch.Tensor) -> torch.Tensor: + assert self.tilert_wkv_b_b is not None + assert self.tilert_wkv_b_b_scales is not None + assert self.output is not None + assert self.profile_logs is not None + projo_wkvb( + x_out, + self.tilert_wkv_b_b, + self.tilert_wkv_b_b_scales, + self.output, + self.profile_logs, + model_arch=self.model_args.arch_name, + compute_kernel_type=self.compute_kernel_type, + ) + return self.output diff --git a/python/models/deepseek_v3_2/ops/projq_wqb.py b/tilert/models/deepseek_v3_2/ops/projq_wqb.py similarity index 51% rename from python/models/deepseek_v3_2/ops/projq_wqb.py rename to tilert/models/deepseek_v3_2/ops/projq_wqb.py index 7287aa2..bc2bc12 100644 --- a/python/models/deepseek_v3_2/ops/projq_wqb.py +++ b/tilert/models/deepseek_v3_2/ops/projq_wqb.py @@ -1,5 +1,6 @@ """ProjQB operation module.""" +import math from dataclasses import dataclass from enum import Enum @@ -8,7 +9,6 @@ from tilert.models.base import TileRTModule, TilertWeightsConverter from tilert.models.common import init_func, weight_dequant from tilert.models.deepseek_v3_2.model_args import ModelArgs -from tilert.profiler.utils import parse_profile_log_tensor from tilert.utils import 
get_profile_log_tensor __all__ = [ @@ -27,6 +27,9 @@ def projq_wqb( wkv_b_a_scales: torch.Tensor, output: torch.Tensor, profile_logs: torch.Tensor, + compute_kernel_type: str = "fp16mma", + *, + model_arch: str, ) -> None: """ Define the ProjqWqb operation. @@ -37,17 +40,26 @@ def projq_wqb( wkv_b_a_scales: Scale tensor. output: Output tensor. profile_logs: Profile logs tensor. + compute_kernel_type: Kernel type ("fp16mma"). + model_arch: Model architecture ("deepseek_v3_2" or "glm_5"). """ - if q_nope_in.shape[-1] == 128: - torch.ops.tilert.projq_wqb_op(q_nope_in, wkv_b_a, wkv_b_a_scales, output, profile_logs) - elif q_nope_in.shape[-1] == 192: - torch.ops.tilert.proj_qb_glm5_op(q_nope_in, wkv_b_a, wkv_b_a_scales, output, profile_logs) + torch.ops.tilert.projq_wqb_op( + q_nope_in, + wkv_b_a, + wkv_b_a_scales, + output, + model_arch, + compute_kernel_type, + profile_logs, + ) class ProjqWqbAlgorithm(Enum): """ProjqWqb algorithm""" GENERAL = "general" + FP16MMA = "fp16mma" + BF16MMA = "bf16mma" class ProjqWqbWeightsConverter(TilertWeightsConverter): @@ -56,11 +68,119 @@ def __init__(self, model_args: ModelArgs, num_devices: int, head_dim_block_size: self.head_dim_block_size = head_dim_block_size self.impl_block_size = 64 + @staticmethod + def _swizzle_mma_16x16(mat_in: torch.Tensor) -> torch.Tensor: + """Swizzle a [*, 16, 16] sub-block for the MMA kernel.""" + assert mat_in.shape[-2] == 16 and mat_in.shape[-1] == 16 + pre_shape = mat_in.shape[:-2] + mat_in = mat_in.reshape(*pre_shape, 2, 8, 2, 4, 2).transpose(-4, -3).transpose(-5, -4) + return mat_in.reshape(*pre_shape, 2 * 2, 8 * 4, 2).transpose(-3, -2) + + @staticmethod + def _swizzle_mma_16x16_for_pages(mat_in: torch.Tensor, k_dim: int, pages: int) -> torch.Tensor: + """Swizzle [*, 16, K] matrix for paged MMA layout.""" + assert mat_in.shape[-2] == 16 and mat_in.shape[-1] == k_dim + pre_shape = mat_in.shape[:-2] + k_per_page = k_dim // pages + n_k_tiles = k_per_page // 16 + mat_in = mat_in.reshape(*pre_shape, 16, 
pages, k_per_page).transpose(-3, -2) + mat_in = mat_in.reshape(*pre_shape, pages, 16, n_k_tiles, 16).transpose(-3, -2) + mat_in = ProjqWqbWeightsConverter._swizzle_mma_16x16(mat_in) + return mat_in.contiguous() + + def convert_to_fp16mma(self, weights: list[torch.Tensor]) -> torch.Tensor: + """Convert weights to the FP16 MMA packed format.""" + with torch.inference_mode(): + wkv_b_a, wkv_b_a_scales = self.convert_to_general(weights) + + n_heads = wkv_b_a.size(0) + head_dim = wkv_b_a.size(2) + kv_lora_rank = wkv_b_a.size(1) + num_ctas = 80 + rows_per_cta = (n_heads * kv_lora_rank) // num_ctas + + is_glm5 = self.model_args.arch_name == "glm_5" + + w_flat = wkv_b_a.reshape(num_ctas, rows_per_cta // 16, 16, head_dim) + w_swizzled = self._swizzle_mma_16x16_for_pages(w_flat, head_dim, pages=1) + w_bytes = w_swizzled.reshape(num_ctas, -1) + + kScalesPerPage = head_dim // 64 + + if is_glm5: + ctas_per_scale_row = 128 // rows_per_cta + scales_expanded = wkv_b_a_scales.repeat_interleave(ctas_per_scale_row, dim=1) + scales_per_cta = scales_expanded.reshape(num_ctas, kScalesPerPage) + scale_dtype = torch.float32 + else: + scales_per_cta = wkv_b_a_scales.reshape(num_ctas, kScalesPerPage) + scale_dtype = torch.bfloat16 + + mat_bytes = rows_per_cta * head_dim + scale_elem_bytes = 4 if scale_dtype == torch.float32 else 2 + scale_bytes = kScalesPerPage * scale_elem_bytes + page_size = (mat_bytes + scale_bytes + 127) // 128 * 128 + + scales_raw = scales_per_cta.to(scale_dtype).contiguous().view(torch.float8_e4m3fn) + padding_size = page_size - mat_bytes - scales_raw.shape[-1] + padding = torch.zeros( + num_ctas, padding_size, dtype=torch.float8_e4m3fn, device=wkv_b_a.device + ) + return torch.cat([w_bytes, scales_raw, padding], dim=-1).contiguous() + + def convert_to_bf16mma(self, weights: list[torch.Tensor]) -> torch.Tensor: + """Convert weights to the BF16 MMA packed format.""" + with torch.inference_mode(): + tilert_wkv_b_weights, tilert_wkv_b_scales = weights + + if 
self.model_args.n_heads % self.num_devices == 0: + n_local_heads = self.model_args.n_heads // self.num_devices + else: + n_local_heads = math.ceil(self.model_args.n_heads / self.num_devices) + if n_local_heads % 2 != 0: + n_local_heads += 1 + + nope_head_dim = self.model_args.qk_nope_head_dim + kv_lora_rank = self.model_args.kv_lora_rank + hd_block = self.head_dim_block_size + n_block = self.model_args.block_size + + s = tilert_wkv_b_scales.float() + s = s.repeat_interleave(hd_block, dim=1).repeat_interleave(n_block, dim=2) + wkv_bf16 = ( + (tilert_wkv_b_weights.float() * s).transpose(1, 2).contiguous().to(torch.bfloat16) + ) + n_heads = n_local_heads + head_dim = nope_head_dim + + num_ctas = 80 + rows_per_cta = (n_heads * kv_lora_rank) // num_ctas + + w_flat = wkv_bf16.reshape(num_ctas, rows_per_cta // 16, 16, head_dim) + w_swizzled = self._swizzle_mma_16x16_for_pages(w_flat, head_dim, pages=1) + w_bytes = w_swizzled.reshape(num_ctas, -1).contiguous().view(torch.float8_e4m3fn) + + mat_bytes = rows_per_cta * head_dim * 2 + page_size = (mat_bytes + 127) // 128 * 128 + padding_size = page_size - w_bytes.shape[-1] + + if padding_size > 0: + padding = torch.zeros( + num_ctas, padding_size, dtype=torch.float8_e4m3fn, device=wkv_bf16.device + ) + return torch.cat([w_bytes, padding], dim=-1).contiguous() + return w_bytes.contiguous() + def convert_to_general(self, weights: list[torch.Tensor]) -> tuple[torch.Tensor, torch.Tensor]: with torch.inference_mode(): tilert_wkv_b_weights, tilert_wkv_b_scales = weights - n_local_heads = self.model_args.n_heads // self.num_devices + if self.model_args.n_heads % self.num_devices == 0: + n_local_heads = self.model_args.n_heads // self.num_devices + else: + n_local_heads = math.ceil(self.model_args.n_heads / self.num_devices) + if n_local_heads % 2 != 0: + n_local_heads += 1 wkv_b = tilert_wkv_b_weights wkv_b_scales_raw = tilert_wkv_b_scales @@ -84,9 +204,8 @@ def convert_to_general(self, weights: list[torch.Tensor]) -> 
tuple[torch.Tensor, + "is not float32, convert to float32." ) wkv_b_a_scales = wkv_b_a_scales.to(torch.float32) - else: # DS v3.2, use bfloat16 for wkv_b_a_scales + else: wkv_b_a_scales = wkv_b_a_scales.to(torch.bfloat16) - # Tiling to fit tilert input if self.head_dim_block_size != self.impl_block_size: repeats = self.head_dim_block_size // self.impl_block_size wkv_b_a_scales = wkv_b_a_scales.repeat(1, 1, repeats).contiguous() @@ -130,6 +249,11 @@ def __call__(self) -> list[str]: class ProjqWqb(TileRTModule): """ProjqWqb module: Q projection (wkv_b) for KV LoRA.""" + _SUPPORTED_ALGORITHMS = { + "deepseek_v3_2": [ProjqWqbAlgorithm.FP16MMA], + "glm_5": [ProjqWqbAlgorithm.FP16MMA], + } + def __init__( self, model_args: ModelArgs, @@ -155,9 +279,16 @@ def __init__( self.output: torch.Tensor | None = None self.profile_logs: torch.Tensor | None = None - self.num_local_heads = self.model_args.n_heads // self.num_devices + self.compute_kernel_type = "fp16mma" + + if self.model_args.n_heads % self.num_devices == 0: + self.num_local_heads = self.model_args.n_heads // self.num_devices + else: + n_local = math.ceil(self.model_args.n_heads / self.num_devices) + if n_local % 2 != 0: + n_local += 1 + self.num_local_heads = n_local - # lora dim and quant block size self.wkvb_lora_rank = self.model_args.kv_lora_rank self.wkvb_lora_rank_qsize = self.wkvb_lora_rank // self.model_args.block_size @@ -194,18 +325,39 @@ def device_sharding(self, weights_map: dict[str, torch.Tensor]) -> dict[str, tor kv_b_proj_weight = weights_map[self.ref_weights_alias.wkv_b_weights] kv_b_proj_weight_scale = weights_map[self.ref_weights_alias.wkv_b_scales] - dev_heads = (self.num_devices, self.num_local_heads) - wkvb = kv_b_proj_weight.view(*dev_heads, self.wkvb_head_dim, self.wkvb_lora_rank)[ - :, :, : self.wkvb_nope_head_dim - ] - wkvb_scales = ( - kv_b_proj_weight_scale.view( - self.num_devices, - self.num_local_heads * self.wkvb_head_dim // self.model_args.block_size, - 1, - 
self.wkvb_lora_rank_qsize, + if self.model_args.n_heads % self.num_devices == 0: + dev_weights = kv_b_proj_weight.view( + self.num_devices, self.num_local_heads, self.wkvb_head_dim, self.wkvb_lora_rank + ) + dev_scale_rows = self.num_local_heads * self.wkvb_head_dim // self.model_args.block_size + dev_scales = kv_b_proj_weight_scale.view( + self.num_devices, dev_scale_rows, 1, self.wkvb_lora_rank_qsize ) - .contiguous() + else: + from tilert.models.deepseek_v3_2.ops.rmsnorm_projq_wqb import ( + RmsnormProjqWqbWeightsConverter, + ) + + wq_b_list, scale_list = RmsnormProjqWqbWeightsConverter._redistribute_heads( + kv_b_proj_weight, + kv_b_proj_weight_scale, + n_total_heads=self.model_args.n_heads, + n_local_heads=self.num_local_heads, + num_devices=self.num_devices, + qk_head_dim=self.wkvb_head_dim, + block_size=self.model_args.block_size, + ) + dev_weights = torch.stack(wq_b_list, dim=0).view( + self.num_devices, self.num_local_heads, self.wkvb_head_dim, self.wkvb_lora_rank + ) + dev_scale_rows = self.num_local_heads * self.wkvb_head_dim // self.model_args.block_size + dev_scales = torch.stack(scale_list, dim=0).view( + self.num_devices, dev_scale_rows, 1, self.wkvb_lora_rank_qsize + ) + + wkvb = dev_weights[:, :, : self.wkvb_nope_head_dim] + wkvb_scales = ( + dev_scales.contiguous() .repeat(1, 1, self.head_dim_scale_repeat, 1) .view( self.num_devices, @@ -233,29 +385,50 @@ def init_reference_weights(self, state_dict: dict[str, torch.Tensor]) -> None: self.ref_wkv_b = wkv_b[:, : self.wkvb_nope_head_dim] def init_tilert_weights(self, state_dict: dict[str, torch.Tensor]) -> None: - self.tilert_wkv_b_a, self.tilert_wkv_b_a_scales = ProjqWqbWeightsConverter( + self.init_tilert_weights_hmma(state_dict) + + def init_tilert_weights_hmma(self, state_dict: dict[str, torch.Tensor]) -> None: + """Initialize with HMMA-packed weights.""" + packed = ProjqWqbWeightsConverter( self.model_args, self.num_devices, self.head_dim_block_size ).dispatch( - ProjqWqbAlgorithm.GENERAL, + 
ProjqWqbAlgorithm.FP16MMA, [ state_dict[self.tilert_weights_alias.wkv_b_weights], state_dict[self.tilert_weights_alias.wkv_b_scales], ], ) + self.tilert_wkv_b_a = packed + self.tilert_wkv_b_a_scales = torch.empty(1, dtype=torch.float8_e4m3fn, device=packed.device) + self.compute_kernel_type = "fp16mma" + + def init_tilert_weights_hmma_bf16(self, state_dict: dict[str, torch.Tensor]) -> None: + """Initialize with BF16 HMMA-packed weights (dequantized, no scales).""" + packed = ProjqWqbWeightsConverter( + self.model_args, self.num_devices, self.head_dim_block_size + ).dispatch( + ProjqWqbAlgorithm.BF16MMA, + [ + state_dict[self.tilert_weights_alias.wkv_b_weights], + state_dict[self.tilert_weights_alias.wkv_b_scales], + ], + ) + self.tilert_wkv_b_a = packed + self.tilert_wkv_b_a_scales = torch.empty(1, dtype=torch.float8_e4m3fn, device=packed.device) + self.compute_kernel_type = "bf16mma" def init_random_weights(self) -> None: + padded_total_heads = self.num_local_heads * self.num_devices wkv_b = init_func( torch.empty( - self.model_args.n_heads * self.wkvb_head_dim, + padded_total_heads * self.wkvb_head_dim, self.wkvb_lora_rank, dtype=torch.float8_e4m3fn, ) ) wkv_b_scales = init_func( torch.empty( - # Block quant should be applied to the original weight dimension (including head - # dimension) - self.model_args.n_heads * self.wkvb_head_dim // self.model_args.block_size, + padded_total_heads * self.wkvb_head_dim // self.model_args.block_size, self.wkvb_lora_rank_qsize, dtype=torch.float32, ) @@ -287,9 +460,7 @@ def tilert_forward(self, q_nope: torch.Tensor) -> torch.Tensor: self.tilert_wkv_b_a_scales, self.output, self.profile_logs, + self.compute_kernel_type, + model_arch=self.model_args.arch_name, ) - if self.flag_enable_profiling_log: - parse_profile_log_tensor( - self.profile_logs, self.get_profile_log_path(), [(self.op_name, 0.0)] - ) return self.output diff --git a/python/models/deepseek_v3_2/ops/projx_wis.py b/tilert/models/deepseek_v3_2/ops/projx_wis.py 
similarity index 62% rename from python/models/deepseek_v3_2/ops/projx_wis.py rename to tilert/models/deepseek_v3_2/ops/projx_wis.py index e264659..ebd7ff3 100644 --- a/python/models/deepseek_v3_2/ops/projx_wis.py +++ b/tilert/models/deepseek_v3_2/ops/projx_wis.py @@ -1,13 +1,13 @@ """ProjxWis operation module.""" from dataclasses import dataclass +from enum import Enum import torch from tilert.models.base import TileRTModule from tilert.models.common import init_func from tilert.models.deepseek_v3_2.model_args import ModelArgs -from tilert.profiler.utils import parse_profile_log_tensor from tilert.utils import get_profile_log_tensor __all__ = [ @@ -22,7 +22,9 @@ def projx_wis( x_in: torch.Tensor, w: torch.Tensor, output: torch.Tensor, + compute_kernel_type: str, profile_logs: torch.Tensor, + model_arch: str, ) -> None: """ Define the ProjxWis operation. @@ -31,12 +33,11 @@ def projx_wis( x_in: Input tensor. w: Weight tensor. output: Output tensor. + compute_kernel_type: Compute kernel type ("bf16" or "bf16mma"). profile_logs: Profile logs tensor. + model_arch: Model architecture ("deepseek_v3_2" or "glm_5"). 
""" - if x_in.shape[-1] == 7168: - torch.ops.tilert.proj_w_op(x_in, w, output, profile_logs) - elif x_in.shape[-1] == 6144: - torch.ops.tilert.proj_w_glm5_op(x_in, w, output, profile_logs) + torch.ops.tilert.proj_w_op(x_in, w, output, model_arch, compute_kernel_type, profile_logs) @dataclass @@ -67,15 +68,33 @@ def __call__(self) -> list[str]: return self.tilert_tensor_alias +class ProjxWisAlgorithm(Enum): + """ProjxWis algorithm.""" + + BF16 = "bf16" + BF16MMA = "bf16mma" + + class ProjxWis(TileRTModule): """ProjxWis module: linear projection for indexer score weights.""" + _SUPPORTED_ALGORITHMS = { + "deepseek_v3_2": [ProjxWisAlgorithm.BF16, ProjxWisAlgorithm.BF16MMA], + "glm_5": [ProjxWisAlgorithm.BF16, ProjxWisAlgorithm.BF16MMA], + } + + _HMMA_CONFIGS = { + 7168: (4, 16, 7), + 6144: (2, 16, 6), + } + def __init__( self, model_args: ModelArgs, num_devices: int, device_id: int = 0, ref_weights_alias: ProjxWisRefWeightsAlias | None = None, + compute_kernel_type: str | None = None, ): super().__init__( self.__class__.__name__, @@ -89,7 +108,6 @@ def __init__( ref_weights_alias if ref_weights_alias is not None else ProjxWisRefWeightsAlias() ) - # Backward compatibility: expose list for load_weights_for_layer etc. 
self.ref_tensor_alias = self.ref_weights_alias.ref_tensor_alias self.ref_w: torch.Tensor | None = None @@ -100,6 +118,33 @@ def __init__( self.dim = model_args.dim self.index_n_heads = model_args.index_n_heads + if compute_kernel_type is not None: + self.compute_kernel_type = compute_kernel_type + else: + self.compute_kernel_type = "bf16" + + @staticmethod + def _swizzle_mma_16x16(mat_in: torch.Tensor) -> torch.Tensor: + """Swizzle a 16x16 BF16 tile for the MMA kernel.""" + assert mat_in.shape[-2] == 16 and mat_in.shape[-1] == 16 + pre_shape = mat_in.shape[:-2] + mat_in = mat_in.reshape(*pre_shape, 2, 8, 2, 4, 2).transpose(-4, -3).transpose(-5, -4) + return mat_in.reshape(*pre_shape, 2 * 2, 8 * 4, 2).transpose(-3, -2) + + @staticmethod + def _to_hmma_layout( + w_orig: torch.Tensor, n_ctas: int, rows_per_cta: int, x_dim: int, num_pages: int + ) -> torch.Tensor: + """Convert [output_dim, x_dim] BF16 weights to the MMA layout.""" + cols_per_page = x_dim // num_pages + n_k_tiles = cols_per_page // 16 + w = w_orig.reshape(n_ctas, rows_per_cta, num_pages, cols_per_page) + w = w.transpose(1, 2) + n_row_tiles = rows_per_cta // 16 + w = w.reshape(n_ctas, num_pages, n_row_tiles, 16, n_k_tiles, 16).transpose(-3, -2) + w = ProjxWis._swizzle_mma_16x16(w) + return w.reshape(n_ctas, -1).contiguous() + @property def tilert_tensor_alias(self) -> list[str]: return self.tilert_weights_alias.tilert_tensor_alias @@ -117,8 +162,14 @@ def device_sharding(self, weights_map: dict[str, torch.Tensor]) -> dict[str, tor Returns: Map from tilert weight alias to (num_devices, ...) tensors. 
""" - w = weights_map[self.ref_weights_alias.w_weights][None, ...].repeat(self.num_devices, 1, 1) - return {self.tilert_weights_alias.w_weights: w} + w = weights_map[self.ref_weights_alias.w_weights] + if self.compute_kernel_type == "bf16mma": + n_ctas, rows_per_cta, num_pages = self._HMMA_CONFIGS[self.dim] + w_hmma = self._to_hmma_layout(w, n_ctas, rows_per_cta, self.dim, num_pages) + w_out = w_hmma[None, ...].repeat(self.num_devices, 1, 1) + else: + w_out = w[None, ...].repeat(self.num_devices, 1, 1) + return {self.tilert_weights_alias.w_weights: w_out} def init_reference_weights(self, state_dict: dict[str, torch.Tensor]) -> None: w = state_dict[self.ref_weights_alias.w_weights] @@ -149,9 +200,12 @@ def tilert_forward(self, x_norm: torch.Tensor) -> torch.Tensor: assert self.tilert_w is not None assert self.output is not None assert self.profile_logs is not None - projx_wis(x_norm, self.tilert_w, self.output, self.profile_logs) - if self.flag_enable_profiling_log: - parse_profile_log_tensor( - self.profile_logs, self.get_profile_log_path(), [(self.op_name, 0.0)] - ) + projx_wis( + x_norm, + self.tilert_w, + self.output, + self.compute_kernel_type, + self.profile_logs, + model_arch=self.model_args.arch_name, + ) return self.output diff --git a/tilert/models/deepseek_v3_2/ops/projx_wqaki.py b/tilert/models/deepseek_v3_2/ops/projx_wqaki.py new file mode 100644 index 0000000..9bc90b5 --- /dev/null +++ b/tilert/models/deepseek_v3_2/ops/projx_wqaki.py @@ -0,0 +1,247 @@ +"""ProjxWqaki operation module.""" + +import torch + +__all__ = [ + "projx_wqaki", + "ProjxWqakiWeightsConverter", +] + + +def projx_wqaki( + x_quant: torch.Tensor, + x_scale: torch.Tensor, + wqaki: torch.Tensor, + out_q: torch.Tensor, + out_ki: torch.Tensor, + profile_logs: torch.Tensor, + compute_kernel_type: str = "fp8mma", + *, + model_arch: str, +) -> None: + """FP8 MMA projection for q, ki. + + Args: + x_quant: FP8 quantized hidden states [1, seq_len, hidden_dim]. 
+ x_scale: Scale factors for x_quant. + wqaki: Packed FP8 weights + scales for q, ki. + out_q: Output q tensor. + out_ki: Output ki tensor. + profile_logs: Profile logs tensor. + compute_kernel_type: Kernel type ("fp8mma", "fp8mma_68cta", "fp8mma_136cta"). + model_arch: Model architecture ("deepseek_v3_2" or "glm_5"). + """ + torch.ops.tilert.projx_wqaki_op( + x_quant, + x_scale, + wqaki, + out_q, + out_ki, + model_arch, + compute_kernel_type, + profile_logs, + torch.empty(0, dtype=torch.int64, device=x_quant.device), + ) + + +class ProjxWqakiWeightsConverter: + """Weight converter for ProjxWqaki kernel.""" + + @staticmethod + def _swizzle_qmma_16x32(mat_in: torch.Tensor) -> torch.Tensor: + assert mat_in.shape[-2] == 16 and mat_in.shape[-1] == 32 + assert mat_in.dtype == torch.float8_e4m3fn + pre_shape = mat_in.shape[:-2] + mat_in = mat_in.reshape(*pre_shape, 2, 8, 2, 4, 4).transpose(-4, -3).transpose(-5, -4) + return mat_in.reshape(*pre_shape, 2 * 2, 8 * 4, 4).transpose(-3, -2) + + @staticmethod + def convert_dsv32( + wq_a: torch.Tensor, + wq_a_scale: torch.Tensor, + wki: torch.Tensor, + wki_scale: torch.Tensor, + ) -> torch.Tensor: + """Convert DSV3.2 weights to the FP8 MMA page layout.""" + with torch.inference_mode(): + wq_a_scale = wq_a_scale.to(torch.bfloat16) + wki_scale = wki_scale.to(torch.bfloat16) + + dim = 7168 + q_rows = 1536 + ki_rows = 128 + total_rows = q_rows + ki_rows + n_blocks = total_rows // 16 + scale_dim = dim // 128 + + n_q_blocks = q_rows // 16 + n_ki_blocks = ki_rows // 16 + wq_a = wq_a.reshape(n_q_blocks, 16, dim) + wq_a_scale = ( + wq_a_scale.reshape(wq_a_scale.shape[0], 1, scale_dim) + .repeat(1, n_q_blocks // wq_a_scale.shape[0], 1) + .reshape(n_q_blocks, scale_dim) + ) + wki = wki.reshape(n_ki_blocks, 16, dim) + wki_scale = ( + wki_scale.reshape(wki_scale.shape[0], 1, scale_dim) + .repeat(1, n_ki_blocks // wki_scale.shape[0], 1) + .reshape(n_ki_blocks, scale_dim) + ) + + wqaki = torch.cat([wq_a, wki], dim=0) + wqaki_scale = 
torch.cat([wq_a_scale, wki_scale], dim=0) + + swizzle = ProjxWqakiWeightsConverter._swizzle_qmma_16x32 + + wqaki_0 = wqaki[..., :2048] + wqaki_0_scale = wqaki_scale[..., :16].contiguous().view(torch.float8_e4m3fn) + wqaki_1 = wqaki[..., 2048:4096] + wqaki_1_scale = wqaki_scale[..., 16:32].contiguous().view(torch.float8_e4m3fn) + wqaki_2 = wqaki[..., 4096:6144] + wqaki_2_scale = wqaki_scale[..., 32:48].contiguous().view(torch.float8_e4m3fn) + wqaki_3 = wqaki[..., 6144:7168] + wqaki_3_scale = wqaki_scale[..., 48:56].contiguous().view(torch.float8_e4m3fn) + + wqaki_0 = wqaki_0.reshape(n_blocks, 16, 64, 32).transpose(1, 2) + wqaki_0 = swizzle(wqaki_0).reshape(n_blocks, 16 * 2048) + + wqaki_1 = wqaki_1.reshape(n_blocks, 16, 64, 32).transpose(1, 2) + wqaki_1 = swizzle(wqaki_1).reshape(n_blocks, 16 * 2048) + + wqaki_2 = wqaki_2.reshape(n_blocks, 16, 64, 32).transpose(1, 2) + wqaki_2 = swizzle(wqaki_2).reshape(n_blocks, 16 * 2048) + + wqaki_3 = wqaki_3.reshape(n_blocks, 16, 32, 32).transpose(1, 2) + wqaki_3 = swizzle(wqaki_3).reshape(n_blocks, 16 * 1024) + + padding_scale0 = torch.zeros( + (n_blocks, 48), dtype=torch.bfloat16, device=wq_a.device + ).view(torch.float8_e4m3fn) + padding_scale1 = torch.zeros( + (n_blocks, 48), dtype=torch.bfloat16, device=wq_a.device + ).view(torch.float8_e4m3fn) + padding_scale2 = torch.zeros( + (n_blocks, 48), dtype=torch.bfloat16, device=wq_a.device + ).view(torch.float8_e4m3fn) + padding_scale3 = torch.zeros( + (n_blocks, 56), dtype=torch.bfloat16, device=wq_a.device + ).view(torch.float8_e4m3fn) + + return torch.cat( + [ + wqaki_0, + wqaki_0_scale, + padding_scale0, + wqaki_1, + wqaki_1_scale, + padding_scale1, + wqaki_2, + wqaki_2_scale, + padding_scale2, + wqaki_3, + wqaki_3_scale, + padding_scale3, + ], + dim=1, + ).contiguous() + + @staticmethod + def convert_glm5_68cta( + wq_a: torch.Tensor, + wq_a_scale: torch.Tensor, + wki: torch.Tensor, + wki_scale: torch.Tensor, + ) -> torch.Tensor: + """Convert GLM5 weights to the FP8 MMA page 
layout (68CTA).""" + with torch.inference_mode(): + wq_a_scale = wq_a_scale.to(torch.float32) + wki_scale = wki_scale.to(torch.float32) + + dim = 6144 + q_rows = 2048 + ki_rows = 128 + total_rows = q_rows + ki_rows + n_blocks = total_rows // 32 + scale_dim = dim // 128 + + n_q_blocks = q_rows // 32 + n_ki_blocks = ki_rows // 32 + + wqaki_raw = torch.cat([wq_a, wki], dim=0).reshape(n_blocks, 32, dim) + + wq_a_scale = ( + wq_a_scale.reshape(wq_a_scale.shape[0], 1, scale_dim) + .repeat(1, n_q_blocks // wq_a_scale.shape[0], 1) + .reshape(n_q_blocks, scale_dim) + ) + wki_scale = ( + wki_scale.reshape(wki_scale.shape[0], 1, scale_dim) + .repeat(1, n_ki_blocks // wki_scale.shape[0], 1) + .reshape(n_ki_blocks, scale_dim) + ) + wqaki_scales = torch.cat([wq_a_scale, wki_scale], dim=0) + + swizzle = ProjxWqakiWeightsConverter._swizzle_qmma_16x32 + + wqaki_raw = wqaki_raw.reshape(n_blocks, 32, 6, 1024).transpose(1, 2) + wqaki_raw = wqaki_raw.reshape(n_blocks, 6, 2, 16, 32, 32).transpose(3, 4) + wqaki_raw = swizzle(wqaki_raw).reshape(n_blocks, 6, 32 * 1024) + wqaki_scales = wqaki_scales.reshape(n_blocks, 6, 8).view(torch.float8_e4m3fn) + wqaki_padding = torch.zeros( + (n_blocks, 6, 128 - wqaki_scales.shape[-1]), + dtype=torch.float8_e4m3fn, + device=wq_a.device, + ) + return torch.cat([wqaki_raw, wqaki_scales, wqaki_padding], dim=-1).contiguous() + + @staticmethod + def convert_glm5_136cta( + wq_a: torch.Tensor, + wq_a_scale: torch.Tensor, + wki: torch.Tensor, + wki_scale: torch.Tensor, + ) -> torch.Tensor: + """Convert GLM5 weights to the FP8 MMA page layout (136CTA).""" + with torch.inference_mode(): + wq_a_scale = wq_a_scale.to(torch.float32) + wki_scale = wki_scale.to(torch.float32) + + dim = 6144 + q_rows = 2048 + ki_rows = 128 + total_rows = q_rows + ki_rows + n_blocks = total_rows // 16 + scale_dim = dim // 128 + + n_q_blocks = q_rows // 16 + n_ki_blocks = ki_rows // 16 + + wq_a = wq_a.reshape(n_q_blocks, 16, dim) + wq_a_scale = ( + 
wq_a_scale.reshape(wq_a_scale.shape[0], 1, scale_dim) + .repeat(1, n_q_blocks // wq_a_scale.shape[0], 1) + .reshape(n_q_blocks, scale_dim) + ) + wki = wki.reshape(n_ki_blocks, 16, dim) + wki_scale = ( + wki_scale.reshape(wki_scale.shape[0], 1, scale_dim) + .repeat(1, n_ki_blocks // wki_scale.shape[0], 1) + .reshape(n_ki_blocks, scale_dim) + ) + + wqaki_raw = torch.cat([wq_a, wki], dim=0) + wqaki_scales = torch.cat([wq_a_scale, wki_scale], dim=0) + + swizzle = ProjxWqakiWeightsConverter._swizzle_qmma_16x32 + + wqaki_raw = wqaki_raw.reshape(n_blocks, 16, 3, 2048).transpose(1, 2) + wqaki_raw = wqaki_raw.reshape(n_blocks, 3, 1, 16, 64, 32).transpose(3, 4) + wqaki_raw = swizzle(wqaki_raw).reshape(n_blocks, 3, 16 * 2048) + wqaki_scales = wqaki_scales.reshape(n_blocks, 3, 16).view(torch.float8_e4m3fn) + wqaki_padding = torch.zeros( + (n_blocks, 3, 128 - wqaki_scales.shape[-1]), + dtype=torch.float8_e4m3fn, + device=wq_a.device, + ) + return torch.cat([wqaki_raw, wqaki_scales, wqaki_padding], dim=-1).contiguous() diff --git a/tilert/models/deepseek_v3_2/ops/projx_wqkva.py b/tilert/models/deepseek_v3_2/ops/projx_wqkva.py new file mode 100644 index 0000000..0d36af8 --- /dev/null +++ b/tilert/models/deepseek_v3_2/ops/projx_wqkva.py @@ -0,0 +1,329 @@ +"""ProjXWqkva operation module.""" + +from enum import Enum + +import torch + +from tilert.models.base import TileRTModule +from tilert.models.common import weight_dequant +from tilert.models.deepseek_v3_2.model_args import ModelArgs +from tilert.models.deepseek_v3_2.ops.rmsnorm_projx_wqkva import ( + RMSNormProjQKVAFP8MMAWeightsConverter, + RMSNormProjQKVAFP16MMAWeightsConverter, +) +from tilert.utils import get_profile_log_tensor + +__all__ = [ + "ProjXWqkva", + "projx_wqkva", +] + + +def projx_wqkva( + x_quant: torch.Tensor, + x_scale: torch.Tensor, + wqkva: torch.Tensor, + cur_pos: torch.Tensor, + q_out: torch.Tensor, + kv_out: torch.Tensor, + pe_cache_out: torch.Tensor, + profile_logs: torch.Tensor, + compute_kernel_type: 
str = "fp8mma", + *, + model_arch: str, +) -> None: + """FP8 MMA projection for q, kv, pe_cache (DSV3.2).""" + torch.ops.tilert.projx_wqkva_op( + x_quant, + x_scale, + wqkva, + cur_pos, + q_out, + kv_out, + pe_cache_out, + model_arch, + compute_kernel_type, + profile_logs, + torch.empty(0, dtype=torch.int64, device=x_quant.device), + ) + + +class ProjXWqkvaRefWeightsAlias: + """Reference weight aliases for ProjXWqkva.""" + + x_rmsnorm_gamma = "input_layernorm.weight" + q_a_weights = "self_attn.q_a_proj.weight" + q_a_scales = "self_attn.q_a_proj.weight_scale_inv" + kv_a_weights = "self_attn.kv_a_proj_with_mqa.weight" + kv_a_scales = "self_attn.kv_a_proj_with_mqa.weight_scale_inv" + + @property + def ref_tensor_alias(self) -> list[str]: + return [ + self.x_rmsnorm_gamma, + self.q_a_weights, + self.q_a_scales, + self.kv_a_weights, + self.kv_a_scales, + ] + + def __call__(self) -> list[str]: + return self.ref_tensor_alias + + +class ProjXWqkvaTilertWeightsAlias: + """Tilert weight aliases for ProjXWqkva.""" + + q_a_weights = "q_a_weights" + q_a_scales = "q_a_scales" + kv_a_weights = "kv_a_weights" + kv_a_scales = "kv_a_scales" + w_pe_weights = "w_pe_weights" + w_pe_scales = "w_pe_scales" + + @property + def tilert_tensor_alias(self) -> list[str]: + return [ + self.q_a_weights, + self.q_a_scales, + self.kv_a_weights, + self.kv_a_scales, + self.w_pe_weights, + self.w_pe_scales, + ] + + def __call__(self) -> list[str]: + return self.tilert_tensor_alias + + +class ProjXWqkvaAlgorithm(Enum): + """ProjXWqkva algorithm.""" + + FP8MMA = "fp8mma" + FP16MMA = "fp16mma" + + +class ProjXWqkva(TileRTModule): + """FP8 MMA projection module for q, kv, pe_cache.""" + + _SUPPORTED_ALGORITHMS = { + "deepseek_v3_2": [ProjXWqkvaAlgorithm.FP8MMA], + } + + def __init__( + self, + model_args: ModelArgs, + num_devices: int, + device_id: int, + ref_weights_alias: ProjXWqkvaRefWeightsAlias | None = None, + ): + super().__init__( + self.__class__.__name__, + model_args=model_args, + 
num_devices=num_devices, + device_id=device_id, + ) + + self.tilert_weights_alias = ProjXWqkvaTilertWeightsAlias() + self.ref_weights_alias = ( + ref_weights_alias if ref_weights_alias is not None else ProjXWqkvaRefWeightsAlias() + ) + + self.dim = self.model_args.dim + self.q_lora_rank = self.model_args.q_lora_rank + self.kv_lora_rank = self.model_args.kv_lora_rank + self.qk_rope_head_dim = self.model_args.qk_rope_head_dim + self.block_size = self.model_args.block_size + self.eps = self.model_args.eps + + self.ref_wq_a: torch.Tensor | None = None + self.ref_wkv_a: torch.Tensor | None = None + self.ref_w_pe: torch.Tensor | None = None + + self.tilert_wqkva: torch.Tensor | None = None + + self.q_out: torch.Tensor | None = None + self.kv_out: torch.Tensor | None = None + self.pe_cache_out: torch.Tensor | None = None + self.cur_pos: torch.Tensor | None = None + self.profile_logs: torch.Tensor | None = None + self.is_init = False + + self.compute_kernel_type = "fp8mma" + + def set_algorithm(self, algorithm: Enum) -> None: + super().set_algorithm(algorithm) + if algorithm == ProjXWqkvaAlgorithm.FP16MMA: + self.compute_kernel_type = "fp16mma" + else: + self.compute_kernel_type = "fp8mma" + + def device_sharding(self, weights_map: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]: + """Repeat weights for device sharding.""" + q_a_proj_weight = weights_map[self.ref_weights_alias.q_a_weights][None, ...].repeat( + self.num_devices, 1, 1 + ) + q_a_proj_weight_scale = weights_map[self.ref_weights_alias.q_a_scales][None, ...].repeat( + self.num_devices, 1, 1 + ) + kv_a_mqa = weights_map[self.ref_weights_alias.kv_a_weights] + kv_a_proj_weight = kv_a_mqa[: self.kv_lora_rank, :][None, ...].repeat( + self.num_devices, 1, 1 + ) + w_pe_weight = kv_a_mqa[self.kv_lora_rank :, :][None, ...].repeat(self.num_devices, 1, 1) + kv_a_mqa_scale = weights_map[self.ref_weights_alias.kv_a_scales] + kv_scale_rows = (self.kv_lora_rank + self.block_size - 1) // self.block_size + 
kv_a_proj_weight_scale = kv_a_mqa_scale[:kv_scale_rows, :][None, ...].repeat( + self.num_devices, 1, 1 + ) + w_pe_weight_scale = kv_a_mqa_scale[kv_scale_rows:, :][None, ...].repeat( + self.num_devices, 1, 1 + ) + return { + self.tilert_weights_alias.q_a_weights: q_a_proj_weight, + self.tilert_weights_alias.q_a_scales: q_a_proj_weight_scale, + self.tilert_weights_alias.kv_a_weights: kv_a_proj_weight, + self.tilert_weights_alias.kv_a_scales: kv_a_proj_weight_scale, + self.tilert_weights_alias.w_pe_weights: w_pe_weight, + self.tilert_weights_alias.w_pe_scales: w_pe_weight_scale, + } + + def init_reference_weights(self, state_dict: dict[str, torch.Tensor]) -> None: + aliases = self.ref_weights_alias() + self.ref_wq_a = weight_dequant(state_dict[aliases[1]], state_dict[aliases[2]]) + kv_a_mqa = weight_dequant(state_dict[aliases[3]], state_dict[aliases[4]]) + self.ref_wkv_a = kv_a_mqa[: self.kv_lora_rank, :] + self.ref_w_pe = kv_a_mqa[self.kv_lora_rank :, :] + + assert self.ref_wq_a.shape == (self.q_lora_rank, self.dim) + assert self.ref_wkv_a.shape == (self.kv_lora_rank, self.dim) + assert self.ref_w_pe.shape == (self.qk_rope_head_dim, self.dim) + + def init_tilert_weights(self, state_dict: dict[str, torch.Tensor]) -> None: + tilert_aliases = self.tilert_weights_alias() + wq_a = state_dict[tilert_aliases[0]] + wq_a_scale = state_dict[tilert_aliases[1]] + wkv_a = state_dict[tilert_aliases[2]] + wkv_a_scale = state_dict[tilert_aliases[3]] + w_pe = state_dict[tilert_aliases[4]] + w_pe_scale = state_dict[tilert_aliases[5]] + dummy_gamma = torch.zeros(self.dim, dtype=torch.float32, device=wq_a.device) + + if self.algorithm == ProjXWqkvaAlgorithm.FP16MMA: + self.tilert_wqkva, _ = RMSNormProjQKVAFP16MMAWeightsConverter.convert_to_fp16_mma_gemv( + wq_a, + wq_a_scale, + wkv_a, + wkv_a_scale, + w_pe, + w_pe_scale, + dummy_gamma, + hidden_dim=self.dim, + q_lora_rank=self.q_lora_rank, + ) + else: + self.tilert_wqkva, _ = 
RMSNormProjQKVAFP8MMAWeightsConverter.convert_to_fp8_mma_gemv( + wq_a, + wq_a_scale, + wkv_a, + wkv_a_scale, + w_pe, + w_pe_scale, + dummy_gamma, + hidden_dim=self.dim, + q_lora_rank=self.q_lora_rank, + ) + + def init_tilert_vars(self, batch_size: int, seq_len: int, max_len: int = 128) -> None: + self.q_out = torch.zeros((batch_size, seq_len, self.q_lora_rank), dtype=torch.bfloat16) + self.kv_out = torch.zeros((batch_size, seq_len, self.kv_lora_rank), dtype=torch.bfloat16) + self.pe_cache_out = torch.zeros( + (batch_size, max_len, self.qk_rope_head_dim), dtype=torch.bfloat16 + ) + self.cur_pos = torch.zeros((1,), dtype=torch.int32) + self.profile_logs = get_profile_log_tensor() + self.is_init = True + + def init_random_weights(self) -> None: + bs = self.block_size + dim_scale_dim = self.dim // bs + q_scale_dim = (self.q_lora_rank + bs - 1) // bs + kv_mqa_rows = self.kv_lora_rank + self.qk_rope_head_dim + kv_mqa_scale_dim = (kv_mqa_rows + bs - 1) // bs + scale_dtype = torch.bfloat16 + + tensor_list = [ + torch.randn(self.dim, dtype=torch.float32), + torch.randn(self.q_lora_rank, self.dim, dtype=torch.bfloat16).to(torch.float8_e4m3fn), + torch.randn(q_scale_dim, dim_scale_dim, dtype=scale_dtype), + torch.randn(kv_mqa_rows, self.dim, dtype=torch.bfloat16).to(torch.float8_e4m3fn), + torch.randn(kv_mqa_scale_dim, dim_scale_dim, dtype=scale_dtype), + ] + ref_state_dict = dict(zip(self.ref_weights_alias(), tensor_list)) + self.init_reference_weights(ref_state_dict) + self.init_tilert_weights( + {_k: _v[self.device_id] for _k, _v in self.device_sharding(ref_state_dict).items()} + ) + + def golden_forward( + self, + x_quant: torch.Tensor, + x_scale: torch.Tensor, + cur_pos: int = 0, # noqa: U100 + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Pure PyTorch reference: dequant FP8 -> matmul -> q, kv, pe.""" + assert self.ref_wq_a is not None + assert self.ref_wkv_a is not None + assert self.ref_w_pe is not None + + if self.algorithm == ProjXWqkvaAlgorithm.FP16MMA: 
+ x_float = x_quant.float() + else: + x_fp8 = x_quant.to(torch.float32) + scale_expanded = x_scale.unsqueeze(-1).repeat(1, 1, 1, self.block_size) + scale_expanded = scale_expanded.reshape(x_quant.shape) + x_float = x_fp8 * scale_expanded + + q_out = torch.matmul(x_float, self.ref_wq_a.transpose(0, 1).float()) + kv_out = torch.matmul(x_float, self.ref_wkv_a.transpose(0, 1).float()) + pe_out = torch.matmul(x_float, self.ref_w_pe.transpose(0, 1).float()) + return ( + q_out.to(torch.bfloat16), + kv_out.to(torch.bfloat16), + pe_out.to(torch.bfloat16), + ) + + def tilert_forward( + self, + x_quant: torch.Tensor, + x_scale: torch.Tensor, + cur_pos: int = 0, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Run FP8 QMMA GEMV via TileRT CUDA kernel.""" + assert self.cur_pos is not None + assert self.pe_cache_out is not None + self.cur_pos.fill_(cur_pos) + projx_wqkva( + x_quant, + x_scale, + self.tilert_wqkva, + self.cur_pos, + self.q_out, + self.kv_out, + self.pe_cache_out, + self.profile_logs, + self.compute_kernel_type, + model_arch=self.model_args.arch_name, + ) + + seq_len = x_quant.size(-2) + pe_at_pos = self.pe_cache_out[:, cur_pos : cur_pos + seq_len, :] + return self.q_out, self.kv_out, pe_at_pos + + def __call__( + self, + x_quant: torch.Tensor, + x_scale: torch.Tensor, + cur_pos: int = 0, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + return self.golden_forward(x_quant, x_scale, cur_pos) diff --git a/python/models/deepseek_v3_2/ops/qkv_rope.py b/tilert/models/deepseek_v3_2/ops/qkv_rope.py similarity index 77% rename from python/models/deepseek_v3_2/ops/qkv_rope.py rename to tilert/models/deepseek_v3_2/ops/qkv_rope.py index 7a9a55a..25203a4 100644 --- a/python/models/deepseek_v3_2/ops/qkv_rope.py +++ b/tilert/models/deepseek_v3_2/ops/qkv_rope.py @@ -1,17 +1,13 @@ -"""QKV Rope operation module. - -Unified for deepseek_v3_2 (n_local_heads=16) and glm_5 (n_local_heads=8). -Dispatches by q_pe.shape[2]: 16 -> qkv_rope_op, 8 -> qkv_rope_glm5_op. 
-""" +"""QKV Rope operation module.""" from dataclasses import dataclass +from enum import Enum import torch from tilert.models.base import TileRTModule from tilert.models.deepseek_v3_2.model_args import ModelArgs from tilert.models.utils import apply_rotary_emb -from tilert.profiler.utils import parse_profile_log_tensor from tilert.utils import get_profile_log_tensor __all__ = [ @@ -28,34 +24,30 @@ def qkv_rope( rope_freqs: torch.Tensor, cur_pos: torch.Tensor, profile_logs: torch.Tensor, + model_arch: str, + compute_kernel_type: str = "general", ) -> None: """ Perform QKV Rope operation. - Unified for deepseek_v3_2 (16 heads) and glm_5 (8 heads). Dispatches by - pe_cache (q_pe) shape[2]: 16 -> qkv_rope_op, 8 -> qkv_rope_glm5_op. - Args: pe_cache: Q PE tensor (bsz, seq, n_local_heads, qk_rope_head_dim). kv_cache: K PE cache (bsz, seq, qk_rope_head_dim). rope_freqs: Rope frequencies tensor. cur_pos: Current position tensor. profile_logs: Profile logs tensor. + model_arch: Model architecture string. + compute_kernel_type: Compute kernel type string. """ - n_local_heads = pe_cache.shape[2] - qk_rope_head_dim = pe_cache.shape[3] - if qk_rope_head_dim != 64: - raise ValueError(f"Unsupported qk_rope_head_dim: {qk_rope_head_dim}") - - if n_local_heads == 16: - torch.ops.tilert.qkv_rope_op(pe_cache, kv_cache, rope_freqs, cur_pos, profile_logs) - elif n_local_heads == 8: - torch.ops.tilert.qkv_rope_glm5_op(pe_cache, kv_cache, rope_freqs, cur_pos, profile_logs) - else: - raise ValueError( - f"Unsupported n_local_heads: {n_local_heads}. " - "QKVRoPE supports n_local_heads=16 (deepseek_v3_2) or 8 (glm_5)." - ) + torch.ops.tilert.qkv_rope_op( + pe_cache, + kv_cache, + rope_freqs, + cur_pos, + model_arch, + compute_kernel_type, + profile_logs, + ) @dataclass @@ -82,9 +74,20 @@ def __call__(self) -> list[str]: return self.tilert_tensor_alias +class QKVRoPEAlgorithm(Enum): + """QKVRoPE algorithm.""" + + GENERAL = "general" + + class QKVRoPE(TileRTModule): """QKV RoPE module. 
Unified for deepseek_v3_2 and glm_5.""" + _SUPPORTED_ALGORITHMS = { + "deepseek_v3_2": [QKVRoPEAlgorithm.GENERAL], + "glm_5": [QKVRoPEAlgorithm.GENERAL], + } + def __init__( self, model_args: ModelArgs, @@ -165,12 +168,13 @@ def tilert_forward( cur_pos = torch.tensor([start_pos], dtype=torch.int32) qkv_rope( - q_pe_rope, pe_cache[:bsz, start_pos:end_pos], rope_freqs, cur_pos, self.profile_logs + q_pe_rope, + pe_cache[:bsz, start_pos:end_pos], + rope_freqs, + cur_pos, + self.profile_logs, + model_arch=self.model_args.arch_name, ) - if self.flag_enable_profiling_log: - parse_profile_log_tensor( - self.profile_logs, self.get_profile_log_path(), [(self.op_name, 0.0)] - ) return q_pe_rope diff --git a/tilert/models/deepseek_v3_2/ops/receive_selected_token_ids.py b/tilert/models/deepseek_v3_2/ops/receive_selected_token_ids.py new file mode 100644 index 0000000..508d13e --- /dev/null +++ b/tilert/models/deepseek_v3_2/ops/receive_selected_token_ids.py @@ -0,0 +1,35 @@ +"""ReceiveSelectedTokenIds — receive idx_selects from GPU 0.""" + +import torch + +__all__ = [ + "receive_selected_token_ids", +] + + +def receive_selected_token_ids( + ll_buf: torch.Tensor, + dst: torch.Tensor, + expected_flag: int, + profile_logs: torch.Tensor, + model_arch: str, + compute_kernel_type: str = "bf16", +) -> None: + """Receive idx_selects from GPU 0. + + Args: + ll_buf: Receive buffer on this GPU (written by GPU 0). + dst: Destination idx_selects tensor [1, S, 2048] int32. + expected_flag: Expected synchronization flag value. + profile_logs: Profile logs tensor. + model_arch: Model architecture ("deepseek_v3_2" or "glm_5"). + compute_kernel_type: Compute kernel type ("bf16"). 
+ """ + torch.ops.tilert.receive_selected_token_ids_op( + ll_buf, + dst, + expected_flag, + model_arch, + compute_kernel_type, + profile_logs, + ) diff --git a/python/models/deepseek_v3_2/ops/rmsnorm_expert_proj.py b/tilert/models/deepseek_v3_2/ops/rmsnorm_expert_proj.py similarity index 94% rename from python/models/deepseek_v3_2/ops/rmsnorm_expert_proj.py rename to tilert/models/deepseek_v3_2/ops/rmsnorm_expert_proj.py index ce867a7..a12441a 100644 --- a/python/models/deepseek_v3_2/ops/rmsnorm_expert_proj.py +++ b/tilert/models/deepseek_v3_2/ops/rmsnorm_expert_proj.py @@ -1,6 +1,7 @@ """RMSNormExpertProj operation module.""" from dataclasses import dataclass +from enum import Enum import torch from torch import nn @@ -8,7 +9,6 @@ from tilert.models.base import TileRTModule from tilert.models.common import RMSNorm, init_func, linear from tilert.models.deepseek_v3_2.model_args import ModelArgs -from tilert.profiler.utils import parse_profile_log_tensor from tilert.utils import get_profile_log_tensor __all__ = [ @@ -40,9 +40,20 @@ def __call__(self) -> list[str]: return [self.unproj_o_gamma, self.exp_proj_weights] +class RMSNormExpertProjAlgorithm(Enum): + """RMSNormExpertProj algorithm.""" + + GENERAL = "general" + + class RMSNormExpertProj(TileRTModule): """RMS Norm followed by expert projection.""" + _SUPPORTED_ALGORITHMS = { + "deepseek_v3_2": [RMSNormExpertProjAlgorithm.GENERAL], + "glm_5": [RMSNormExpertProjAlgorithm.GENERAL], + } + def __init__( self, model_args: ModelArgs, @@ -151,12 +162,10 @@ def tilert_forward(self, x_in: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor self.tilert_proj_weight, scores_out, hidden_out, + self.model_args.arch_name, + "bf16", self.profile_logs, ) - if self.flag_enable_profiling_log: - parse_profile_log_tensor( - self.profile_logs, self.get_profile_log_path(), [(self.op_name, 0.0)] - ) return hidden_out, scores_out def __call__(self, x_in: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: diff --git 
a/tilert/models/deepseek_v3_2/ops/rmsnorm_head_proj.py b/tilert/models/deepseek_v3_2/ops/rmsnorm_head_proj.py new file mode 100644 index 0000000..413a2e7 --- /dev/null +++ b/tilert/models/deepseek_v3_2/ops/rmsnorm_head_proj.py @@ -0,0 +1,296 @@ +"""RMSNormHeadProj operation module.""" + +from dataclasses import dataclass +from enum import Enum + +import torch + +from tilert.models.base import TileRTModule, TilertWeightsConverter +from tilert.models.deepseek_v3_2.model_args import ModelArgs +from tilert.utils import get_profile_log_tensor + +__all__ = [ + "rmsnorm_head_proj", + "RMSNormHeadProj", + "RMSNormHeadProjTilertWeightsAlias", +] + + +def rmsnorm_head_proj( + hidden_in: torch.Tensor, + gamma_in: torch.Tensor, + weight_in: torch.Tensor, + hidden_rmsnorm_out: torch.Tensor, + logits_out: torch.Tensor, + profile_logs: torch.Tensor, + model_arch: str, + compute_kernel_type: str = "general", +) -> None: + """RMS Norm Head Projection operation.""" + torch.ops.tilert.rmsnorm_head_proj_op( + hidden_in, + gamma_in, + weight_in, + hidden_rmsnorm_out, + logits_out, + model_arch, + compute_kernel_type, + profile_logs, + ) + + +class RMSNormHeadProjAlgorithm(Enum): + """RMSNormHeadProj algorithm""" + + GENERAL = "general" + + +class RMSNormHeadProjWeightsConverter(TilertWeightsConverter): + """RMSNormHeadProj weights converter""" + + @staticmethod + def tilert_to_tilert_native_bf16_warp_gemv( + tilert_weight_in: torch.Tensor, + ) -> torch.Tensor: + """Convert TILERT weights to TILERT native bf16 warp gemv weights.""" + weights = tilert_weight_in.reshape(1010, 16, 7, 1024) + weights = weights.transpose(1, 2).reshape(7070, 16, 1024) + return weights.contiguous() + + def convert_to_general( + self, weights_list: list[torch.Tensor] + ) -> tuple[torch.Tensor, torch.Tensor]: + """ + Convert the weights to general format. + + Args: + weights_list: List of weights. + + Returns: + Tuple of weights. 
+ """ + args = self.model_args + assert args.arch_name == "deepseek_v3_2" or args.arch_name == "glm_5" + + with torch.inference_mode(): + rmsnorm_gamma, mat_in = weights_list + logits_dim = mat_in.shape[-2] + dim = mat_in.shape[-1] + num_steps = dim // 1024 + assert dim % 1024 == 0 + weights = mat_in.reshape(logits_dim // 16, 16, num_steps, 1024) + weights = weights.transpose(1, 2).reshape(logits_dim // 16 * num_steps, 16, 1024) + return rmsnorm_gamma.float(), weights + + +@dataclass +class RMSNormHeadProjTilertWeightsAlias: + """TileRT weights alias for RMSNormHeadProj.""" + + model_norm_weight = "model.norm.weight" + lm_head_weight = "lm_head.weight" + + @property + def tilert_tensor_alias(self) -> list[str]: + return [self.model_norm_weight, self.lm_head_weight] + + def __call__(self) -> list[str]: + return self.tilert_tensor_alias + + +class RMSNormHeadProj(TileRTModule): + """RMSNormHeadProj module""" + + _SUPPORTED_ALGORITHMS = { + "deepseek_v3_2": [RMSNormHeadProjAlgorithm.GENERAL], + "glm_5": [RMSNormHeadProjAlgorithm.GENERAL], + } + + def __init__( + self, + model_args: ModelArgs, + device_id: int, + num_devices: int, + algorithm: RMSNormHeadProjAlgorithm = RMSNormHeadProjAlgorithm.GENERAL, + ): + super().__init__( + self.__class__.__name__, + model_args=model_args, + device_id=device_id, + num_devices=num_devices, + ) + + self.arch_name = self.model_args.arch_name + self.dim = self.model_args.dim + self.logits_dim = self.model_args.vocab_size + self.algorithm = algorithm + self.eps = self.model_args.eps + + self.ref_rmsnorm_gamma: torch.Tensor | None = None + self.ref_head_proj: torch.Tensor | None = None + + self.tilert_rmsnorm_gamma: torch.Tensor | None = None + self.tilert_head_proj: torch.Tensor | None = None + + self.hidden_rmsnorm_out: torch.Tensor | None = None + self.hidden_out: torch.Tensor | None = None + + self.profile_logs: torch.Tensor | None = None + self.is_init = False + + self.tilert_weights_alias = RMSNormHeadProjTilertWeightsAlias() + + 
self.ref_tensor_alias: list[str] = [ + "model.norm.weight", + "lm_head.weight", + ] + + @property + def tilert_tensor_alias(self) -> list[str]: + return self.tilert_weights_alias() + + def get_weights_list(self) -> list[torch.Tensor]: + """ + Get the weights list. + + Returns: + List of weights. + """ + return [self.tilert_rmsnorm_gamma, self.tilert_head_proj] + + def device_sharding( + self, + weights_dict: dict[str, torch.Tensor], + ) -> tuple[torch.Tensor, torch.Tensor]: + """ + Device sharding. + + Args: + weights_dict: Dictionary of weights. + key_prefix: Key prefix. + Returns: + Tuple of weights. + """ + rmsnorm_gamma_key = "model.norm.weight" + head_proj_key = "lm_head.weight" + rmsnorm_gamma = weights_dict[rmsnorm_gamma_key][None, ...] + rmsnorm_gamma = rmsnorm_gamma.repeat(self.num_devices, 1) + head_proj = weights_dict[head_proj_key] + + head_proj = head_proj.reshape(self.num_devices, -1, self.dim) + return rmsnorm_gamma.contiguous(), head_proj.contiguous() + + def init_reference_weights(self, state_dict: dict[str, torch.Tensor]) -> None: + """ + Initialize the reference weights. + + Args: + state_dict: State dictionary. + device_id: Device ID. + """ + sharded_list = self.device_sharding(state_dict) + + gamma, head_proj = sharded_list[0][self.device_id], sharded_list[1][self.device_id] + self.ref_rmsnorm_gamma = gamma + self.ref_head_proj = head_proj + + def init_tilert_weights(self, state_dict: dict[str, torch.Tensor]) -> None: + """ + Initialize the tilert weights. + + Args: + state_dict: State dictionary. + """ + assert self.algorithm is not None + self.tilert_rmsnorm_gamma, self.tilert_head_proj = RMSNormHeadProjWeightsConverter( + self.model_args, self.num_devices + ).dispatch(self.algorithm, [state_dict[alias] for alias in self.tilert_weights_alias()]) + + def init_tilert_vars(self, batch_size: int, seq_len: int) -> None: + """ + Initialize the tilert variables. + + Args: + batch_size: Batch size. + seq_len: Sequence length. 
+ """ + self.hidden_rmsnorm_out = torch.zeros( + (batch_size, seq_len, self.dim), + dtype=torch.bfloat16, + device=f"cuda:{self.device_id}", + ) + self.hidden_out = torch.zeros( + (batch_size, seq_len, self.logits_dim // self.num_devices), + dtype=torch.float32, + device=f"cuda:{self.device_id}", + ) + self.profile_logs = get_profile_log_tensor(device=f"cuda:{self.device_id}") + self.is_init = True + + def init_random_weights(self, device_id: int | None = None) -> None: + """Initialize the random weights.""" + if device_id is None: + device_id = self.device_id + rmsnorm_gamma = torch.randn(self.dim, dtype=torch.float32, device=f"cuda:{device_id}") + head_proj = torch.randn( + self.logits_dim, self.dim, dtype=torch.bfloat16, device=f"cuda:{device_id}" + ) + + tensor_list = [ + rmsnorm_gamma, + head_proj, + ] + state_dict = dict(zip(self.ref_tensor_alias, tensor_list)) + + self.init_reference_weights(state_dict) + sharded_list = self.device_sharding(state_dict) + sharded_state_dict = { + alias: sharded_list[i][self.device_id] + for i, alias in enumerate(self.tilert_weights_alias()) + } + self.init_tilert_weights(sharded_state_dict) + + def golden_forward( + self, + hidden_in: torch.Tensor, + ) -> torch.Tensor: + """ + Forward pass for the down-project module. + + Args: + hidden_in: Input hidden. + + Returns: + Output tensor. 
+ """ + assert self.ref_rmsnorm_gamma is not None + assert self.ref_head_proj is not None + bsz = hidden_in.shape[0] + assert bsz == 1 + hidden_rmsnorm = torch.nn.functional.rms_norm( + hidden_in.float(), [hidden_in.size(-1)], self.ref_rmsnorm_gamma, self.eps + ) + return hidden_rmsnorm.float() @ self.ref_head_proj.T.float() + + def tilert_forward( + self, + hidden_in: torch.Tensor, + ) -> torch.Tensor: + assert self.hidden_out is not None + + rmsnorm_head_proj( + hidden_in, + self.tilert_rmsnorm_gamma, + self.tilert_head_proj, + self.hidden_rmsnorm_out, + self.hidden_out, + self.profile_logs, + model_arch=self.model_args.arch_name, + ) + return self.hidden_out + + def __call__( + self, + hidden_in: torch.Tensor, + ) -> torch.Tensor: + return self.golden_forward(hidden_in) diff --git a/python/models/deepseek_v3_2/ops/rmsnorm_kv.py b/tilert/models/deepseek_v3_2/ops/rmsnorm_kv.py similarity index 88% rename from python/models/deepseek_v3_2/ops/rmsnorm_kv.py rename to tilert/models/deepseek_v3_2/ops/rmsnorm_kv.py index d9c9af0..fcc3464 100644 --- a/python/models/deepseek_v3_2/ops/rmsnorm_kv.py +++ b/tilert/models/deepseek_v3_2/ops/rmsnorm_kv.py @@ -1,12 +1,12 @@ """RMSNormKV operation module.""" from dataclasses import dataclass +from enum import Enum import torch from tilert.models.base import TileRTModule from tilert.models.deepseek_v3_2.model_args import ModelArgs -from tilert.profiler.utils import parse_profile_log_tensor from tilert.utils import get_profile_log_tensor __all__ = [ @@ -23,6 +23,8 @@ def rmsnorm_kv( cur_pos: torch.Tensor, kv_cache: torch.Tensor, profile_logs: torch.Tensor, + model_arch: str, + compute_kernel_type: str = "general", ) -> None: """ Define the RMSNormKV operation. @@ -33,8 +35,12 @@ def rmsnorm_kv( cur_pos: Current position tensor. kv_cache: Output tensor. profile_logs: Profile logs tensor. + model_arch: Model architecture string. + compute_kernel_type: Compute kernel type string. 
""" - torch.ops.tilert.rmsnorm_kv_op(kv, gamma, cur_pos, kv_cache, profile_logs) + torch.ops.tilert.rmsnorm_kv_op( + kv, gamma, cur_pos, kv_cache, model_arch, compute_kernel_type, profile_logs + ) @dataclass @@ -65,9 +71,20 @@ def __call__(self) -> list[str]: return self.tilert_tensor_alias +class KVRMSNormAlgorithm(Enum): + """KVRMSNorm algorithm.""" + + GENERAL = "general" + + class KVRMSNorm(TileRTModule): """KVRMSNorm module: RMSNorm on KV tensor with in-place write to kv_cache.""" + _SUPPORTED_ALGORITHMS = { + "deepseek_v3_2": [KVRMSNormAlgorithm.GENERAL], + "glm_5": [KVRMSNormAlgorithm.GENERAL], + } + def __init__( self, model_args: ModelArgs, @@ -170,11 +187,14 @@ def tilert_forward( assert self.tilert_kv_norm_weight is not None assert self.profile_logs is not None cur_pos = torch.tensor([start_pos], dtype=torch.int32, device=kv.device) - rmsnorm_kv(kv, self.tilert_kv_norm_weight, cur_pos, kv_cache[:bsz], self.profile_logs) - if self.flag_enable_profiling_log: - parse_profile_log_tensor( - self.profile_logs, self.get_profile_log_path(), [(self.op_name, 0.0)] - ) + rmsnorm_kv( + kv, + self.tilert_kv_norm_weight, + cur_pos, + kv_cache[:bsz], + self.profile_logs, + model_arch=self.model_args.arch_name, + ) def __call__( self, kv: torch.Tensor, kv_cache: torch.Tensor, start_pos: int, bsz: int, seqlen: int diff --git a/tilert/models/deepseek_v3_2/ops/rmsnorm_projq_wqb.py b/tilert/models/deepseek_v3_2/ops/rmsnorm_projq_wqb.py new file mode 100644 index 0000000..496df47 --- /dev/null +++ b/tilert/models/deepseek_v3_2/ops/rmsnorm_projq_wqb.py @@ -0,0 +1,540 @@ +"""RmsnormProjqWqb operation module.""" + +import math +from dataclasses import dataclass +from enum import Enum + +import torch + +from tilert.models.base import TileRTModule, TilertWeightsConverter +from tilert.models.common import weight_dequant +from tilert.models.deepseek_v3_2.model_args import ModelArgs +from tilert.utils import get_profile_log_tensor + +__all__ = [ + "RmsnormProjqWqb", + 
"RmsnormProjqWqbAlgorithm", + "RmsnormProjqWqbWeightsConverter", +] + + +def rmsnorm_projq_wqb_op( + q: torch.Tensor, + wq_b: torch.Tensor, + wq_b_scales: torch.Tensor, + q_norm_weight: torch.Tensor, + q_nope: torch.Tensor, + q_pe: torch.Tensor, + profile_logs: torch.Tensor, + algorithm: str, + model_arch: str, +) -> None: + torch.ops.tilert.rmsnorm_proj_qb_op( + q, + wq_b, + wq_b_scales, + q_norm_weight, + q_nope, + q_pe, + model_arch, + algorithm, + profile_logs, + torch.empty(0, dtype=torch.int64, device=q.device), + ) + + +class RmsnormProjqWqbAlgorithm(Enum): + """RmsnormProjqWqb algorithm.""" + + FP16MMA = "fp16mma" + BF16MMA = "bf16mma" + + +class RmsnormProjqWqbWeightsConverter(TilertWeightsConverter): + """Weights converter for RmsnormProjqWqb. + + Supports configurations where n_heads is not evenly divisible by + num_devices; in that case n_local_heads is padded and padded head + weight rows are zero-filled. + """ + + kBf16NumCtas = 80 + kGemvPageSize = 8 + + def __init__(self, model_args: ModelArgs, num_devices: int): + super().__init__(model_args=model_args, num_devices=num_devices) + + self.proc_groups = 8 + self.repeat = 16 + + self.block_size = self.model_args.block_size + + self.qk_nope_head_dim = self.model_args.qk_nope_head_dim + self.qk_rope_head_dim = self.model_args.qk_rope_head_dim + self.qk_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim + + self.needs_padding = self.model_args.n_heads % num_devices != 0 + self.n_local_heads = self._compute_n_local_heads( + self.model_args.n_heads, num_devices, self.qk_head_dim + ) + + self.q_lora_dim = self.model_args.q_lora_rank + self.q_lora_qdim = self.q_lora_dim // self.block_size + + self.qk_dim = self.qk_head_dim * self.n_local_heads + self.qk_qdim = self.qk_dim // self.block_size + + assert self.qk_dim % (self.kBf16NumCtas * self.kGemvPageSize) == 0, ( + f"qk_dim ({self.qk_dim}) must be divisible by " + f"kBf16NumCtas * kGemvPageSize ({self.kBf16NumCtas * self.kGemvPageSize})" + ) + assert 
self.qk_dim % self.block_size == 0, ( + f"qk_dim ({self.qk_dim}) must be divisible by block_size ({self.block_size}) " + f"for scale alignment" + ) + + @classmethod + def _compute_n_local_heads(cls, n_total_heads: int, num_devices: int, qk_head_dim: int) -> int: + """Compute padded n_local_heads per device.""" + if n_total_heads % num_devices == 0: + return n_total_heads // num_devices + + base = math.ceil(n_total_heads / num_devices) + align_unit = cls.kBf16NumCtas * cls.kGemvPageSize + g = math.gcd(qk_head_dim, align_unit) + head_align = align_unit // g + return math.ceil(base / head_align) * head_align + + @staticmethod + def _redistribute_heads( + wq_b_full: torch.Tensor, + wq_b_scale_full: torch.Tensor, + n_total_heads: int, + n_local_heads: int, + num_devices: int, + qk_head_dim: int, + block_size: int, + ) -> tuple[list[torch.Tensor], list[torch.Tensor]]: + """Redistribute heads across devices with padding. + + Args: + wq_b_full: [n_total_heads * qk_head_dim, q_lora_dim] full weight. + wq_b_scale_full: [n_total_heads * qk_head_dim // block_size, q_lora_qdim] full scale. + n_total_heads: Total number of heads (e.g. 128). + n_local_heads: Target heads per GPU (padded, e.g. 20). + num_devices: Number of devices (e.g. 7). + qk_head_dim: Head dimension (e.g. 192). + block_size: Quantization block size (e.g. 128). + + Returns: + Lists of per-device (wq_b, wq_b_scale) with shape + [n_local_heads * qk_head_dim, q_lora_dim] and + [n_local_heads * qk_head_dim // block_size, q_lora_qdim]. 
+ """ + total_rows = n_total_heads * qk_head_dim + rows_per_dev = n_local_heads * qk_head_dim + scale_rows_per_dev = rows_per_dev // block_size + total_scale_rows = total_rows // block_size + + q_lora_dim = wq_b_full.shape[-1] + q_lora_qdim = wq_b_scale_full.shape[-1] + + assert rows_per_dev % block_size == 0, ( + f"n_local_heads * qk_head_dim ({rows_per_dev}) must be " + f"divisible by block_size ({block_size})" + ) + + wq_b_list = [] + scale_list = [] + for dev in range(num_devices): + start_row = dev * rows_per_dev + end_row = min(total_rows, start_row + rows_per_dev) + real_rows = max(0, end_row - start_row) + + dev_wqb = torch.zeros( + rows_per_dev, q_lora_dim, dtype=wq_b_full.dtype, device=wq_b_full.device + ) + if real_rows > 0: + dev_wqb[:real_rows] = wq_b_full[start_row:end_row] + + start_scale = dev * scale_rows_per_dev + end_scale = min(total_scale_rows, start_scale + scale_rows_per_dev) + real_scale_rows = max(0, end_scale - start_scale) + + dev_scale = torch.zeros( + scale_rows_per_dev, + q_lora_qdim, + dtype=wq_b_scale_full.dtype, + device=wq_b_scale_full.device, + ) + if real_scale_rows > 0: + dev_scale[:real_scale_rows] = wq_b_scale_full[start_scale:end_scale] + + wq_b_list.append(dev_wqb) + scale_list.append(dev_scale) + + return wq_b_list, scale_list + + @staticmethod + def _swizzle_mma_16x16(mat_in: torch.Tensor) -> torch.Tensor: + assert mat_in.shape[-2] == 16 and mat_in.shape[-1] == 16 + pre_shape = mat_in.shape[:-2] + mat_in = mat_in.reshape(*pre_shape, 2, 8, 2, 4, 2).transpose(-4, -3).transpose(-5, -4) + return mat_in.reshape(*pre_shape, 2 * 2, 8 * 4, 2).transpose(-3, -2) + + @staticmethod + def _swizzle_mma_16x16_for_pages( + mat_in: torch.Tensor, q_lora_dim: int, pages: int + ) -> torch.Tensor: + """Swizzle 16xK matrix for paged MMA layout, any K divisible by 16.""" + assert mat_in.shape[-2] == 16 and mat_in.shape[-1] == q_lora_dim + k_per_page = q_lora_dim // pages + n_k_tiles = k_per_page // 16 + pre_shape = mat_in.shape[:-2] + mat_in = 
def _common_to_tilert_fp16mma(
    self,
    wq_b: torch.Tensor,
    wq_b_scales_raw: torch.Tensor,
    rmsnorm_gamma: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Repack ``wq_b`` and its block scales into the paged per-CTA FP16 MMA buffer.

    The rows of every head are regrouped so that all "nope" rows come first
    and all "pe" rows follow. Each group is cut into CTAs of 32 rows; the
    weights are swizzled per 16-row tile and page, and every (CTA, page)
    record is completed with that page's float32 scales (reinterpreted as
    1-byte elements) followed by zero padding up to 128 scale bytes.

    Args:
        wq_b: Weight reshaped here to (n_local_heads, qk_head_dim, q_lora_dim).
        wq_b_scales_raw: Block scales reshaped here to (qk_qdim, q_lora_qdim).
        rmsnorm_gamma: RMSNorm weight.

    Returns:
        ``(packed_buffer, placeholder_scales, gamma)`` where the placeholder
        is a single bf16 zero and ``gamma`` is a detached float32 copy.
    """
    pages = 2
    rows_per_cta = 32

    heads = self.n_local_heads
    nope = self.qk_nope_head_dim
    k_dim = self.q_lora_dim
    k_qdim = self.q_lora_qdim

    nope_ctas = heads * nope // rows_per_cta
    pe_ctas = heads * self.qk_rope_head_dim // rows_per_cta
    num_ctas = nope_ctas + pe_ctas

    def split_by_kind(rows: torch.Tensor, width: int) -> tuple[torch.Tensor, torch.Tensor]:
        # View rows per head, then gather every head's nope rows and pe rows separately.
        per_head = rows.reshape(heads, self.qk_head_dim, width)
        return (
            per_head[:, :nope, :].reshape(-1, width),
            per_head[:, nope:, :].reshape(-1, width),
        )

    def page_scales(rows: torch.Tensor, ctas: int) -> torch.Tensor:
        # Keep the scales of the first row of each CTA, split per page.
        paged = rows.reshape(ctas, rows_per_cta, pages, k_qdim // pages)
        return paged.transpose(1, 2)[:, :, 0, :]

    def page_weights(rows: torch.Tensor, ctas: int) -> torch.Tensor:
        # Swizzle 16-row tiles, then bring the page axis in front of the row axis.
        tiles = rows.reshape(ctas, rows_per_cta // 16, 16, k_dim)
        tiles = RmsnormProjqWqbWeightsConverter._swizzle_mma_16x16_for_pages(
            tiles, k_dim, pages
        )
        return (
            tiles.reshape(ctas, rows_per_cta // 16, pages, 16, -1)
            .transpose(1, 2)
            .reshape(ctas, pages, rows_per_cta, -1)
        )

    # Expand block scales so that every weight row carries its own scale row.
    row_scales = (
        wq_b_scales_raw.to(torch.float32)
        .reshape(self.qk_qdim, 1, k_qdim)
        .repeat(1, self.block_size, 1)
        .reshape(self.qk_dim, k_qdim)
    )
    scale_nope, scale_pe = split_by_kind(row_scales, k_qdim)
    scales = torch.cat(
        [page_scales(scale_nope, nope_ctas), page_scales(scale_pe, pe_ctas)], dim=0
    )
    scales_fp8 = scales.contiguous().view(torch.float8_e4m3fn)

    weight_nope, weight_pe = split_by_kind(wq_b, k_dim)
    weights = torch.cat(
        [page_weights(weight_nope, nope_ctas), page_weights(weight_pe, pe_ctas)], dim=0
    )
    weights = weights.reshape(num_ctas, pages, -1)

    scale_padding = torch.zeros(
        num_ctas,
        pages,
        128 - scales_fp8.shape[-1],
        dtype=torch.float8_e4m3fn,
        device=wq_b.device,
    )
    tilert_wqb = torch.cat([weights, scales_fp8, scale_padding], dim=-1).contiguous()

    tilert_wqb_scales = torch.zeros(1, dtype=torch.bfloat16)
    tilert_gamma = rmsnorm_gamma.float().detach().clone()
    return tilert_wqb, tilert_wqb_scales, tilert_gamma
@dataclass
class RmsnormProjqWqbTilertWeightsAlias:
    """TileRT-side tensor names used by RmsnormProjqWqb.

    Calling an instance returns the names in the order
    ``[rmsnorm_gamma, wqb_weights, wqb_scales]``.
    """

    # The attributes are annotated so that @dataclass registers them as real
    # fields. Without annotations the decorator sees no fields at all: the
    # generated __init__ accepts no overrides, __repr__ hides the values and
    # every instance compares equal to every other one.
    rmsnorm_gamma: str = "q_rmsnorm_gamma"
    wqb_weights: str = "wqb_weights"
    wqb_scales: str = "wqb_scales"

    @property
    def tilert_tensor_alias(self) -> list[str]:
        """Return the alias names as a list: gamma, weights, scales."""
        return [
            self.rmsnorm_gamma,
            self.wqb_weights,
            self.wqb_scales,
        ]

    def __call__(self) -> list[str]:
        """Shorthand for ``tilert_tensor_alias``."""
        return self.tilert_tensor_alias
def device_sharding(self, weights_map: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
    """Build stacked per-device tensors from reference-keyed weights.

    The RMSNorm gamma is replicated ``num_devices`` times along a new leading
    axis. The ``wq_b`` weight and scale are split (with zero padding) by
    ``RmsnormProjqWqbWeightsConverter._redistribute_heads`` and stacked along
    a new leading axis.

    Args:
        weights_map: Mapping read with the keys of ``self.ref_weights_alias``.

    Returns:
        Mapping keyed by ``self.tilert_weights_alias`` names.
    """
    ref = self.ref_weights_alias

    gamma = weights_map[ref.rmsnorm_gamma].unsqueeze(0).repeat(self.num_devices, 1)

    per_device_wqb, per_device_scales = RmsnormProjqWqbWeightsConverter._redistribute_heads(
        weights_map[ref.wqb_weights],
        weights_map[ref.wqb_scales],
        n_total_heads=self.n_heads,
        n_local_heads=self.n_local_heads,
        num_devices=self.num_devices,
        qk_head_dim=self.qk_head_dim,
        block_size=self.block_size,
    )

    out = self.tilert_weights_alias
    return {
        out.rmsnorm_gamma: gamma,
        out.wqb_weights: torch.stack(per_device_wqb, dim=0),
        out.wqb_scales: torch.stack(per_device_scales, dim=0),
    }
def init_tilert_weights(self, state_dict: dict[str, torch.Tensor]) -> None:
    """Convert TileRT-keyed weights into the layout of the selected algorithm.

    Args:
        state_dict: Mapping read with the keys of ``self.tilert_weights_alias``
            (weights, scales and RMSNorm gamma).

    Raises:
        KeyError: if one of the three tensors is missing from ``state_dict``.
        AssertionError: if no algorithm has been selected.
    """
    names = self.tilert_weights_alias
    # The converter unpacks its input as (weights, scales, gamma).
    weights = [
        state_dict[key]
        for key in (names.wqb_weights, names.wqb_scales, names.rmsnorm_gamma)
    ]
    assert self.algorithm is not None, "Algorithm is not set"

    converter = RmsnormProjqWqbWeightsConverter(self.model_args, self.num_devices)
    converted = converter.dispatch(self.algorithm, weights)
    self.tilert_wq_b, self.tilert_wq_b_scales, self.tilert_q_norm_weight = converted
self.n_local_heads, self.qk_nope_head_dim, dtype=torch.bfloat16 + ) + self.q_pe = torch.zeros( + batch_size, seq_len, self.n_local_heads, self.qk_rope_head_dim, dtype=torch.bfloat16 + ) + self.profile_logs = get_profile_log_tensor() + self.is_var_init = True + + def golden_forward(self, q: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + """Reference forward: RMSNorm + linear projection (no iq).""" + assert self.ref_q_norm is not None + assert self.ref_wq_b is not None + + bsz, seqlen, _ = q.shape + if bsz != 1 or seqlen not in [1, 2, 4]: + raise ValueError(f"Invalid batch size or sequence length: bsz={bsz}, seqlen={seqlen}") + + qr = torch.nn.functional.rms_norm(q.float(), [q.size(-1)], self.ref_q_norm, self.eps).to( + q.dtype + ) + + q_out = torch.matmul(qr, self.ref_wq_b.T) + q_out = q_out.view(bsz, seqlen, self.n_local_heads, self.qk_head_dim) + q_nope, q_pe = torch.split(q_out, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1) + return q_nope, q_pe + + def tilert_forward(self, q: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + assert self.tilert_wq_b is not None + assert self.tilert_wq_b_scales is not None + assert self.tilert_q_norm_weight is not None + assert self.q_nope is not None + assert self.q_pe is not None + assert self.profile_logs is not None + + bsz, seqlen, _ = q.shape + if bsz != 1 or seqlen not in [1, 2, 4]: + raise ValueError(f"Invalid batch size or sequence length: bsz={bsz}, seqlen={seqlen}") + + assert self.algorithm is not None, "Algorithm is not set" + + rmsnorm_projq_wqb_op( + q, + self.tilert_wq_b, + self.tilert_wq_b_scales, + self.tilert_q_norm_weight, + self.q_nope, + self.q_pe, + self.profile_logs, + self.algorithm.value, + model_arch=self.model_args.arch_name, + ) + + return self.q_nope, self.q_pe diff --git a/tilert/models/deepseek_v3_2/ops/rmsnorm_projq_wqi.py b/tilert/models/deepseek_v3_2/ops/rmsnorm_projq_wqi.py new file mode 100644 index 0000000..b91faf6 --- /dev/null +++ 
def rmsnorm_projq_wqi_op(
    q: torch.Tensor,
    wqi: torch.Tensor,
    wqi_scale: torch.Tensor,
    rmsnorm_gamma: torch.Tensor,
    iq: torch.Tensor,
    profile_logs: torch.Tensor,
    algorithm: str,
    model_arch: str,
) -> None:
    """Forward the arguments to ``torch.ops.tilert.rmsnorm_proj_qi_op``.

    The wrapper returns nothing.
    NOTE(review): ``iq`` and ``profile_logs`` look like output buffers filled
    in place by the op; confirm against the op registration.
    """
    kernel = torch.ops.tilert.rmsnorm_proj_qi_op
    # The op takes its trailing arguments as (model_arch, algorithm,
    # profile_logs), which is not the order of this wrapper's parameters.
    kernel(q, wqi, wqi_scale, rmsnorm_gamma, iq, model_arch, algorithm, profile_logs)
2).transpose(-3, -2) + + @staticmethod + def _swizzle_mma_16x16_for_pages( + mat_in: torch.Tensor, q_lora_rank: int, pages: int + ) -> torch.Tensor: + """Swizzle 16xK matrix for paged MMA layout, any K divisible by 16.""" + assert mat_in.shape[-2] == 16 and mat_in.shape[-1] == q_lora_rank + pre_shape = mat_in.shape[:-2] + k_per_page = q_lora_rank // pages + n_k_tiles = k_per_page // 16 + mat_in = mat_in.reshape(*pre_shape, 16, pages, k_per_page).transpose(-3, -2) + mat_in = mat_in.reshape(*pre_shape, pages, 16, n_k_tiles, 16).transpose(-3, -2) + mat_in = RmsnormProjqWqiWeightsConverter._swizzle_mma_16x16(mat_in) + return mat_in.contiguous() + + def _common_to_tilert_fp16mma( + self, + wqi: torch.Tensor, + wqi_scales: torch.Tensor, + rmsnorm_gamma: torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Convert common weights to TileRT FP16 MMA layout (IQ only).""" + sms = 128 + k_per_page = 1024 if self.model_args.arch_name == "glm_5" else 512 + pages = self.q_lora_dim // k_per_page + iq_dim_per_sm = self.index_head_dim // sms + + wqi_scales_f32 = wqi_scales.to(torch.float32) + wqi_scales_f32 = ( + wqi_scales_f32.reshape(self.index_head_qdim, 1, self.q_lora_qdim) + .repeat(1, self.block_size, 1) + .reshape(self.index_head_dim, self.q_lora_qdim) + ) + wqi_scales_f32 = wqi_scales_f32.reshape( + sms, iq_dim_per_sm, pages, self.q_lora_qdim // pages + ).transpose(1, 2) + wqi_scales_f32 = wqi_scales_f32[:, :, 0, :] + wqi_full_scales = wqi_scales_f32.contiguous().view(torch.float8_e4m3fn) + + wqi_mat = wqi.reshape(sms, iq_dim_per_sm // 16, 16, self.q_lora_dim) + wqi_mat = RmsnormProjqWqiWeightsConverter._swizzle_mma_16x16_for_pages( + wqi_mat, self.q_lora_dim, pages + ) + wqi_mat = ( + wqi_mat.reshape(sms, iq_dim_per_sm // 16, pages, 16, -1) + .transpose(1, 2) + .reshape(sms, pages, iq_dim_per_sm, -1) + ) + wqi_mat = wqi_mat.reshape(sms, pages, -1) + + wqi_scales_padding = torch.zeros( + sms, + pages, + 128 - wqi_full_scales.shape[-1], + 
@dataclass
class RmsnormProjqWqiRefWeightsAlias:
    """Reference (HuggingFace) tensor names used by RmsnormProjqWqi.

    Calling an instance returns the names in the order
    ``[rmsnorm_gamma, wqi_weights, wqi_scales]``.
    """

    # The attributes are annotated so that @dataclass registers them as real
    # fields. Without annotations the decorator sees no fields at all: the
    # generated __init__ accepts no overrides, __repr__ hides the values and
    # every instance compares equal to every other one.
    rmsnorm_gamma: str = "self_attn.q_a_layernorm.weight"
    wqi_weights: str = "self_attn.indexer.wq_b.weight"
    wqi_scales: str = "self_attn.indexer.wq_b.weight_scale_inv"

    @property
    def ref_tensor_alias(self) -> list[str]:
        """Return the alias names as a list: gamma, weights, scales."""
        return [self.rmsnorm_gamma, self.wqi_weights, self.wqi_scales]

    def __call__(self) -> list[str]:
        """Shorthand for ``ref_tensor_alias``."""
        return self.ref_tensor_alias
def __init__(
    self,
    model_args: ModelArgs,
    device_id: int,
    num_devices: int,
):
    """Record the dimensions taken from ``model_args`` and reset all tensor slots.

    Args:
        model_args: Source of ``q_lora_rank``, ``index_n_heads``,
            ``index_head_dim``, ``block_size`` and ``eps``.
        device_id: Forwarded to the base class.
        num_devices: Forwarded to the base class.
    """
    super().__init__(
        type(self).__name__,
        model_args=model_args,
        device_id=device_id,
        num_devices=num_devices,
    )

    self.tilert_weights_alias = RmsnormProjqWqiTilertWeightsAlias()
    self.ref_weights_alias = RmsnormProjqWqiRefWeightsAlias()

    n_heads = model_args.index_n_heads
    per_head_dim = model_args.index_head_dim
    block = model_args.block_size

    self.q_lora_rank = model_args.q_lora_rank
    self.index_n_heads = n_heads
    # `head_dim` is the size of one index head; `index_head_dim` is the
    # combined width of all index heads.
    self.head_dim = per_head_dim
    self.index_head_dim = n_heads * per_head_dim

    self.block_size = block
    self.q_lora_qdim = self.q_lora_rank // block
    self.index_head_qdim = self.index_head_dim // block
    self.eps = model_args.eps

    # Reference weights, set by init_reference_weights.
    self.ref_q_norm: torch.Tensor | None = None
    self.ref_wqi: torch.Tensor | None = None

    # Converted weights, set by init_tilert_weights.
    self.tilert_wqi: torch.Tensor | None = None
    self.tilert_wqi_scales: torch.Tensor | None = None
    self.tilert_q_norm_weight: torch.Tensor | None = None

    # Buffers, set by init_tilert_vars.
    self.iq: torch.Tensor | None = None
    self.profile_logs: torch.Tensor | None = None
def init_reference_weights(self, state_dict: dict[str, torch.Tensor]) -> None:
    """Set the reference gamma and the dequantized W_qi weight.

    Args:
        state_dict: Mapping read with the keys of ``self.tilert_weights_alias``.

    NOTE(review): the keys used here are the TileRT aliases, whereas
    ``RmsnormProjqWqb.init_reference_weights`` reads the reference aliases;
    confirm which key namespace callers pass.
    """
    names = self.tilert_weights_alias
    self.ref_q_norm = state_dict[names.rmsnorm_gamma]
    dequantized = weight_dequant(
        state_dict[names.wqi_weights],
        state_dict[names.wqi_scales],
    )
    self.ref_wqi = dequantized.contiguous()
def golden_forward(self, q: torch.Tensor) -> torch.Tensor:
    """Reference path: RMSNorm followed by the W_qi projection, split into heads.

    Args:
        q: 3-D input with batch size 1 and sequence length 1, 2, 4 or 8.

    Returns:
        The projection rearranged to ``(b, s, h, d)`` with ``d = self.head_dim``.

    Raises:
        AssertionError: if the reference weights have not been initialized.
        ValueError: if the batch size or sequence length is unsupported.
    """
    assert self.ref_q_norm is not None
    assert self.ref_wqi is not None

    bsz, seqlen, _ = q.shape
    if not (bsz == 1 and seqlen in [1, 2, 4, 8]):
        raise ValueError(f"Invalid batch size or sequence length: bsz={bsz}, seqlen={seqlen}")

    # Normalize in float32, then return to the input dtype before the matmul.
    normed = torch.nn.functional.rms_norm(q.float(), [q.size(-1)], self.ref_q_norm, self.eps)
    normed = normed.to(q.dtype)

    projected = torch.matmul(normed, self.ref_wqi.T)
    return rearrange(projected, "b s (h d) -> b s h d", d=self.head_dim)
module.""" + +from enum import Enum + +import torch + +from tilert.models.base import TileRTModule, TilertWeightsConverter +from tilert.models.common import weight_dequant +from tilert.models.deepseek_v3_2.model_args import ModelArgs +from tilert.models.deepseek_v3_2.ops.projx_wis import projx_wis +from tilert.models.deepseek_v3_2.ops.projx_wqaki import ( + ProjxWqakiWeightsConverter, + projx_wqaki, +) +from tilert.models.deepseek_v3_2.ops.rmsnorm_quant import rmsnorm_quant +from tilert.utils import get_profile_log_tensor + +__all__ = [ + "RMSNormProjxWqakis", +] + + +class RMSNormProjxWqakisWeightsConverter(TilertWeightsConverter): + """Weight converter for RMSNormProjxWqakis (decoupled FP8 MMA).""" + + def __init__(self, model_args: ModelArgs, num_devices: int): + super().__init__(model_args, num_devices) + + def convert_to_decoupled( + self, weights: list[torch.Tensor] + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Convert weights to decoupled FP8 MMA format. + + Args: + weights: [gamma, wq_a, wq_a_scale, wki, wki_scale, wis, wis_scale] + + Returns: + (wqaki_packed, wis_bf16, gamma) + """ + arch_name = self.model_args.arch_name + x_rmsnorm_gamma, wq_a, wq_a_scale, wki, wki_scale, wis, _wis_scale = weights + + if arch_name == "deepseek_v3_2": + wqaki_packed = ProjxWqakiWeightsConverter.convert_dsv32( + wq_a, wq_a_scale, wki, wki_scale + ) + elif arch_name == "glm_5": + wqaki_packed = ProjxWqakiWeightsConverter.convert_glm5_68cta( + wq_a, wq_a_scale, wki, wki_scale + ) + else: + raise ValueError(f"Unsupported architecture: {arch_name}") + + wis_bf16 = wis.to(torch.bfloat16) + return wqaki_packed, wis_bf16, x_rmsnorm_gamma.float() + + +class RMSNormProjxWqakisRefWeightsAlias: + """Reference weight aliases for RMSNormProjxWqakis.""" + + x_rmsnorm_gamma = "input_layernorm.weight" + q_a_weights = "self_attn.q_a_proj.weight" + q_a_scales = "self_attn.q_a_proj.weight_scale_inv" + wk_weights = "self_attn.indexer.wk.weight" + wk_scales = 
"self_attn.indexer.wk.weight_scale_inv" + wis_weights = "self_attn.indexer.weights_proj.weight" + + @property + def ref_tensor_alias(self) -> list[str]: + return [ + self.x_rmsnorm_gamma, + self.q_a_weights, + self.q_a_scales, + self.wk_weights, + self.wk_scales, + self.wis_weights, + ] + + def __call__(self) -> list[str]: + return self.ref_tensor_alias + + +class RMSNormProjxWqakisTilertWeightsAlias: + """Tilert weight aliases for RMSNormProjxWqakis.""" + + x_rmsnorm_gamma = "x_rmsnorm_gamma" + q_a_weights = "q_a_weights" + q_a_scales = "q_a_scales" + wk_weights = "wk_weights" + wk_scales = "wk_scales" + wis_weights = "wis_weights" + wis_scales = "wis_scales" + + @property + def tilert_tensor_alias(self) -> list[str]: + return [ + self.x_rmsnorm_gamma, + self.q_a_weights, + self.q_a_scales, + self.wk_weights, + self.wk_scales, + self.wis_weights, + self.wis_scales, + ] + + def __call__(self) -> list[str]: + return self.tilert_tensor_alias + + +class RMSNormProjxWqakisAlgorithm(Enum): + """RMSNormProjxWqakis algorithm.""" + + FP8MMA = "fp8mma" + + +class RMSNormProjxWqakis(TileRTModule): + """Decoupled RMSNorm + GEMV(W_q_a, W_ki, W_is).""" + + _SUPPORTED_ALGORITHMS = { + "deepseek_v3_2": [RMSNormProjxWqakisAlgorithm.FP8MMA], + "glm_5": [RMSNormProjxWqakisAlgorithm.FP8MMA], + } + + def __init__( + self, + model_args: ModelArgs, + num_devices: int, + device_id: int, + ref_weights_alias: RMSNormProjxWqakisRefWeightsAlias | None = None, + ): + super().__init__( + self.__class__.__name__, + model_args=model_args, + num_devices=num_devices, + device_id=device_id, + ) + + self.tilert_weights_alias = RMSNormProjxWqakisTilertWeightsAlias() + self.ref_weights_alias = ( + ref_weights_alias + if ref_weights_alias is not None + else RMSNormProjxWqakisRefWeightsAlias() + ) + + self.arch_name = self.model_args.arch_name + self.dim = self.model_args.dim + self.q_lora_rank = self.model_args.q_lora_rank + self.idx_head_dim = self.model_args.index_head_dim + self.idx_score_dim = 
def device_sharding(self, weights_map: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
    """Replicate every weight ``num_devices`` times along a new leading axis.

    The gamma is additionally cast to float32. No W_is scale is read from
    ``weights_map``; an all-ones bf16 tensor of shape
    ``(num_devices, ceil(wis_rows / block_size), dim // block_size)`` is
    created instead.

    Args:
        weights_map: Mapping read with the keys of ``self.ref_weights_alias``.

    Returns:
        Mapping keyed by ``self.tilert_weights_alias`` names.
    """
    ref = self.ref_weights_alias
    out = self.tilert_weights_alias
    copies = self.num_devices

    def replicate(key: str) -> torch.Tensor:
        # Add a device axis in front of a 2-D tensor and repeat along it.
        return weights_map[key][None, ...].repeat(copies, 1, 1)

    sharded = {
        out.x_rmsnorm_gamma: weights_map[ref.x_rmsnorm_gamma][None, ...]
        .float()
        .repeat(copies, 1),
        out.q_a_weights: replicate(ref.q_a_weights),
        out.q_a_scales: replicate(ref.q_a_scales),
        out.wk_weights: replicate(ref.wk_weights),
        out.wk_scales: replicate(ref.wk_scales),
        out.wis_weights: replicate(ref.wis_weights),
    }

    wis_rows = weights_map[ref.wis_weights].shape[0]
    scale_rows = (wis_rows + self.block_size - 1) // self.block_size  # ceil division
    scale_cols = self.dim // self.block_size
    sharded[out.wis_scales] = torch.ones(copies, scale_rows, scale_cols, dtype=torch.bfloat16)
    return sharded
def init_tilert_vars(self, batch_size: int, seq_len: int) -> None:
    """Allocate the zero-filled buffers used by ``tilert_forward``.

    All buffers have leading shape ``(batch_size, seq_len)``:
    ``q_out`` (``q_lora_rank``, bf16), ``ki_out`` (``idx_head_dim``, bf16),
    ``idx_scores_out`` (``idx_score_dim``, bf16), ``x_rmsnorm_out`` (``dim``,
    bf16), ``x_rmsnorm_quant_out`` (``dim``, float8_e4m3fn) and
    ``x_rmsnorm_quant_scale_out`` (``dim // block_size``, float32).

    Args:
        batch_size: Leading buffer dimension.
        seq_len: Second buffer dimension.
    """

    def _buffer(width: int, dtype: torch.dtype) -> torch.Tensor:
        return torch.zeros((batch_size, seq_len, width), dtype=dtype)

    self.q_out = _buffer(self.q_lora_rank, torch.bfloat16)
    self.ki_out = _buffer(self.idx_head_dim, torch.bfloat16)
    self.idx_scores_out = _buffer(self.idx_score_dim, torch.bfloat16)
    self.x_rmsnorm_out = _buffer(self.dim, torch.bfloat16)
    self.x_rmsnorm_quant_out = _buffer(self.dim, torch.float8_e4m3fn)
    self.x_rmsnorm_quant_scale_out = _buffer(self.dim // self.block_size, torch.float32)
    self.profile_logs = get_profile_log_tensor()

    # `is_init` is this module's own flag and is kept for existing readers.
    self.is_init = True
    # The sibling modules RmsnormProjqWqb and RmsnormProjqWqi signal the same
    # event through `is_var_init`; set it as well so this module reports its
    # state the same way.
    # NOTE(review): presumably the base class reads `is_var_init`; confirm.
    self.is_var_init = True
_v[self.device_id] for _k, _v in self.device_sharding(ref_state_dict).items()} + ) + + def golden_forward( + self, + x: torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Pure PyTorch reference: RMSNorm -> q, ki, idx_scores.""" + assert self.ref_norm_gamma is not None + assert self.ref_wq_a is not None + assert self.ref_wki is not None + assert self.ref_wis is not None + + x_rmsnorm = torch.nn.functional.rms_norm( + x.float(), [x.size(-1)], self.ref_norm_gamma, self.eps + ) + q_out = torch.matmul(x_rmsnorm.float(), self.ref_wq_a.transpose(0, 1).float()) + ki_out = torch.matmul(x_rmsnorm.float(), self.ref_wki.transpose(0, 1).float()) + idx_scores_out = torch.matmul(x_rmsnorm.float(), self.ref_wis.transpose(0, 1).float()) + return ( + q_out.to(torch.bfloat16), + ki_out.to(torch.bfloat16), + idx_scores_out.to(torch.bfloat16), + ) + + def tilert_forward( + self, + x: torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Run decoupled RMSNorm + ProjXWqaki + ProjXWis via TileRT CUDA kernels.""" + rmsnorm_quant( + x.to(torch.bfloat16), + self.tilert_norm_gamma, + self.x_rmsnorm_out, + self.x_rmsnorm_quant_out, + self.x_rmsnorm_quant_scale_out, + self.profile_logs, + model_arch=self.model_args.arch_name, + ) + projx_wqaki( + self.x_rmsnorm_quant_out, + self.x_rmsnorm_quant_scale_out, + self.tilert_wqakis, + self.q_out, + self.ki_out, + self.profile_logs, + self.compute_kernel_type, + model_arch=self.model_args.arch_name, + ) + wis_compute_kernel_type = "bf16" + projx_wis( + self.x_rmsnorm_out, + self.tilert_wis, + self.idx_scores_out, + wis_compute_kernel_type, + self.profile_logs, + model_arch=self.model_args.arch_name, + ) + + return self.q_out, self.ki_out, self.idx_scores_out + + def __call__( + self, + x: torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + return self.golden_forward(x) diff --git a/tilert/models/deepseek_v3_2/ops/rmsnorm_projx_wqkva.py 
b/tilert/models/deepseek_v3_2/ops/rmsnorm_projx_wqkva.py new file mode 100644 index 0000000..8e58a24 --- /dev/null +++ b/tilert/models/deepseek_v3_2/ops/rmsnorm_projx_wqkva.py @@ -0,0 +1,516 @@ +"""RMSNormProjxWqkva operation module.""" + +from enum import Enum + +import torch + +from tilert.models.base import TileRTModule, TilertWeightsConverter +from tilert.models.common import weight_dequant +from tilert.models.deepseek_v3_2.model_args import ModelArgs +from tilert.utils import get_profile_log_tensor + +__all__ = [ + "RMSNormProjxWqkva", + "RMSNormProjxWqkvaAlgorithm", +] + + +class RMSNormProjQKVAFP8MMAWeightsConverter: + """Weight converter: pack FP8 weights for the FP8 MMA kernel.""" + + HIDDEN_DIM = 6144 + Q_LORA_RANK = 2048 + KV_LORA_RANK = 512 + QK_ROPE_HEAD_DIM = 64 + TOTAL_ROWS = Q_LORA_RANK + KV_LORA_RANK + QK_ROPE_HEAD_DIM + ROWS_PER_CTA = 32 + NUM_CTAS = TOTAL_ROWS // ROWS_PER_CTA + COLS_PER_PAGE = 1024 + NUM_PAGES = HIDDEN_DIM // COLS_PER_PAGE + SCALES_PER_PAGE = COLS_PER_PAGE // 128 + BLOCK_SIZE = 128 + + MAT_BYTES = ROWS_PER_CTA * COLS_PER_PAGE + SCALE_OFFSET = MAT_BYTES + PAGE_BYTES = ((MAT_BYTES + 128 + 127) // 128) * 128 + + @staticmethod + def _swizzle_mma_16x32(mat_in: torch.Tensor) -> torch.Tensor: + """Swizzle a [*, 16, 32] tile for the MMA kernel.""" + assert mat_in.shape[-2] == 16 and mat_in.shape[-1] == 32 + pre_shape = mat_in.shape[:-2] + mat_in = mat_in.reshape(*pre_shape, 2, 8, 2, 4, 4).transpose(-4, -3).transpose(-5, -4) + return mat_in.reshape(*pre_shape, 2 * 2, 8 * 4, 4).transpose(-3, -2) + + @staticmethod + def convert_to_fp8_mma_gemv( + wq_a: torch.Tensor, + wq_a_scale: torch.Tensor, + wkv_a: torch.Tensor, + wkv_a_scale: torch.Tensor, + w_pe: torch.Tensor, + w_pe_scale: torch.Tensor, + attn_norm_weight: torch.Tensor, + *, + hidden_dim: int = 6144, + q_lora_rank: int = 2048, + ) -> tuple[torch.Tensor, torch.Tensor]: + """Pack FP8 weights for the FP8 MMA kernel. + + Args: + hidden_dim: Model hidden dimension. 
+ q_lora_rank: Q projection rank. + """ + C = RMSNormProjQKVAFP8MMAWeightsConverter + block_size = C.BLOCK_SIZE + kv_lora_rank = C.KV_LORA_RANK + qk_rope_head_dim = C.QK_ROPE_HEAD_DIM + + expected = q_lora_rank * hidden_dim + assert wq_a.numel() == expected, f"wq_a numel {wq_a.numel()} != expected {expected}" + expected = kv_lora_rank * hidden_dim + assert wkv_a.numel() == expected, f"wkv_a numel {wkv_a.numel()} != expected {expected}" + expected = qk_rope_head_dim * hidden_dim + assert w_pe.numel() == expected, f"w_pe numel {w_pe.numel()} != expected {expected}" + + total_rows = q_lora_rank + kv_lora_rank + qk_rope_head_dim + num_ctas = total_rows // C.ROWS_PER_CTA + num_pages = hidden_dim // C.COLS_PER_PAGE + + wq_a_f = weight_dequant(wq_a.reshape(q_lora_rank, hidden_dim), wq_a_scale) + wkv_a_f = weight_dequant(wkv_a.reshape(kv_lora_rank, hidden_dim), wkv_a_scale) + w_pe_f = weight_dequant(w_pe.reshape(qk_rope_head_dim, hidden_dim), w_pe_scale) + w_float = torch.cat([wq_a_f, wkv_a_f, w_pe_f], dim=0) + + w_blocks = w_float.reshape(total_rows, hidden_dim // block_size, block_size) + col_max = w_blocks.abs().amax(dim=(0, 2)) + fp8_max = torch.finfo(torch.float8_e4m3fn).max + w_scales = (col_max / fp8_max).clamp(min=1e-12) + + scales_expanded = w_scales.repeat_interleave(block_size) + w_scaled = w_float / scales_expanded.unsqueeze(0) + w_fp8 = w_scaled.to(torch.float8_e4m3fn) + + assert C.MAT_BYTES == C.SCALE_OFFSET, "Layout mismatch: scales must follow mat" + assert block_size == C.COLS_PER_PAGE // C.SCALES_PER_PAGE, "Block size mismatch" + assert w_scales.numel() == num_pages * C.SCALES_PER_PAGE, "Scale count mismatch" + + w_bytes = w_fp8.view(torch.uint8) + num_tiles = C.COLS_PER_PAGE // 32 + + mat = w_bytes.reshape(num_ctas, C.ROWS_PER_CTA, num_pages, C.COLS_PER_PAGE) + mat = mat.transpose(1, 2) + + mat = mat.reshape(num_ctas, num_pages, 2, 16, num_tiles, 32) + mat = mat.transpose(3, 4) + mat = C._swizzle_mma_16x32(mat) + mat = mat.contiguous().reshape(num_ctas, 
num_pages, C.MAT_BYTES) + + scales_f32 = w_scales.reshape(num_pages, C.SCALES_PER_PAGE).to(torch.float32).contiguous() + scales_bytes = scales_f32.view(torch.uint8) + scales_bytes = scales_bytes.unsqueeze(0).expand(num_ctas, -1, -1) + + pad_size = C.PAGE_BYTES - C.MAT_BYTES - C.SCALES_PER_PAGE * 4 + padding = torch.zeros(num_ctas, num_pages, pad_size, dtype=torch.uint8, device=w_fp8.device) + + packed = torch.cat([mat, scales_bytes, padding], dim=-1) + packed = packed.contiguous().reshape(-1) + + return packed.view(torch.float8_e4m3fn), attn_norm_weight.clone() + + +class RMSNormProjQKVAFP16MMAWeightsConverter: + """Weight converter: pack FP16 weights for the FP16 MMA kernel.""" + + KV_LORA_RANK = 512 + QK_ROPE_HEAD_DIM = 64 + ROWS_PER_CTA = 32 + COLS_PER_PAGE = 512 + BLOCK_SIZE = 128 + + @staticmethod + def _swizzle_mma_16x16(mat_in: torch.Tensor) -> torch.Tensor: + """Swizzle a [*, 16, 16] tile for the MMA kernel.""" + assert mat_in.shape[-2] == 16 and mat_in.shape[-1] == 16 + pre_shape = mat_in.shape[:-2] + mat_in = mat_in.reshape(*pre_shape, 2, 8, 2, 4, 2).transpose(-4, -3).transpose(-5, -4) + return mat_in.reshape(*pre_shape, 2 * 2, 8 * 4, 2).transpose(-3, -2) + + @staticmethod + def convert_to_fp16_mma_gemv( + wq_a: torch.Tensor, + wq_a_scale: torch.Tensor, + wkv_a: torch.Tensor, + wkv_a_scale: torch.Tensor, + w_pe: torch.Tensor, + w_pe_scale: torch.Tensor, + attn_norm_weight: torch.Tensor, + *, + hidden_dim: int = 6144, + q_lora_rank: int = 2048, + ) -> tuple[torch.Tensor, torch.Tensor]: + """Pack weights into the FP16 MMA layout.""" + C = RMSNormProjQKVAFP16MMAWeightsConverter + kv_lora_rank = C.KV_LORA_RANK + qk_rope_head_dim = C.QK_ROPE_HEAD_DIM + cols_per_page = C.COLS_PER_PAGE + rows_per_cta = C.ROWS_PER_CTA + + total_rows = q_lora_rank + kv_lora_rank + qk_rope_head_dim + num_ctas = total_rows // rows_per_cta + num_pages = hidden_dim // cols_per_page + num_k_tiles = cols_per_page // 16 + + wq_a_f = weight_dequant(wq_a.reshape(q_lora_rank, hidden_dim), 
wq_a_scale) + wkv_a_f = weight_dequant(wkv_a.reshape(kv_lora_rank, hidden_dim), wkv_a_scale) + w_pe_f = weight_dequant(w_pe.reshape(qk_rope_head_dim, hidden_dim), w_pe_scale) + w_float = torch.cat([wq_a_f, wkv_a_f, w_pe_f], dim=0) + + w_fp16 = w_float.to(torch.float16) + + mat = w_fp16.reshape(num_ctas, rows_per_cta, num_pages, cols_per_page) + mat = mat.transpose(1, 2) + + mat = mat.reshape(num_ctas, num_pages, 2, 16, num_k_tiles, 16) + mat = mat.transpose(3, 4) + mat = C._swizzle_mma_16x16(mat) + mat = mat.contiguous() + + mat_bytes = mat.view(torch.uint8).reshape(num_ctas, num_pages, -1) + packed = mat_bytes.contiguous().reshape(-1) + + return packed.view(torch.float16), attn_norm_weight.clone() + + +class RMSNormProjxWqkvaAlgorithm(Enum): + """RMSNormProjxWqkva algorithm.""" + + DECOUPLED = "decoupled" + + +class RMSNormProjxWqkvaWeightsConverter(TilertWeightsConverter): + """Dispatch weight converter for RMSNormProjxWqkva.""" + + def __init__(self, model_args: ModelArgs, num_devices: int): + super().__init__(model_args, num_devices) + + def convert_to_fp8_mma_gemv( + self, weights: list[torch.Tensor] + ) -> tuple[torch.Tensor, torch.Tensor]: + """Convert tilert weights list to FP8 MMA kernel-ready format. + + Args: + weights: [gamma, wq_a, wq_a_scale, wkv_a, wkv_a_scale, w_pe, w_pe_scale] + """ + gamma, wq_a, wq_a_scale, wkv_a, wkv_a_scale, w_pe, w_pe_scale = weights + return RMSNormProjQKVAFP8MMAWeightsConverter.convert_to_fp8_mma_gemv( + wq_a, + wq_a_scale, + wkv_a, + wkv_a_scale, + w_pe, + w_pe_scale, + gamma, + hidden_dim=self.model_args.dim, + q_lora_rank=self.model_args.q_lora_rank, + ) + + def convert_to_fp16_mma_gemv( + self, weights: list[torch.Tensor] + ) -> tuple[torch.Tensor, torch.Tensor]: + """Convert tilert weights list to FP16 MMA kernel-ready format. 
+ + Args: + weights: [gamma, wq_a, wq_a_scale, wkv_a, wkv_a_scale, w_pe, w_pe_scale] + """ + gamma, wq_a, wq_a_scale, wkv_a, wkv_a_scale, w_pe, w_pe_scale = weights + return RMSNormProjQKVAFP16MMAWeightsConverter.convert_to_fp16_mma_gemv( + wq_a, + wq_a_scale, + wkv_a, + wkv_a_scale, + w_pe, + w_pe_scale, + gamma, + hidden_dim=self.model_args.dim, + q_lora_rank=self.model_args.q_lora_rank, + ) + + +class RMSNormProjxWqkvaRefWeightsAlias: + """Reference weight aliases for RMSNormProjxWqkva.""" + + x_rmsnorm_gamma = "input_layernorm.weight" + q_a_weights = "self_attn.q_a_proj.weight" + q_a_scales = "self_attn.q_a_proj.weight_scale_inv" + kv_a_weights = "self_attn.kv_a_proj_with_mqa.weight" + kv_a_scales = "self_attn.kv_a_proj_with_mqa.weight_scale_inv" + + @property + def ref_tensor_alias(self) -> list[str]: + return [ + self.x_rmsnorm_gamma, + self.q_a_weights, + self.q_a_scales, + self.kv_a_weights, + self.kv_a_scales, + ] + + def __call__(self) -> list[str]: + return self.ref_tensor_alias + + +class RMSNormProjxWqkvaTilertWeightsAlias: + """Tilert weight aliases for RMSNormProjxWqkva.""" + + x_rmsnorm_gamma = "x_rmsnorm_gamma" + q_a_weights = "q_a_weights" + q_a_scales = "q_a_scales" + kv_a_weights = "kv_a_weights" + kv_a_scales = "kv_a_scales" + w_pe_weights = "w_pe_weights" + w_pe_scales = "w_pe_scales" + + @property + def tilert_tensor_alias(self) -> list[str]: + return [ + self.x_rmsnorm_gamma, + self.q_a_weights, + self.q_a_scales, + self.kv_a_weights, + self.kv_a_scales, + self.w_pe_weights, + self.w_pe_scales, + ] + + def __call__(self) -> list[str]: + return self.tilert_tensor_alias + + +class RMSNormProjxWqkva(TileRTModule): + """Fused RMSNorm + GEMV(W_q_a, W_kv_a, W_pe).""" + + _SUPPORTED_ALGORITHMS = { + "deepseek_v3_2": [RMSNormProjxWqkvaAlgorithm.DECOUPLED], + "glm_5": [RMSNormProjxWqkvaAlgorithm.DECOUPLED], + } + + def __init__( + self, + model_args: ModelArgs, + num_devices: int, + device_id: int, + ref_weights_alias: 
RMSNormProjxWqkvaRefWeightsAlias | None = None, + ): + super().__init__( + self.__class__.__name__, + model_args=model_args, + num_devices=num_devices, + device_id=device_id, + ) + + self.tilert_weights_alias = RMSNormProjxWqkvaTilertWeightsAlias() + self.ref_weights_alias = ( + ref_weights_alias + if ref_weights_alias is not None + else RMSNormProjxWqkvaRefWeightsAlias() + ) + + self.dim = self.model_args.dim + self.q_lora_rank = self.model_args.q_lora_rank + self.kv_lora_rank = self.model_args.kv_lora_rank + self.qk_rope_head_dim = self.model_args.qk_rope_head_dim + self.block_size = self.model_args.block_size + self.eps = self.model_args.eps + self.algorithm = RMSNormProjxWqkvaAlgorithm.DECOUPLED + + self.ref_norm_gamma: torch.Tensor | None = None + self.ref_wq_a: torch.Tensor | None = None + self.ref_wkv_a: torch.Tensor | None = None + self.ref_w_pe: torch.Tensor | None = None + + self.tilert_norm_gamma: torch.Tensor | None = None + self.tilert_wqkva: torch.Tensor | None = None + self.tilert_wqkva_scales = torch.zeros((1, 1), dtype=torch.bfloat16) + + self.x_rmsnorm_out: torch.Tensor | None = None + self.x_rmsnorm_quant_out: torch.Tensor | None = None + self.x_rmsnorm_quant_scale_out: torch.Tensor | None = None + + self.q_out: torch.Tensor | None = None + self.kv_out: torch.Tensor | None = None + self.pe_cache_out: torch.Tensor | None = None + self.cur_pos: torch.Tensor | None = None + self.profile_logs: torch.Tensor | None = None + self.is_init = False + + self.tilert_tensor_alias: list[str] = [ + "x_rmsnorm_gamma", + "qkva_weights", + "qkva_scales", + ] + + def get_weights_list(self) -> list[torch.Tensor]: + return [self.tilert_norm_gamma, self.tilert_wqkva, self.tilert_wqkva_scales] + + def device_sharding(self, weights_map: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]: + """Repeat weights for device sharding.""" + input_layernorm_weight = ( + weights_map[self.ref_weights_alias.x_rmsnorm_gamma][None, ...] 
+ .float() + .repeat(self.num_devices, 1) + ) + q_a_proj_weight = weights_map[self.ref_weights_alias.q_a_weights][None, ...].repeat( + self.num_devices, 1, 1 + ) + q_a_proj_weight_scale = weights_map[self.ref_weights_alias.q_a_scales][None, ...].repeat( + self.num_devices, 1, 1 + ) + kv_a_mqa = weights_map[self.ref_weights_alias.kv_a_weights] + kv_a_proj_weight = kv_a_mqa[: self.kv_lora_rank, :][None, ...].repeat( + self.num_devices, 1, 1 + ) + w_pe_weight = kv_a_mqa[self.kv_lora_rank :, :][None, ...].repeat(self.num_devices, 1, 1) + kv_a_mqa_scale = weights_map[self.ref_weights_alias.kv_a_scales] + kv_scale_rows = (self.kv_lora_rank + self.block_size - 1) // self.block_size + kv_a_proj_weight_scale = kv_a_mqa_scale[:kv_scale_rows, :][None, ...].repeat( + self.num_devices, 1, 1 + ) + w_pe_weight_scale = kv_a_mqa_scale[kv_scale_rows:, :][None, ...].repeat( + self.num_devices, 1, 1 + ) + return { + self.tilert_weights_alias.x_rmsnorm_gamma: input_layernorm_weight, + self.tilert_weights_alias.q_a_weights: q_a_proj_weight, + self.tilert_weights_alias.q_a_scales: q_a_proj_weight_scale, + self.tilert_weights_alias.kv_a_weights: kv_a_proj_weight, + self.tilert_weights_alias.kv_a_scales: kv_a_proj_weight_scale, + self.tilert_weights_alias.w_pe_weights: w_pe_weight, + self.tilert_weights_alias.w_pe_scales: w_pe_weight_scale, + } + + def init_reference_weights(self, state_dict: dict[str, torch.Tensor]) -> None: + aliases = self.ref_weights_alias() + self.ref_norm_gamma = state_dict[aliases[0]] + self.ref_wq_a = weight_dequant(state_dict[aliases[1]], state_dict[aliases[2]]) + kv_a_mqa = weight_dequant(state_dict[aliases[3]], state_dict[aliases[4]]) + self.ref_wkv_a = kv_a_mqa[: self.kv_lora_rank, :] + self.ref_w_pe = kv_a_mqa[self.kv_lora_rank :, :] + + assert self.ref_norm_gamma.shape[-1] == self.dim + assert self.ref_wq_a.shape == (self.q_lora_rank, self.dim) + assert self.ref_wkv_a.shape == (self.kv_lora_rank, self.dim) + assert self.ref_w_pe.shape == 
(self.qk_rope_head_dim, self.dim) + + def init_tilert_weights(self, state_dict: dict[str, torch.Tensor]) -> None: + tilert_aliases = self.tilert_weights_alias() + weights_list = [state_dict[alias] for alias in tilert_aliases] + converter = RMSNormProjxWqkvaWeightsConverter(self.model_args, self.num_devices) + self.tilert_wqkva, self.tilert_norm_gamma = converter.convert_to_fp8_mma_gemv(weights_list) + self.tilert_wqkva_scales = torch.zeros((1,), dtype=torch.float32) + + def init_tilert_vars(self, batch_size: int, seq_len: int, max_len: int = 128) -> None: + self.q_out = torch.zeros((batch_size, seq_len, self.q_lora_rank), dtype=torch.bfloat16) + self.kv_out = torch.zeros((batch_size, seq_len, self.kv_lora_rank), dtype=torch.bfloat16) + self.pe_cache_out = torch.zeros( + (batch_size, max_len, self.qk_rope_head_dim), dtype=torch.bfloat16 + ) + self.cur_pos = torch.zeros((1,), dtype=torch.int32) + self.x_rmsnorm_out = torch.zeros((batch_size, seq_len, self.dim), dtype=torch.bfloat16) + self.x_rmsnorm_quant_out = torch.zeros( + (batch_size, seq_len, self.dim), dtype=torch.float8_e4m3fn + ) + self.x_rmsnorm_quant_scale_out = torch.zeros( + (batch_size, seq_len, self.dim // self.block_size), dtype=torch.float32 + ) + self.profile_logs = get_profile_log_tensor() + self.is_init = True + + def init_random_weights(self) -> None: + bs = self.block_size + dim_scale_dim = self.dim // bs + q_scale_dim = (self.q_lora_rank + bs - 1) // bs + kv_mqa_rows = self.kv_lora_rank + self.qk_rope_head_dim + kv_mqa_scale_dim = (kv_mqa_rows + bs - 1) // bs + scale_dtype = torch.bfloat16 + + tensor_list = [ + torch.randn(self.dim, dtype=torch.float32), + torch.randn(self.q_lora_rank, self.dim, dtype=torch.bfloat16).to(torch.float8_e4m3fn), + torch.randn(q_scale_dim, dim_scale_dim, dtype=scale_dtype), + torch.randn(kv_mqa_rows, self.dim, dtype=torch.bfloat16).to(torch.float8_e4m3fn), + torch.randn(kv_mqa_scale_dim, dim_scale_dim, dtype=scale_dtype), + ] + ref_state_dict = 
dict(zip(self.ref_weights_alias(), tensor_list)) + self.init_reference_weights(ref_state_dict) + self.init_tilert_weights( + {_k: _v[self.device_id] for _k, _v in self.device_sharding(ref_state_dict).items()} + ) + + def golden_forward( + self, + x: torch.Tensor, + cur_pos: int = 0, # noqa: U100 + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Pure PyTorch reference: RMSNorm -> q, kv, pe.""" + assert self.ref_norm_gamma is not None + assert self.ref_wq_a is not None + assert self.ref_wkv_a is not None + assert self.ref_w_pe is not None + + x_rmsnorm = torch.nn.functional.rms_norm( + x.float(), [x.size(-1)], self.ref_norm_gamma, self.eps + ) + q_out = torch.matmul(x_rmsnorm.float(), self.ref_wq_a.transpose(0, 1).float()) + kv_out = torch.matmul(x_rmsnorm.float(), self.ref_wkv_a.transpose(0, 1).float()) + pe_out = torch.matmul(x_rmsnorm.float(), self.ref_w_pe.transpose(0, 1).float()) + return ( + q_out.to(torch.bfloat16), + kv_out.to(torch.bfloat16), + pe_out.to(torch.bfloat16), + ) + + def tilert_forward( + self, + x: torch.Tensor, + cur_pos: int = 0, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Run RMSNorm + 3-way GEMV via TileRT CUDA kernel (DECOUPLED).""" + assert self.cur_pos is not None + assert self.pe_cache_out is not None + self.cur_pos.fill_(cur_pos) + + from tilert.models.deepseek_v3_2.ops.projx_wqkva import projx_wqkva as _projx_wqkva + from tilert.models.deepseek_v3_2.ops.rmsnorm_quant import rmsnorm_quant as _rmsnorm_quant + + _rmsnorm_quant( + x.to(torch.bfloat16), + self.tilert_norm_gamma, + self.x_rmsnorm_out, + self.x_rmsnorm_quant_out, + self.x_rmsnorm_quant_scale_out, + self.profile_logs, + model_arch=self.model_args.arch_name, + ) + _projx_wqkva( + self.x_rmsnorm_quant_out, + self.x_rmsnorm_quant_scale_out, + self.tilert_wqkva, + self.cur_pos, + self.q_out, + self.kv_out, + self.pe_cache_out, + self.profile_logs, + model_arch=self.model_args.arch_name, + ) + + seq_len = x.size(-2) + pe_at_pos = self.pe_cache_out[:, 
cur_pos : cur_pos + seq_len, :] + return self.q_out, self.kv_out, pe_at_pos + + def __call__( + self, + x: torch.Tensor, + cur_pos: int = 0, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + return self.golden_forward(x, cur_pos) diff --git a/python/models/deepseek_v3_2/ops/rmsnorm_quant.py b/tilert/models/deepseek_v3_2/ops/rmsnorm_quant.py similarity index 56% rename from python/models/deepseek_v3_2/ops/rmsnorm_quant.py rename to tilert/models/deepseek_v3_2/ops/rmsnorm_quant.py index 770db02..1d399c5 100644 --- a/python/models/deepseek_v3_2/ops/rmsnorm_quant.py +++ b/tilert/models/deepseek_v3_2/ops/rmsnorm_quant.py @@ -1,8 +1,4 @@ -"""RMSNormQuant operation module. - -Unified for deepseek_v3_2 (dim=7168) and glm_5 (dim=6144). -Dispatches by hidden_in.shape[-1]: 7168 -> rmsnorm_*_op, 6144 -> rmsnorm_*_glm5_op. -""" +"""RMSNormQuant operation module.""" from __future__ import annotations @@ -27,14 +23,13 @@ def rmsnorm_quant( quant_hidden_out: torch.Tensor | None = None, quant_hidden_scale_out: torch.Tensor | None = None, profile_logs: torch.Tensor | None = None, + compute_kernel_type: str = "general", + *, + model_arch: str, ) -> None: """ Rmsnorm with optional activation quantization. - Unified for deepseek_v3_2 (dim=7168) and glm_5 (dim=6144). Dispatches by - hidden_in.shape[-1]: 7168 -> rmsnorm_op / rmsnorm_quant_op, - 6144 -> rmsnorm_glm5_op / rmsnorm_quant_glm5_op. - Args: hidden_in: Input tensor (..., dim). gamma_in: RMSNorm gamma (dim,). @@ -43,31 +38,27 @@ def rmsnorm_quant( quant_hidden_scale_out: Optional quant scale (..., dim // block_size). If None, no quant. profile_logs: Optional profile logs tensor. """ - dim = hidden_in.shape[-1] - if dim == DIM_GLM_5: - glm5_flag = "_glm5" - elif dim == DIM_DEEPSEEK_V3_2: - glm5_flag = "" - else: - raise ValueError( - f"Unsupported hidden_in.shape[-1]: {dim}. " - f"rmsnorm_quant supports {DIM_DEEPSEEK_V3_2} (deepseek_v3_2) or {DIM_GLM_5} (glm_5)." 
- ) + if profile_logs is None: + raise ValueError("profile_logs is required when calling rmsnorm_quant.") + if quant_hidden_out is None or quant_hidden_scale_out is None: - quant_flag = "" - quant_args = [hidden_in, gamma_in, hidden_out, profile_logs] + torch.ops.tilert.rmsnorm_op( + hidden_in, + gamma_in, + hidden_out, + model_arch, + compute_kernel_type, + profile_logs, + ) else: - quant_flag = "_quant" - quant_args = [ + torch.ops.tilert.rmsnorm_quant_op( hidden_in, gamma_in, hidden_out, quant_hidden_out, quant_hidden_scale_out, + model_arch, + compute_kernel_type, profile_logs, - ] - if profile_logs is None: - raise ValueError("profile_logs is required when calling rmsnorm_quant.") - func_name = f"rmsnorm{quant_flag}{glm5_flag}_op" - func_call = getattr(torch.ops.tilert, func_name) - func_call(*quant_args) + torch.empty(0, dtype=torch.int64, device=hidden_in.device), + ) diff --git a/python/models/deepseek_v3_2/ops/rmsnorm_up_gate_silu.py b/tilert/models/deepseek_v3_2/ops/rmsnorm_up_gate_silu.py similarity index 93% rename from python/models/deepseek_v3_2/ops/rmsnorm_up_gate_silu.py rename to tilert/models/deepseek_v3_2/ops/rmsnorm_up_gate_silu.py index e2f5c59..db991da 100644 --- a/python/models/deepseek_v3_2/ops/rmsnorm_up_gate_silu.py +++ b/tilert/models/deepseek_v3_2/ops/rmsnorm_up_gate_silu.py @@ -13,7 +13,6 @@ ExpertSelectUpGateSiLU, ExpertSelectUpGateSiLUWeightsConverter, ) -from tilert.profiler.utils import parse_profile_log_tensor from tilert.utils import get_profile_log_tensor __all__ = [ @@ -30,6 +29,7 @@ def rmsnorm_up_gate_silu( weights_in: torch.Tensor, hidden_out: torch.Tensor, profile_logs: torch.Tensor, + model_arch: str, compute_kernel_type: str = "fp8mma", ) -> None: """rmsnorm_up_gate_silu operation.""" @@ -38,8 +38,9 @@ def rmsnorm_up_gate_silu( gamma_in, weights_in, hidden_out, - profile_logs, + model_arch, compute_kernel_type, + profile_logs, ) @@ -48,6 +49,7 @@ class RMSNormUpGateSiLUAlgorithm(Enum): FP8MMA = "fp8mma" FP16MMA = "fp16mma" 
+ BF16MMA = "bf16mma" RMSNormUpGateSiLUWeightsConverter = ExpertSelectUpGateSiLUWeightsConverter @@ -81,6 +83,15 @@ def __call__(self) -> list[str]: class RMSNormUpGateSiLU(TileRTModule): """RMSNormUpGateSiLU module""" + _SUPPORTED_ALGORITHMS = { + "deepseek_v3_2": [ + RMSNormUpGateSiLUAlgorithm.FP8MMA, + RMSNormUpGateSiLUAlgorithm.FP16MMA, + RMSNormUpGateSiLUAlgorithm.BF16MMA, + ], + "glm_5": [RMSNormUpGateSiLUAlgorithm.FP8MMA, RMSNormUpGateSiLUAlgorithm.FP16MMA], + } + def __init__( self, model_args: ModelArgs, @@ -102,38 +113,31 @@ def __init__( self.moe_inter_dim = self.model_args.moe_inter_dim self.moe_inter_dim_per_device = self.moe_inter_dim // self.num_devices self.inter_dim_per_device = self.inter_dim // self.num_devices - # effective number of experts self.n_experts = self.inter_dim_per_device // self.moe_inter_dim_per_device self.eps = self.model_args.eps self.block_size = self.model_args.block_size self.algorithm = algorithm - # reference weights self.ref_norm_gamma: torch.Tensor | None = None self.ref_gate: torch.Tensor | None = None self.ref_up: torch.Tensor | None = None - # tilert weights self.tilert_norm_gamma: torch.Tensor | None = None self.tilert_weights: torch.Tensor | None = None - # for compatibility, to be removed in the future self.tilert_scales = torch.zeros( 9, 4, 64, dtype=torch.bfloat16, device=torch.device("cuda") ) - # tilert vars self.hidden_out: torch.Tensor | None = None self.profile_logs: torch.Tensor | None = None self.is_init = False - # tilert_funcs self.rmsnorm_up_gate_silu_func = rmsnorm_up_gate_silu self.tilert_weights_alias = RMSNormUpGateSiLUTilertWeightsAlias() - # reference tensor aliases self.ref_tensor_alias: list[str] = [ "post_attention_layernorm.weight", "mlp.gate_proj.weight", @@ -158,7 +162,7 @@ def get_weights_list(self) -> list[torch.Tensor]: def device_sharding( self, weights_dict: dict[str, torch.Tensor], - key_prefix: str, # e.g. 
model.layers.{layer_id}.mlp + key_prefix: str, ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: """ Device sharding. @@ -176,7 +180,6 @@ def device_sharding( elif key_prefix == "mlp": rmsnorm_gamma_key = "post_attention_layernorm.weight" rmsnorm_gamma = weights_dict[rmsnorm_gamma_key] - # repeat rmsnorm_gamma for each device rmsnorm_gamma = rmsnorm_gamma[None, :].repeat(self.num_devices, 1) gate_weights, gate_scales, up_weights, up_scales = ( @@ -186,7 +189,6 @@ def device_sharding( self.num_devices, ) ) - # Transpose split so to match the old convertcode gate_weights = gate_weights.reshape(self.n_experts, self.num_devices, -1, self.dim) gate_weights = gate_weights.transpose(0, 1) gate_scales = gate_scales.reshape( @@ -210,7 +212,7 @@ def device_sharding( def init_reference_weights( self, state_dict: dict[str, torch.Tensor], - key_prefix: str, # e.g. model.layers.{layer_id}.mlp + key_prefix: str, device_id: int = 0, ) -> None: """ @@ -259,7 +261,6 @@ def init_tilert_vars(self, batch_size: int, seq_len: int, dev_id: int = 0) -> No batch_size: Batch size. seq_len: Sequence length. """ - # tilert vars self.hidden_out = torch.zeros( ( batch_size, @@ -274,13 +275,15 @@ def init_tilert_vars(self, batch_size: int, seq_len: int, dev_id: int = 0) -> No self.profile_logs = get_profile_log_tensor(device=f"cuda:{dev_id}") self.is_init = True - def init_random_weights(self, dev_id: int = 0) -> None: + def init_random_weights(self, dev_id: int | None = None) -> None: """ Initialize the random weights. 
Returns: None """ + if dev_id is None: + dev_id = self.device_id gamma = torch.randn(self.dim, dtype=torch.float32, device=f"cuda:{dev_id}") gate_weights = torch.randn( self.inter_dim, self.dim, dtype=torch.bfloat16, device=f"cuda:{dev_id}" @@ -326,7 +329,6 @@ def golden_forward( ) hidden_out_list = [] for s in range(seq_len): - # ref up-gate silu hidden_out_w1_list = [] hidden_out_w3_list = [] @@ -356,12 +358,9 @@ def tilert_forward( self.tilert_weights, self.hidden_out, self.profile_logs, - self.algorithm.value, + model_arch=self.model_args.arch_name, + compute_kernel_type=self.algorithm.value, ) - if self.flag_enable_profiling_log: - parse_profile_log_tensor( - self.profile_logs, self.get_profile_log_path(), [(self.op_name, 0.0)] - ) return self.hidden_out def __call__( diff --git a/tilert/models/deepseek_v3_2/ops/rotate.py b/tilert/models/deepseek_v3_2/ops/rotate.py new file mode 100644 index 0000000..19c2746 --- /dev/null +++ b/tilert/models/deepseek_v3_2/ops/rotate.py @@ -0,0 +1,226 @@ +"""Rotate(hadamard transform) operation module.""" + +from dataclasses import dataclass +from enum import Enum + +import torch +import torch.nn.functional as F + +from tilert.models.base import TileRTModule +from tilert.models.deepseek_v3_2.model_args import ModelArgs +from tilert.models.utils import apply_rotary_emb +from tilert.utils import get_profile_log_tensor + +try: + from fast_hadamard_transform import hadamard_transform + + def rotate_activation(x: torch.Tensor) -> torch.Tensor: + assert x.dtype == torch.bfloat16 + hidden_size = x.size(-1) + return hadamard_transform(x, scale=hidden_size**-0.5) + +except ImportError: + print( + "Cannot import hadamard_transform, fallback to scipy.linalg.hadamard." + "please install fast_hadamard_transform for correct performance." 
+ ) + import math + + from scipy.linalg import hadamard + + def hadamard_transform_ref(x: torch.Tensor, scale: float = 1.0) -> torch.Tensor: + x_shape = x.shape + dim = x.shape[-1] + x = x.reshape(-1, dim) + log_dim = math.ceil(math.log2(dim)) + dim_padded = 2**log_dim + if dim != dim_padded: + x = F.pad(x, (0, dim_padded - dim)) + out = F.linear( + x, + torch.tensor(hadamard(dim_padded, dtype=float), dtype=x.dtype, device=x.device), + ) + out = out * scale + return out[..., :dim].reshape(*x_shape) + + def rotate_activation(x: torch.Tensor) -> torch.Tensor: + assert x.dtype == torch.bfloat16 + hidden_size = x.size(-1) + return hadamard_transform_ref(x, scale=hidden_size**-0.5) + + +__all__ = [ + "rotate", + "rotate_activation", + "Rotate", + "RotateRefWeightsAlias", + "RotateTilertWeightsAlias", +] + + +def rotate( + input_raw: torch.Tensor, + output_raw: torch.Tensor, + freqs_cis_raw: torch.Tensor, + profile_logs: torch.Tensor, + model_arch: str, + compute_kernel_type: str = "general", + kv_cache: torch.Tensor | None = None, + cur_pos: torch.Tensor | None = None, + cache_base: int = 0, + cache_stride: int = 0, + cache_compressed: bool = False, +) -> None: + """ + Rotate (hadamard transform) operation. + + Args: + input_raw (torch.Tensor): The input tensor [..., head, 128]. + output_raw (torch.Tensor): The output tensor where the result will be stored. + freqs_cis_raw (torch.Tensor): The frequency tensor. + profile_logs (torch.Tensor): Tensor for storing profiling logs. + model_arch: Model architecture string. + compute_kernel_type: Compute kernel type string. + kv_cache: Optional cache write target. + cur_pos: Optional [1] int32 tensor. Required when kv_cache is set. + cache_base: Base row index. + cache_stride: Stride. Must be > 0 when ``cache_compressed=True``. + cache_compressed: Cache write mode selector. 
class Rotate(TileRTModule):
    """Weight-free rotation of index queries.

    Per the original module description: RoPE is applied to the first
    ``qk_rope_head_dim`` channels of every head and the result is passed
    through ``rotate_activation`` (described upstream as a hadamard
    transform).  The same module serves deepseek_v3_2 (index_n_heads=64) and
    glm_5 (index_n_heads=32); all sizes are read from ``model_args``.
    """

    # Both architectures expose only the GENERAL kernel.
    _SUPPORTED_ALGORITHMS = {
        arch: [RotateAlgorithm.GENERAL] for arch in ("deepseek_v3_2", "glm_5")
    }

    def __init__(
        self,
        model_args: ModelArgs,
        num_devices: int = 1,
        device_id: int = 0,
        ref_weights_alias: RotateRefWeightsAlias | None = None,
    ):
        """Record the index-head dimensions; no weights are created.

        Args:
            model_args: Supplies ``qk_rope_head_dim``, ``index_n_heads`` and
                ``index_head_dim``.
            num_devices: Forwarded to ``TileRTModule``.
            device_id: Forwarded to ``TileRTModule``.
            ref_weights_alias: Optional alias object; a default (empty) one is
                created when omitted.
        """
        super().__init__(
            type(self).__name__,
            model_args=model_args,
            num_devices=num_devices,
            device_id=device_id,
        )
        if ref_weights_alias is None:
            ref_weights_alias = RotateRefWeightsAlias()
        self.tilert_weights_alias = RotateTilertWeightsAlias()
        self.ref_weights_alias = ref_weights_alias

        self.qk_rope_head_dim = model_args.qk_rope_head_dim
        self.index_n_heads = model_args.index_n_heads
        self.index_head_dim = model_args.index_head_dim

        # Both buffers are created lazily by ``init_tilert_vars``.
        self.output: torch.Tensor | None = None
        self.profile_logs: torch.Tensor | None = None

    def get_weights_list(self) -> list[torch.Tensor]:
        """Return an empty list: this module owns no weights."""
        return []

    def device_sharding(self, weights_map: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
        """Return an empty mapping; ``weights_map`` is ignored (nothing to shard)."""
        return {}

    def init_reference_weights(self, state_dict: dict[str, torch.Tensor]) -> None:
        """No-op; ``state_dict`` is ignored because there are no weights."""

    def init_tilert_weights(self, state_dict: dict[str, torch.Tensor]) -> None:
        """No-op; ``state_dict`` is ignored because there are no weights."""

    def init_random_weights(self) -> None:
        """No-op; there are no weights to randomize."""

    def init_tilert_vars(self, batch_size: int, seq_len: int) -> None:
        """Allocate the bf16 output buffer and the profiling-log tensor.

        Args:
            batch_size: Leading dimension of the output buffer.
            seq_len: Second dimension of the output buffer.
        """
        # NOTE(review): no explicit device is passed here, so placement follows
        # torch's current default device - confirm that is intended.
        out_shape = (batch_size, seq_len, self.index_n_heads, self.index_head_dim)
        self.output = torch.zeros(out_shape, dtype=torch.bfloat16)
        self.profile_logs = get_profile_log_tensor()
        self.is_init = True

    def golden_forward(
        self,
        idx_q: torch.Tensor,
        freqs_cis: torch.Tensor,
    ) -> torch.Tensor:
        """Reference path implemented with plain PyTorch helpers.

        Args:
            idx_q: Index queries whose last dimension has ``index_head_dim``
                channels.
            freqs_cis: Rotary frequencies handed to ``apply_rotary_emb``.

        Returns:
            ``rotate_activation`` applied to the queries after RoPE on their
            first ``qk_rope_head_dim`` channels.
        """
        rope_dim = self.qk_rope_head_dim
        rope_part, pass_part = idx_q.split([rope_dim, self.index_head_dim - rope_dim], dim=-1)
        rope_part = apply_rotary_emb(rope_part, freqs_cis, interleaved=False)
        return rotate_activation(torch.cat([rope_part, pass_part], dim=-1))

    def tilert_forward(self, idx_q: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
        """Run the TileRT ``rotate`` kernel into the preallocated output buffer.

        Args:
            idx_q: Index queries.
            freqs_cis: Complex rotary frequencies; they are re-expressed as
                real (re, im) pairs merged into the last dimension before the
                kernel call.

        Returns:
            ``self.output`` (the same tensor object on every call).
        """
        assert self.output is not None
        assert self.profile_logs is not None
        as_real = torch.view_as_real(freqs_cis)
        freqs_cis_real = as_real.reshape(*freqs_cis.shape[:-1], -1)
        rotate(
            idx_q,
            self.output,
            freqs_cis_real,
            self.profile_logs,
            model_arch=self.model_args.arch_name,
        )
        return self.output
+ f"Unsupported head size: {head}. Sparse index topk fused op " + "supports head number of 64 (DSV3.2)." ) if dim != 128: raise ValueError("dim must be 128, as we precompute scale inner kernel") @@ -118,7 +122,7 @@ def sparse_index_topk( f"q={device}, kv={kv.device}, weights={weights.device}, " f"logits={logits.device}, profile_logs={profile_logs.device}" ) - workspace = torch.zeros(seqlen, (200 * 1024 + 258), dtype=torch.int32, device=device) - torch.ops.tilert.sparse_index_topk_glm5_op( + workspace = torch.zeros(seqlen, (200 * 1024 + 260), dtype=torch.int32, device=device) + torch.ops.tilert.sparse_index_topk_dsv32_op( q, kv, weights, logits, cur_pos, indices, workspace, profile_logs ) diff --git a/tilert/models/deepseek_v3_2/ops/topk.py b/tilert/models/deepseek_v3_2/ops/topk.py new file mode 100644 index 0000000..49c58eb --- /dev/null +++ b/tilert/models/deepseek_v3_2/ops/topk.py @@ -0,0 +1,171 @@ +"""topk operations module.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import torch +import torch.nn as nn + +from tilert.utils import get_profile_log_tensor + +if TYPE_CHECKING: + from tilert.models.deepseek_v3_2.model_args import ModelArgs + + +__all__ = [ + "TopK", + "topk_approximate", + "topk_accurate", +] + + +def topk_approximate( + logits: torch.Tensor, + seq_len: int, + topk: int, + profile_logs: torch.Tensor, + model_arch: str, + compute_kernel_type: str = "general", +) -> torch.Tensor: + """ + Topk approximate operation. + + Topk approximate the input tensor `logits` and stores the result in `output_raw`. + + Args: + logits (torch.Tensor): The input tensor. + seq_len (int): valid data of logits.shape[-1] + topk (int): The number of topk to approximate. + profile_logs (torch.Tensor): The profile logs tensor. + + Returns: + indices (torch.Tensor): The output tensor. 
def topk_accurate(
    logits: torch.Tensor,
    seq_len: int,
    topk: int,
    profile_logs: torch.Tensor,
    model_arch: str,
    compute_kernel_type: str = "general",
    ratio: int = 1,
) -> torch.Tensor:
    """
    Topk accurate operation.

    Runs the accurate TileRT top-k kernel on `logits` and returns the selected
    indices in a freshly allocated tensor.

    Args:
        logits (torch.Tensor): The input tensor. Must be float32 with a
            leading batch dimension of 1; ``logits.shape[1]`` is the number of
            samples.
        seq_len (int): length of last samples,
            for k=logits.shape[1] samples, the length is
            seq-k+1, seq-k+2, ..., seq-1, seq
        topk (int): The number of indices to select (512, 1024 or 2048).
        profile_logs (torch.Tensor): The profile logs tensor.
        model_arch (str): Model architecture name forwarded to the kernel.
        compute_kernel_type (str): Kernel variant forwarded to the kernel.
        ratio (int): Token-domain to logits-trailing-dim compression factor.

    Returns:
        indices (torch.Tensor): int32 tensor of shape (num_samples, topk).

    Raises:
        ValueError: If `logits` is not float32, has fewer than two dimensions
            or a batch size other than 1, or if `topk` is unsupported.
    """
    if logits.dtype != torch.float32:
        raise ValueError("logits must be a float32 tensor.")

    if topk not in (512, 1024, 2048):
        raise ValueError("topk must be 512, 1024, or 2048.")

    if logits.dim() < 2:
        raise ValueError("logits must have at least 2 dimensions (batch, num_samples, ...).")

    # Raise instead of assert: asserts vanish under ``python -O`` and the
    # sibling ``topk_approximate`` already reports this as a ValueError.
    if logits.shape[0] != 1:
        raise ValueError("batch must be 1 in this version")
    num_samples = logits.shape[1]

    indices = torch.zeros(num_samples, topk, dtype=torch.int32, device=logits.device)
    # Scratch buffer handed to the kernel next to the output.
    # NOTE(review): the (1, num_samples, 4, 2 * topk) layout is dictated by the
    # kernel and is not documented in this file.
    indices_ws = torch.zeros(1, num_samples, 4, topk * 2, dtype=torch.int32, device=logits.device)
    torch.ops.tilert.topk_accurate_op(
        logits,
        indices,
        # The kernel receives the length offset shared by all samples
        # (``seq_len`` minus the sample count), not ``seq_len`` itself.
        seq_len - num_samples,
        indices_ws,
        model_arch,
        compute_kernel_type,
        profile_logs,
        ratio,
    )

    return indices
+ + Wraps topk_accurate / topk_approximate and provides golden_forward + (reference implementation) and tilert_forward (TileRT kernel). + """ + + def __init__(self, use_approximate: bool = False, model_args: ModelArgs | None = None) -> None: + super().__init__() + self.use_approximate = use_approximate + if model_args is None: + from tilert.models.deepseek_v3_2.model_args import ModelArgs + + model_args = ModelArgs() + self.model_args = model_args + + def golden_forward( + self, + logits: torch.Tensor, + topk: int, + ) -> torch.Tensor: + """Reference forward: torch.topk on the last dimension. + + Args: + logits: Scores tensor, shape (batch, ..., seq_len). + topk: Number of top indices to return. + + Returns: + Indices of top-k values along the last dimension. + """ + seq_len = logits.shape[-1] + return logits.topk(min(topk, seq_len), dim=-1)[1] + + def tilert_forward( + self, + logits: torch.Tensor, + topk: int, + ) -> torch.Tensor: + """Tilert forward: batch of samples with varying valid length. + + Args: + logits: Shape (batch, num_samples, cache_len). + topk: Number of top indices to return. + + Returns: + Indices tensor of shape (batch, num_samples, topk). 
+ """ + profile_logs = get_profile_log_tensor(device=logits.device) + cache_len = logits.shape[-1] + if self.use_approximate: + indices = topk_approximate( + logits, cache_len, topk, profile_logs, model_arch=self.model_args.arch_name + ) + else: + indices = topk_accurate( + logits, cache_len, topk, profile_logs, model_arch=self.model_args.arch_name + ) + if indices.dim() == 2: + return indices.unsqueeze(0) + return indices diff --git a/tilert/models/deepseek_v3_2/ops/unproj_o_allreduce.py b/tilert/models/deepseek_v3_2/ops/unproj_o_allreduce.py new file mode 100644 index 0000000..33520b5 --- /dev/null +++ b/tilert/models/deepseek_v3_2/ops/unproj_o_allreduce.py @@ -0,0 +1,526 @@ +"""UnprojOAllreduce operation module.""" + +import math +from dataclasses import dataclass +from enum import Enum + +import torch + +from tilert.models.base import TileRTModule, TilertWeightsConverter +from tilert.models.common import weight_dequant +from tilert.models.deepseek_v3_2.model_args import ModelArgs +from tilert.utils import get_profile_log_tensor + +__all__ = [ + "unproj_o_allreduce", + "UnProjOAllReduce", + "UnProjOAllReduceAlgorithm", + "UnProjOAllReduceRefWeightsAlias", + "UnProjOAllReduceTilertWeightsAlias", +] + + +def unproj_o_allreduce( + vec_in: torch.Tensor, + mat_in: torch.Tensor, + mat_scale: torch.Tensor, + x_in: torch.Tensor, + flag: int, + vec_out: torch.Tensor, + profile_logs: torch.Tensor, + model_arch: str, + compute_kernel_type: str = "bf16", +) -> None: + """ + Fused operation of unprojection and allreduce. + + Args: + vec_in: Input tensor. + mat_in: Input tensor. + mat_scale: Input tensor. + x_in: Input tensor. + flag: Input flag. + vec_out: Output tensor. + profile_logs: Profile logs tensor. + model_arch: Model architecture ("deepseek_v3_2" or "glm_5"). + compute_kernel_type: Compute kernel type ("bf16", "fp16mma"). 
@dataclass
class UnProjOAllReduceRefWeightsAlias:
    """Reference weights alias for UnProjOAllReduce.

    The attributes are annotated so that ``@dataclass`` turns them into real
    fields: the defaults are unchanged, and a checkpoint with different key
    names can be supported by passing overrides to the constructor.
    """

    # State-dict key of the o_proj weight.
    o_proj_weight: str = "self_attn.o_proj.weight"
    # State-dict key of the matching scale tensor.
    o_proj_scale_inv: str = "self_attn.o_proj.weight_scale_inv"

    @property
    def ref_tensor_alias(self) -> list[str]:
        """Alias names in the order [weight, scale]."""
        return [self.o_proj_weight, self.o_proj_scale_inv]

    def __call__(self) -> list[str]:
        """Return ``ref_tensor_alias``."""
        return self.ref_tensor_alias


@dataclass
class UnProjOAllReduceTilertWeightsAlias:
    """TileRT weights alias for UnProjOAllReduce.

    As with the reference alias, the annotated attributes are dataclass
    fields and may be overridden per instance.
    """

    # Key of the per-device weight shard.
    unproj_weights: str = "unproj_weights"
    # Key of the per-device scale shard.
    unproj_scales: str = "unproj_scales"

    @property
    def tilert_tensor_alias(self) -> list[str]:
        """Alias names in the order [weights, scales]."""
        return [self.unproj_weights, self.unproj_scales]

    def __call__(self) -> list[str]:
        """Return ``tilert_tensor_alias``."""
        return self.tilert_tensor_alias
= dim_per_sm % 16 + stages = vec_dim // 512 + vec_scale_dim = vec_dim // block_size + scale_per_stage = vec_scale_dim // stages + + dim_scale_dim = dim // block_size + scales_per_full_tile = 2 if remainder_rows > 0 else 1 + rem_scales = 1 if remainder_rows > 0 else 0 + total_scale_slots = (full_tiles * scales_per_full_tile + rem_scales) * scale_per_stage + repeat_factor = 8 if remainder_rows == 0 else 16 + + sc = scales.reshape(dim_scale_dim, 1, vec_scale_dim) + sc = sc.repeat(1, repeat_factor, 1) + scales_per_cta = full_tiles * scales_per_full_tile + rem_scales + sc = ( + sc.reshape(sms, scales_per_cta, stages, scale_per_stage) + .transpose(1, 2) + .reshape(sms, stages, total_scale_slots) + .view(torch.float8_e4m3fn) + ) + sc_packed = sc + + mat_per_sm = mat.reshape(sms, dim_per_sm, vec_dim) + + full_rows = full_tiles * 16 + mat_full = ( + mat_per_sm[:, :full_rows, :] + .reshape(sms, full_tiles, 16, stages, 512) + .transpose(2, 3) + .reshape(sms, full_tiles, stages, 16, 32, 16) + .transpose(3, 4) + .reshape(sms, full_tiles, stages, 32, 16, 16) + ) + mat_full = UnProjOAllReduceWeightsConverter._swizzle_mma_16x16(mat_full) + mat_full = mat_full.transpose(1, 2).reshape(sms, stages, -1) + + if remainder_rows > 0: + mat_rem_raw = mat_per_sm[:, full_rows:, :] + mat_rem_padded = torch.zeros( + sms, 16, vec_dim, dtype=mat_rem_raw.dtype, device=mat_rem_raw.device + ) + mat_rem_padded[:, :remainder_rows, :] = mat_rem_raw + mat_rem = ( + mat_rem_padded.reshape(sms, 1, 16, stages, 512) + .transpose(2, 3) + .reshape(sms, 1, stages, 16, 32, 16) + .transpose(3, 4) + .reshape(sms, 1, stages, 32, 16, 16) + ) + mat_rem = UnProjOAllReduceWeightsConverter._swizzle_mma_16x16(mat_rem) + mat_rem = mat_rem.transpose(1, 2).reshape(sms, stages, -1) + mat_combined = torch.cat([mat_full, mat_rem], dim=-1) + else: + mat_combined = mat_full + + scales_padding = torch.zeros( + sms, + stages, + 128 - sc_packed.shape[-1], + dtype=torch.float8_e4m3fn, + device=mat.device, + ) + mat_all = 
torch.cat([mat_combined, sc_packed, scales_padding], dim=-1).contiguous() + dummy_scales = torch.zeros(1, dtype=torch.float32, device=mat.device) + return mat_all, dummy_scales + + def convert_to_bf16mma( + self, + weights_list: list[torch.Tensor], + ) -> tuple[torch.Tensor, torch.Tensor]: + """Convert common weights to the BF16 MMA layout.""" + assert ( + self.model_args.arch_name == "deepseek_v3_2" + ), "BF16 MMA dispatch is wired only for DeepSeek-V3.2 DevGroupB." + return self.convert_to_fp16mma_128cta(weights_list) + + def convert_to_fp16mma( + self, + weights_list: list[torch.Tensor], + ) -> tuple[torch.Tensor, torch.Tensor]: + """Convert common weights to TileRT FP16 MMA layout.""" + if self.model_args.arch_name == "deepseek_v3_2": + return self.convert_to_fp16mma_128cta(weights_list) + assert self.model_args.arch_name == "glm_5", "Only GLM-5 and DSV3.2 support FP16 MMA" + + with torch.inference_mode(): + mat, scales = weights_list + if scales.dtype != torch.float32: + print( + "Warning: UnProjOAllReduceWeightsConverter: " + + f"scales.dtype: {scales.dtype} " + + "is not float32, convert to float32." 
class UnProjOAllReduce(TileRTModule):
    """Attention output un-projection (``o_proj``) fused with an all-reduce.

    The module shards the block-scaled ``o_proj`` weight across
    ``num_devices`` along its column (head) axis and offers two paths:

    * ``golden_forward`` - PyTorch matmul against this device's dequantized
      shard (also what ``__call__`` runs);
    * ``tilert_forward`` - the fused TileRT kernel ``unproj_o_allreduce``.
    """

    _SUPPORTED_ALGORITHMS = {
        "deepseek_v3_2": [
            UnProjOAllReduceAlgorithm.FP16MMA,
            UnProjOAllReduceAlgorithm.BF16MMA,
        ],
        "glm_5": [
            UnProjOAllReduceAlgorithm.FP16MMA,
        ],
    }

    def __init__(
        self,
        model_args: ModelArgs,
        num_devices: int,
        device_id: int = 0,
        ref_weights_alias: UnProjOAllReduceRefWeightsAlias | None = None,
        tilert_weights_alias: UnProjOAllReduceTilertWeightsAlias | None = None,
        algorithm: UnProjOAllReduceAlgorithm = UnProjOAllReduceAlgorithm.FP16MMA,
    ):
        """Read the dimensions from ``model_args`` and derive the per-device head count.

        Args:
            model_args: Supplies ``arch_name``, ``dim``, ``n_heads``,
                ``v_head_dim`` and ``block_size``.
            num_devices: Number of devices the heads are split across.
            device_id: Index of the device this instance serves.
            ref_weights_alias: Key names of the reference checkpoint tensors;
                defaults to ``UnProjOAllReduceRefWeightsAlias()``.
            tilert_weights_alias: Key names of the per-device TileRT tensors;
                defaults to ``UnProjOAllReduceTilertWeightsAlias()``.
            algorithm: Kernel variant; its ``value`` is passed to the kernel as
                ``compute_kernel_type``.
        """
        super().__init__(
            self.__class__.__name__,
            model_args=model_args,
            num_devices=num_devices,
            device_id=device_id,
        )

        self.tilert_weights_alias = (
            tilert_weights_alias
            if tilert_weights_alias is not None
            else UnProjOAllReduceTilertWeightsAlias()
        )
        self.ref_weights_alias = (
            ref_weights_alias
            if ref_weights_alias is not None
            else UnProjOAllReduceRefWeightsAlias()
        )

        self.arch_name = self.model_args.arch_name
        self.dim = self.model_args.dim
        self.n_heads = self.model_args.n_heads
        self.head_dim = self.model_args.v_head_dim

        if self.n_heads % self.num_devices == 0:
            self.num_local_heads = self.n_heads // self.num_devices
        else:
            # Uneven split: every device gets the rounded-up head count, bumped
            # to an even number; ``device_sharding`` zero-fills the surplus.
            # NOTE(review): the even-count requirement presumably comes from
            # the kernel - confirm.
            n_local = math.ceil(self.n_heads / self.num_devices)
            if n_local % 2 != 0:
                n_local += 1
            self.num_local_heads = n_local

        self.block_size = self.model_args.block_size
        self.algorithm: UnProjOAllReduceAlgorithm = algorithm

        # Dequantized shard used by ``golden_forward``.
        self.ref_unproj_o: torch.Tensor | None = None

        # Converter outputs consumed by ``tilert_forward``.
        self.tilert_weights: torch.Tensor | None = None
        self.tilert_scales: torch.Tensor | None = None

        # Output buffer of the kernel, allocated by ``init_tilert_vars``.
        self.hidden_out: torch.Tensor | None = None

        self.profile_logs: torch.Tensor | None = None
        self.is_var_init = False

    def get_weights_list(self) -> list[torch.Tensor]:
        """
        Get the weights list.

        Returns:
            ``[tilert_weights, tilert_scales]``; both entries are ``None``
            until ``init_tilert_weights`` has run.
        """
        return [self.tilert_weights, self.tilert_scales]

    def device_sharding(self, weights_map: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
        """
        Device sharding.

        Splits the weight and its scales along the column axis into one shard
        per device. When ``n_heads`` is not divisible by ``num_devices`` each
        shard holds ``num_local_heads`` head slots and unused slots stay zero.

        Args:
            weights_map: Map from ref weight alias to tensor (full model).

        Returns:
            Map from tilert weight alias to (num_devices, ...) tensors.
        """
        unproj_o_weight = weights_map[self.ref_weights_alias.o_proj_weight]
        unproj_o_scale = weights_map[self.ref_weights_alias.o_proj_scale_inv]

        if self.n_heads % self.num_devices == 0:
            # Even split: carve the columns into num_devices equal chunks and
            # move the device axis to the front.
            unproj_o_weight = unproj_o_weight.reshape(self.dim, self.num_devices, -1)
            unproj_o_weight = unproj_o_weight.transpose(0, 1)
            unproj_o_scale = unproj_o_scale.reshape(
                self.dim // self.block_size, self.num_devices, -1
            )
            unproj_o_scale = unproj_o_scale.transpose(0, 1)
        else:
            cols_per_head = self.head_dim
            cols_per_dev = self.num_local_heads * cols_per_head
            W = unproj_o_weight.view(self.dim, self.n_heads, cols_per_head)

            scale_cols_per_head = cols_per_head // self.block_size
            scale_cols_per_dev = self.num_local_heads * scale_cols_per_head
            S = unproj_o_scale.view(self.dim // self.block_size, self.n_heads, scale_cols_per_head)

            W_devs = []
            S_devs = []
            for dev in range(self.num_devices):
                start = dev * self.num_local_heads
                end = min(self.n_heads, start + self.num_local_heads)
                # Number of real heads on this device; may be 0 for trailing
                # devices once the padded head count overshoots n_heads.
                real = max(0, end - start)

                dev_W = torch.zeros(
                    self.dim,
                    self.num_local_heads,
                    cols_per_head,
                    dtype=W.dtype,
                    device=W.device,
                )
                if real > 0:
                    dev_W[:, :real] = W[:, start:end]
                W_devs.append(dev_W.reshape(self.dim, cols_per_dev))

                dev_S = torch.zeros(
                    self.dim // self.block_size,
                    self.num_local_heads,
                    scale_cols_per_head,
                    dtype=S.dtype,
                    device=S.device,
                )
                if real > 0:
                    dev_S[:, :real] = S[:, start:end]
                S_devs.append(dev_S.reshape(self.dim // self.block_size, scale_cols_per_dev))

            unproj_o_weight = torch.stack(W_devs, dim=0)
            unproj_o_scale = torch.stack(S_devs, dim=0)

        return {
            self.tilert_weights_alias.unproj_weights: unproj_o_weight.contiguous(),
            self.tilert_weights_alias.unproj_scales: unproj_o_scale.contiguous(),
        }

    def init_reference_weights(
        self,
        state_dict: dict[str, torch.Tensor],
        device_id: int | None = None,
    ) -> None:
        """
        Initialize the reference weights.

        Shards the full tensors, picks one device's shard and stores its
        dequantized form in ``self.ref_unproj_o``.

        Args:
            state_dict: State dictionary keyed by ref weight alias (full model).
            device_id: Device ID for this shard; defaults to self.device_id.
        """
        did = self.device_id if device_id is None else device_id
        sharded = self.device_sharding(state_dict)
        weights = sharded[self.tilert_weights_alias.unproj_weights][did]
        scales = sharded[self.tilert_weights_alias.unproj_scales][did]
        self.ref_unproj_o = weight_dequant(weights, scales)

    def init_tilert_weights(self, state_dict: dict[str, torch.Tensor]) -> None:
        """
        Initialize the tilert weights.

        Converts the per-device tensors into the layout selected by
        ``self.algorithm`` via ``UnProjOAllReduceWeightsConverter``.

        Args:
            state_dict: State dictionary keyed by tilert weight alias (per-device).
        """
        assert self.algorithm is not None, "Algorithm is not set"
        self.tilert_weights, self.tilert_scales = UnProjOAllReduceWeightsConverter(
            self.model_args, self.num_devices
        ).dispatch(
            self.algorithm,
            [state_dict[alias] for alias in self.tilert_weights_alias()],
        )

    def init_tilert_vars(self, batch_size: int, seq_len: int) -> None:
        """
        Initialize the tilert variables.

        Allocates the bf16 output buffer and the profiling-log tensor on
        ``cuda:{device_id}``.

        Args:
            batch_size: Batch size.
            seq_len: Sequence length.
        """
        self.hidden_out = torch.zeros(
            (batch_size, seq_len, self.dim),
            dtype=torch.bfloat16,
            device=f"cuda:{self.device_id}",
        )
        self.profile_logs = get_profile_log_tensor(device=f"cuda:{self.device_id}")
        self.is_var_init = True

    def init_random_weights(self) -> None:
        """Create random full-model weights and initialize both code paths from them."""
        unproj_o_weights = torch.randn(
            self.dim,
            self.n_heads * self.head_dim,
            dtype=torch.bfloat16,
            device=f"cuda:{self.device_id}",
        ).to(torch.float8_e4m3fn)

        head_scale_dim = self.head_dim // self.block_size
        dim_scale_dim = self.dim // self.block_size
        # The scale dtype depends on the architecture: float32 for glm_5,
        # bfloat16 otherwise.
        scale_dtype = torch.float32 if self.arch_name == "glm_5" else torch.bfloat16
        unproj_o_scales = torch.randn(
            dim_scale_dim,
            self.n_heads * head_scale_dim,
            dtype=scale_dtype,
            device=f"cuda:{self.device_id}",
        )
        ref_state_dict = {
            self.ref_weights_alias.o_proj_weight: unproj_o_weights,
            self.ref_weights_alias.o_proj_scale_inv: unproj_o_scales,
        }

        self.init_reference_weights(ref_state_dict)
        sharded = self.device_sharding(ref_state_dict)
        per_device_state = {k: v[self.device_id] for k, v in sharded.items()}
        self.init_tilert_weights(per_device_state)

    def golden_forward(
        self,
        vec_in: torch.Tensor,
    ) -> torch.Tensor:
        """
        Reference forward pass of the output un-projection for this shard.

        Computes ``vec_in @ ref_unproj_o.T`` in float32 and casts the result
        to bfloat16. No all-reduce and no residual add are performed here.

        Args:
            vec_in: Input tensor with batch size 1; everything after the first
                two dimensions is flattened before the matmul.

        Returns:
            bfloat16 tensor of shape (1, seq_len, dim).
        """
        assert self.ref_unproj_o is not None
        bsz = vec_in.shape[0]
        seq_len = vec_in.shape[1]
        assert bsz == 1
        res = vec_in.reshape(bsz, seq_len, -1).float() @ self.ref_unproj_o.T.float()
        return res.to(torch.bfloat16)

    def tilert_forward(
        self,
        vec_in: torch.Tensor,
        x_in: torch.Tensor,
        flag: int,
    ) -> torch.Tensor:
        """
        Run the fused TileRT kernel into ``self.hidden_out``.

        Args:
            vec_in: Input tensor forwarded to the kernel as ``vec_in``.
            x_in: Tensor forwarded to the kernel as ``x_in``.
            flag: Integer flag forwarded to the kernel unchanged.

        Returns:
            ``self.hidden_out`` (the same tensor object on every call).
        """
        assert self.hidden_out is not None
        assert self.profile_logs is not None
        assert self.algorithm is not None
        # Fail early with a Python error instead of handing None to the op
        # when init_tilert_weights() has not been called.
        assert self.tilert_weights is not None
        assert self.tilert_scales is not None
        unproj_o_allreduce(
            vec_in,
            self.tilert_weights,
            self.tilert_scales,
            x_in,
            flag,
            self.hidden_out,
            self.profile_logs,
            model_arch=self.model_args.arch_name,
            compute_kernel_type=self.algorithm.value,
        )
        return self.hidden_out

    def __call__(
        self,
        vec_in: torch.Tensor,
    ) -> torch.Tensor:
        """Calling the module runs the reference path (``golden_forward``)."""
        return self.golden_forward(vec_in)
""" from .kernel import act_quant, fp8_gemm, weight_dequant diff --git a/tilert/models/deepseek_v3_2/refs/kernel.py b/tilert/models/deepseek_v3_2/refs/kernel.py new file mode 100644 index 0000000..cd68a7c --- /dev/null +++ b/tilert/models/deepseek_v3_2/refs/kernel.py @@ -0,0 +1,306 @@ +import torch + +try: + import tilelang + import tilelang.language as T + + _HAS_TILELANG = True +except ImportError: + _HAS_TILELANG = False + +try: + import triton + import triton.language as tl + + _HAS_TRITON = True +except ImportError: + _HAS_TRITON = False + +__all__ = [ + "weight_dequant", + "act_quant", + "fp8_gemm", +] + +FP8 = "float8_e4m3" +BF16 = "bfloat16" +FP32 = "float32" + + +def _require_tilelang(fn_name: str) -> None: + if not _HAS_TILELANG: + raise ImportError(f"{fn_name} requires tilelang. Install with: pip install tilelang") + + +def _require_triton(fn_name: str) -> None: + if not _HAS_TRITON: + raise ImportError(f"{fn_name} requires triton. Install with: pip install triton") + + +if _HAS_TRITON: + + @triton.jit + def weight_dequant_kernel( # type: ignore + x_ptr, + s_ptr, + y_ptr, + M_Size: tl.constexpr, + N_Size: tl.constexpr, + BLOCK_SIZE: tl.constexpr, + ) -> None: + """ + Weight dequantization kernel. + + Dequantizes weights using the provided scaling factors and stores the + result. + + Args: + x_ptr (tl.pointer): Pointer to the quantized weights. + s_ptr (tl.pointer): Pointer to the scaling factors. + y_ptr (tl.pointer): Pointer to the output buffer for dequantized + weights. + M (int): Number of rows in the weight matrix. + N (int): Number of columns in the weight matrix. + BLOCK_SIZE (tl.constexpr): Size of the block for tiling. 
+ + Returns: + None + """ + pid_m = tl.program_id(axis=0) + pid_n = tl.program_id(axis=1) + n_size = tl.cdiv(N_Size, BLOCK_SIZE) + offs_m = pid_m * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + offs_n = pid_n * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + offs = offs_m[:, None] * N_Size + offs_n[None, :] + mask = (offs_m[:, None] < M_Size) & (offs_n[None, :] < N_Size) + x_in = tl.load(x_ptr + offs, mask=mask).to(tl.float32) + s_in = tl.load(s_ptr + pid_m * n_size + pid_n) + y_out = x_in * s_in + tl.store(y_ptr + offs, y_out, mask=mask) + + +def _weight_dequant_torch( + x_in: torch.Tensor, s_in: torch.Tensor, block_size: int = 128 +) -> torch.Tensor: + """Pure-PyTorch fallback for weight_dequant (multi-GPU safe). + + Used when triton is unavailable, or when the triton kernel raises at + launch time (e.g. ``cuPointerGetAttribute`` failing on non-device-0 + GPUs during multi-device ``init_random_weights``). + """ + M, N = x_in.shape + y = x_in.float().reshape(M // block_size, block_size, N // block_size, block_size) + y = y * s_in[:, None, :, None] + return y.reshape(M, N).to(torch.get_default_dtype()) + + +def weight_dequant(x_in: torch.Tensor, s_in: torch.Tensor, block_size: int = 128) -> torch.Tensor: + """ + Dequantizes the given weight tensor using the provided scale tensor. + + Args: + x_in (torch.Tensor): The quantized weight tensor of shape (M, N). + s_in (torch.Tensor): The scale tensor of shape (M//block_size, + N//block_size). + block_size (int, optional): The block size to use for dequantization. + Defaults to 128. + + Returns: + torch.Tensor: The dequantized weight tensor of the same shape as `x`. + + Raises: + AssertionError: If `x` or `s` are not contiguous or if their dimensions + are not 2. 
+ """ + assert x_in.is_contiguous() and s_in.is_contiguous(), "Input tensors must be contiguous" + assert x_in.dim() == 2 and s_in.dim() == 2, "Input tensors must have 2 dimensions" + if not _HAS_TRITON: + return _weight_dequant_torch(x_in, s_in, block_size) + M_Size, N_Size = x_in.size() + grid = lambda meta: ( # noqa: E731 + triton.cdiv(M_Size, meta["BLOCK_SIZE"]), + triton.cdiv(N_Size, meta["BLOCK_SIZE"]), + ) + try: + y_out = torch.empty_like(x_in, dtype=torch.get_default_dtype()) + weight_dequant_kernel[grid](x_in, s_in, y_out, M_Size, N_Size, BLOCK_SIZE=block_size) + except (ValueError, RuntimeError): + return _weight_dequant_torch(x_in, s_in, block_size) + return y_out + + +if _HAS_TILELANG: + tilelang.set_log_level("WARNING") + + _pass_configs = { + tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, + tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, + } + + def _fast_log2_ceil(x): # type: ignore + bits_x = T.reinterpret("uint32", x) + exp_x = (bits_x >> 23) & 0xFF + man_bits = bits_x & ((1 << 23) - 1) + return T.Cast("int32", exp_x - 127 + T.if_then_else(man_bits != 0, 1, 0)) + + def _fast_pow2(x): # type: ignore + bits_x = (x + 127) << 23 + return T.reinterpret("float32", bits_x) + + def _fast_round_scale(amax, fp8_max_inv): # type: ignore + return _fast_pow2(_fast_log2_ceil(amax * fp8_max_inv)) + + @tilelang.jit(pass_configs=_pass_configs) + def act_quant_kernel( # type: ignore + N, in_dtype=BF16, out_dtype=FP8, scale_dtype=FP32, round_scale=False # type: ignore + ): # type: ignore + M = T.symbolic("M") + fp8_min = -448.0 + fp8_max = 448.0 + fp8_max_inv = 1 / fp8_max + num_stages = 0 if round_scale else 2 + blk_m = 32 + group_size = 128 + + @T.prim_func + def act_quant_kernel_( # type: ignore + X: T.Tensor[(M, N), in_dtype], + Y: T.Tensor[(M, N), out_dtype], + S: T.Tensor[(M, T.ceildiv(N, group_size)), scale_dtype], + ): # type: ignore + with T.Kernel(T.ceildiv(M, blk_m), T.ceildiv(N, group_size), threads=128) as ( + pid_m, + pid_n, + ): + 
x_shared = T.alloc_shared((blk_m, group_size), in_dtype) + x_local = T.alloc_fragment((blk_m, group_size), in_dtype) + amax_local = T.alloc_fragment((blk_m,), scale_dtype) + s_local = T.alloc_fragment((blk_m,), scale_dtype) + y_local = T.alloc_fragment((blk_m, group_size), out_dtype) + y_shared = T.alloc_shared((blk_m, group_size), out_dtype) + + for _ in T.Pipelined(1, num_stages=num_stages): + T.copy(X[pid_m * blk_m, pid_n * group_size], x_shared) + T.copy(x_shared, x_local) + T.reduce_absmax(x_local, amax_local, dim=1) + for i in T.Parallel(blk_m): + amax_local[i] = T.max(amax_local[i], 1e-4) + if round_scale: + s_local[i] = _fast_round_scale(amax_local[i], fp8_max_inv) + else: + s_local[i] = amax_local[i] * fp8_max_inv + for i, j in T.Parallel(blk_m, group_size): + y_local[i, j] = T.clamp(x_local[i, j] / s_local[i], fp8_min, fp8_max) + for i in T.Parallel(blk_m): + S[pid_m * blk_m + i, pid_n] = s_local[i] + T.copy(y_local, y_shared) + T.copy(y_shared, Y[pid_m * blk_m, pid_n * group_size]) + + return act_quant_kernel_ + + @tilelang.jit(pass_configs=_pass_configs) + def fp8_gemm_kernel(N, K, out_dtype=BF16, accum_dtype="float32"): # type: ignore + assert out_dtype in [BF16, "float32"] + + M = T.symbolic("M") + group_size = 128 + block_M = 32 + block_N = 128 + block_K = 128 + + @T.prim_func + def fp8_gemm_kernel_( # type: ignore + A: T.Tensor[(M, K), FP8], + B: T.Tensor[(N, K), FP8], + C: T.Tensor[(M, N), out_dtype], + scales_a: T.Tensor[(M, T.ceildiv(K, group_size)), FP32], + scales_b: T.Tensor[(T.ceildiv(N, group_size), T.ceildiv(K, group_size)), FP32], + ): # type: ignore + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as ( + bx, + by, + ): + A_shared = T.alloc_shared((block_M, block_K), FP8) + B_shared = T.alloc_shared((block_N, block_K), FP8) + C_shared = T.alloc_shared((block_M, block_N), out_dtype) + Scale_C_shared = T.alloc_shared((block_M), FP32) + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + C_local_accum = 
T.alloc_fragment((block_M, block_N), accum_dtype) + + T.use_swizzle(panel_size=10) + + T.clear(C_local) + T.clear(C_local_accum) + K_iters = T.ceildiv(K, block_K) + for k in T.Pipelined(K_iters, num_stages=4): + T.copy(A[by * block_M, k * block_K], A_shared) + T.copy(B[bx * block_N, k * block_K], B_shared) + Scale_B = scales_b[bx * block_N // group_size, k] + for i in T.Parallel(block_M): + Scale_C_shared[i] = scales_a[by * block_M + i, k] * Scale_B + + T.gemm(A_shared, B_shared, C_local, transpose_B=True) + for i, j in T.Parallel(block_M, block_N): + C_local_accum[i, j] += C_local[i, j] * Scale_C_shared[i] + T.clear(C_local) + T.copy(C_local_accum, C_shared) + T.copy(C_shared, C[by * block_M, bx * block_N]) + + return fp8_gemm_kernel_ + + +def act_quant( + x: torch.Tensor, block_size: int = 128, scale_fmt: str | None = None +) -> tuple[torch.Tensor, torch.Tensor]: + """ + Quantizes the input tensor `x` using block-wise quantization. + + Args: + x (torch.Tensor): The input tensor to be quantized. + Must be contiguous and its last dimension size must be divisible by `block_size`. + block_size (int, optional): The size of the blocks to be used for quantization. + Default is 128. + scale_fmt (Optional[str], optional): The format of the scale. Default is None. + Returns: + Tuple[torch.Tensor, torch.Tensor]: A tuple containing: + - The quantized tensor with dtype `torch.float8_e4m3fn`. + - A tensor of scaling factors with dtype `torch.float32`. 
def fp8_gemm(
    a: torch.Tensor, a_s: torch.Tensor, b: torch.Tensor, b_s: torch.Tensor
) -> torch.Tensor:
    """
    Multiply two FP8 matrices, applying their scaling factors.

    The last dimension of ``a`` is the inner dimension; all leading dimensions
    of ``a`` are flattened into rows for the kernel call and restored in the
    result. The output has ``b.size(0)`` columns.

    Args:
        a (torch.Tensor): The first input matrix, must be contiguous.
        a_s (torch.Tensor): The scaling factor for the first input matrix, must be contiguous.
        b (torch.Tensor): The second input matrix, must be contiguous.
        b_s (torch.Tensor): The scaling factor for the second input matrix, must be contiguous.

    Returns:
        torch.Tensor: The product, allocated with ``torch.get_default_dtype()``
        and shaped like ``a`` with its last dimension replaced by ``b.size(0)``.
    """
    _require_tilelang("fp8_gemm")
    assert a.is_contiguous() and b.is_contiguous(), "Input tensors must be contiguous"
    assert a_s.is_contiguous() and b_s.is_contiguous(), "Scaling factor tensors must be contiguous"

    inner = a.size(-1)
    rows = a.numel() // inner
    cols = b.size(0)

    # Allocate the result in the caller-visible shape, then hand 2-D views to the kernel.
    out_shape = (*a.size()[:-1], cols)
    out = a.new_empty(out_shape, dtype=torch.get_default_dtype())

    gemm = fp8_gemm_kernel(cols, inner)
    gemm(a.view(rows, inner), b, out.view(rows, cols), a_s.view(rows, -1), b_s)
    return out
-Mirrors the C++ ``DsaTempVars`` constants defined in -``include/lib/models/deepseek_v3_2/helper.hpp`` so that Python code can -reference temp_vars by name instead of magic numbers. +Lets Python code reference temp_vars by name instead of magic numbers. Usage:: @@ -15,7 +13,7 @@ class DsaTempVarIdx(IntEnum): - """Index constants for DSA temp_vars, mirroring C++ DsaTempVars.""" + """Index constants for DSA temp_vars.""" Q = 0 KV = 1 @@ -28,7 +26,7 @@ class DsaTempVarIdx(IntEnum): IDX_LOGITS = 8 IDX_SELECTS = 9 Q_NOPE = 10 - O = 11 # noqa: E741 — mirrors C++ DsaTempVars::O + O = 11 # noqa: E741 O_ACC = 12 O_LSE = 13 O_LSE_ACC = 14 @@ -68,35 +66,36 @@ class DsaTempVarIdx(IntEnum): SAMPLING_CONFIG = 48 TOP_P_SCORES = 49 TOP_P_DEBUG = 50 + LORA_SLOT_ID = 51 + LORA_RANK = 52 + TOP_N_LOG_PROBS = 53 + TOP_N_INDICES = 54 + LOGPROBS_FLAG = 55 -# Sentinel: total number of temp vars. Must equal C++ DsaTempVars::temp_vars_size. -TEMP_VARS_SIZE = 51 +TEMP_VARS_SIZE = 56 -# Short alias for convenient access Idx = DsaTempVarIdx def validate_temp_vars_layout() -> None: - """Validate that the Python enum matches the C++ DsaTempVars layout. + """Validate the temporary-variable index enum. Checks: 1. Enum member count equals TEMP_VARS_SIZE. 2. Indices are contiguous 0..TEMP_VARS_SIZE-1 with no gaps or duplicates. - 3. (If libtilert.so is loaded) C++ temp_vars_size matches Python TEMP_VARS_SIZE. + 3. (If the backend is loaded) the backend temp_vars_size matches TEMP_VARS_SIZE. Raises: RuntimeError: If any validation check fails. 
""" members = list(DsaTempVarIdx) - # Check member count if len(members) != TEMP_VARS_SIZE: raise RuntimeError( f"DsaTempVarIdx has {len(members)} members but TEMP_VARS_SIZE={TEMP_VARS_SIZE}" ) - # Check contiguous indices indices = sorted(m.value for m in members) expected = list(range(TEMP_VARS_SIZE)) if indices != expected: @@ -107,16 +106,13 @@ def validate_temp_vars_layout() -> None: f"Missing: {missing}, Duplicates: {set(dupes)}" ) - # Check against C++ if the library is loaded try: import torch cpp_size = torch.ops.tilert.dsa_temp_vars_size() if cpp_size != TEMP_VARS_SIZE: raise RuntimeError( - f"Python TEMP_VARS_SIZE={TEMP_VARS_SIZE} != " - f"C++ DsaTempVars::temp_vars_size={cpp_size}" + f"TEMP_VARS_SIZE={TEMP_VARS_SIZE} != " f"backend temp_vars_size={cpp_size}" ) except (AttributeError, RuntimeError): - # Library not loaded or op not available — skip C++ check pass diff --git a/python/models/glm_5/__init__.py b/tilert/models/glm_5/__init__.py similarity index 100% rename from python/models/glm_5/__init__.py rename to tilert/models/glm_5/__init__.py diff --git a/tilert/models/glm_5/_dsa_v32/__init__.py b/tilert/models/glm_5/_dsa_v32/__init__.py new file mode 100644 index 0000000..4b8633b --- /dev/null +++ b/tilert/models/glm_5/_dsa_v32/__init__.py @@ -0,0 +1 @@ +"""DeepSeek v3.2 model package.""" diff --git a/tilert/models/glm_5/_dsa_v32/generator.py b/tilert/models/glm_5/_dsa_v32/generator.py new file mode 100644 index 0000000..26ee685 --- /dev/null +++ b/tilert/models/glm_5/_dsa_v32/generator.py @@ -0,0 +1,531 @@ +"""DSA show hands for deepseek v3.2.""" + +import math +import time + +import torch +from transformers import AutoTokenizer + +from tilert import logger +from tilert.models.glm_5._dsa_v32.model_args import ModelArgs +from tilert.models.glm_5._dsa_v32.modules.end2end import ShowHandsDSALayer +from tilert.models.glm_5._dsa_v32.temp_var_indices import Idx +from tilert.tilert_init import tilert_init + +__all__ = [ + "DSAv32Generator", + 
def stats_time(time_list: list[float], title: str) -> None:
    """Log summary statistics for a list of per-step timings.

    Nothing is logged when ``time_list`` is empty. Otherwise the title is
    logged, followed by the mean step time, the population standard deviation
    (both in milliseconds) and the reciprocal of the mean as tokens per second.

    Args:
        time_list: Durations in seconds, one per step.
        title: Heading line logged before the statistics.
    """
    if not time_list:
        return

    count = len(time_list)
    mean = sum(time_list) / count
    variance = sum((sample - mean) ** 2 for sample in time_list) / count
    spread = math.sqrt(variance)

    logger.info(title)
    logger.info(f"--Average time taken to generate token: {mean * 1000:.4f} ms")
    logger.info(f"--Standard deviation of time: {spread * 1000:.4f} ms")
    logger.info(f"--Effective tokens per second: {1 / mean:.4f}")
    def from_pretrained(self) -> None:
        """Load model weights from ``self.model_weights_dir``.

        Takes no path argument: the directory stored on the generator at
        construction time is forwarded to ``self.decode_layer.from_pretrained``.
        """
        self.decode_layer.from_pretrained(self.model_weights_dir)
+ """ + from tilert.models.glm_5._dsa_v32.modules.end2end import ( + _extract_ffn_ops, + _get_moe_weight_keys, + ) + + cached_ffn_ops: dict[int, list] = {} + skip_keys: dict[int, set[str]] = {} + for device_id in range(self.decode_layer.num_devices): + dsa = self.decode_layer._dsa_objects[device_id] + if dsa is None: + raise RuntimeError(f"Device {device_id} Dsa not available for cache extraction") + cached_ffn_ops[device_id] = _extract_ffn_ops(dsa) + skip_keys[device_id] = _get_moe_weight_keys(dsa) + return cached_ffn_ops, skip_keys + + def from_pretrained_with_cache( + self, + cached_ffn_ops_per_device: dict[int, list], + skip_keys_per_device: dict[int, set[str]], + ) -> None: + """Load weights reusing cached MOE/MLP ops.""" + self.decode_layer.from_pretrained_with_cache( + self.model_weights_dir, cached_ffn_ops_per_device, skip_keys_per_device + ) + + def update_sampling_params( + self, + temperature: float = 1.0, + top_p: float = 0.95, + top_k: int = 256, + use_topp: bool = True, + ) -> None: + """Update sampling parameters for the next generation.""" + self.temperature = temperature + self.use_topp = use_topp + self.top_p = top_p + self.top_k = top_k + self.decode_layer.update_sampling_config( + temperature=temperature, top_p=top_p, top_k=top_k, use_topp=use_topp + ) + + @torch.inference_mode() + def generate( + self, + prompt: str, + print_log: bool = True, + with_mtp: bool | None = None, + prompt_tokens: list[int] | None = None, + ) -> tuple[str, list[float], list[int], int]: + """Main function to load the model and perform single sequence generation. + + Args: + prompt: The input prompt string. + print_log: Whether to print generation logs. + with_mtp: Override MTP mode for this call. None uses self.with_mtp. + Requires MTP weights to have been loaded (self.with_mtp=True). + prompt_tokens: Pre-tokenized prompt tokens. If provided, skip tokenization + and use these tokens directly (useful for exact-length benchmarking). 
+ + Returns: + Tuple of (result_text, time_list, accepted_counts, prompt_len). + accepted_counts is empty for non-MTP mode. + """ + active_mtp = with_mtp if with_mtp is not None else self.with_mtp + if active_mtp and not self.with_mtp: + raise ValueError("Cannot use MTP mode: MTP weights were not loaded") + self.decode_layer.set_sampling_seed(self.sampling_seed, with_mtp=active_mtp) + if active_mtp: + return self._generate_with_mtp(prompt, print_log, prompt_tokens=prompt_tokens) + result, time_list, prompt_len = self._generate_without_mtp( + prompt, print_log, with_mtp=active_mtp, prompt_tokens=prompt_tokens + ) + return result, time_list, [], prompt_len + + def _generate_without_mtp( + self, + prompt: str, + print_log: bool = True, + with_mtp: bool = False, + prompt_tokens: list[int] | None = None, + ) -> tuple[str, list[float], int]: + """Standard generation without MTP.""" + if prompt_tokens is None: + prompt_tokens = self.tokenizer.apply_chat_template( + [{"role": "user", "content": prompt}], add_generation_prompt=True + ) + + max_seq_len = self.config.max_seq_len + prompt_len = len(prompt_tokens) + total_len = min(max_seq_len, self.max_new_tokens + prompt_len) + + tokens = torch.full( + (self.batch_size, total_len), -1, dtype=torch.long, device=self.default_device + ) + tokens[0, :prompt_len] = torch.tensor( + prompt_tokens, dtype=torch.long, device=self.default_device + ) + prompt_mask = tokens != -1 + + prev_pos = 0 + finished = torch.tensor( + [False] * self.batch_size, dtype=torch.bool, device=self.default_device + ) + + time_list = [] + for cur_pos_val in range(1, total_len): + start_time = time.time() + multi_devices_results = self.decode_layer.forward( + tokens[0, prev_pos], with_mtp=with_mtp + ) + end_time = time.time() + time_list.append(end_time - start_time) + + intermediates, *_ = multi_devices_results[0] + next_token = intermediates[Idx.TOKEN_OUT][0][0] + + next_token = torch.where( + prompt_mask[0, cur_pos_val], tokens[0, cur_pos_val], next_token 
+ ) + tokens[0, cur_pos_val] = next_token + finished |= torch.logical_and(~prompt_mask[0, cur_pos_val], next_token == self.eos_id) + prev_pos = cur_pos_val + if cur_pos_val >= prompt_len: + decoded_tokens = self.tokenizer.decode( + [next_token.item()], skip_special_tokens=True + ) + if print_log: + print(decoded_tokens, end="", flush=True) + + if finished.all(): + break + + if print_log: + print("\n") + logger.info(f"--Number of tokens generated: {len(time_list)}") + + stats_time(time_list, "==== Performance ====") + print("\n") + + self.decode_layer.reset_sequence() + + completion_tokens = [] + for _, toks in enumerate(tokens.tolist()): + toks = toks[prompt_len : prompt_len + self.max_new_tokens] + if self.eos_id in toks: + toks = toks[: toks.index(self.eos_id)] + completion_tokens.append(toks) + + decoded_tokens = self.tokenizer.batch_decode(completion_tokens, skip_special_tokens=True) + + return f"{decoded_tokens[0]}\n" if decoded_tokens else "", time_list, prompt_len + + def _generate_with_mtp( + self, + prompt: str, + print_log: bool = True, + prompt_tokens: list[int] | None = None, + ) -> tuple[str, list[float], list[int], int]: + """Generation with MTP (Multi-Token Prediction) speculative decoding.""" + if prompt_tokens is None: + prompt_tokens = self.tokenizer.apply_chat_template( + [{"role": "user", "content": prompt}], add_generation_prompt=True + ) + + max_seq_len = self.config.max_seq_len + prompt_len = len(prompt_tokens) + total_len = min(max_seq_len, self.max_new_tokens + prompt_len) + + tokens = torch.full( + (self.batch_size, total_len), -1, dtype=torch.long, device=self.default_device + ) + tokens[0, :prompt_len] = torch.tensor( + prompt_tokens, dtype=torch.long, device=self.default_device + ) + + prefill_time_list = [] + decode_time_list = [] + decode_accepted_counts = [] + cur_pos = 0 + + while cur_pos < prompt_len - 1: + draft_end = min(cur_pos + self.mtp_seq_len, prompt_len) + draft_tokens = tokens[0, cur_pos:draft_end].clone() + 
actual_token_count = draft_tokens.shape[0] + + if actual_token_count < self.mtp_seq_len: + pad_token = draft_tokens[-1].item() + padding = torch.full( + (self.mtp_seq_len - actual_token_count,), + pad_token, + dtype=torch.long, + device=self.default_device, + ) + draft_tokens = torch.cat([draft_tokens, padding]) + + draft_tokens = draft_tokens.reshape(1, self.mtp_seq_len).to(torch.int32) + + mtp_extra_pos = cur_pos + self.mtp_seq_len + if mtp_extra_pos < prompt_len: + mtp_extra_token = int(tokens[0, mtp_extra_pos].item()) + else: + mtp_extra_token = int(tokens[0, draft_end - 1].item()) + self.decode_layer.set_prefill_mtp_extra_token(mtp_extra_token) + + self.decode_layer.set_prefill_valid_tokens(actual_token_count) + + start_time = time.time() + self.decode_layer.forward(draft_tokens, with_mtp=True) + end_time = time.time() + prefill_time_list.append(end_time - start_time) + + cur_pos += actual_token_count + + cur_pos = prompt_len - 1 + self.set_cur_pos(prompt_len - 1) + + self.decode_layer.set_prefill_valid_tokens(0) + + finished = False + while cur_pos < total_len - 1 and not finished: + if cur_pos == prompt_len - 1: + last_token = tokens[0, prompt_len - 1].item() + draft_tokens = torch.full( + (self.mtp_seq_len,), + last_token, + dtype=torch.long, + device=self.default_device, + ) + draft_tokens = draft_tokens.reshape(1, self.mtp_seq_len).to(torch.int32) + else: + draft_tokens = self.decode_layer.get_next_draft_tokens(0).reshape( + 1, self.mtp_seq_len + ) + + start_time = time.time() + self.decode_layer.forward(draft_tokens, with_mtp=True) + end_time = time.time() + decode_time_list.append(end_time - start_time) + + num_accepted = self.decode_layer.get_num_accepted(0) + predicted_tokens = self.decode_layer.get_predicted_tokens(0).flatten() + decode_accepted_counts.append(num_accepted) + + num_output_tokens = num_accepted + for i in range(num_output_tokens): + if cur_pos + 1 + i >= total_len: + break + new_token = int(predicted_tokens[i].item()) + tokens[0, 
cur_pos + 1 + i] = new_token + + if cur_pos + 1 + i >= prompt_len and print_log: + decoded_text = self.tokenizer.decode([new_token], skip_special_tokens=True) + print(decoded_text, end="", flush=True) + + if new_token == self.eos_id: + finished = True + break + + cur_pos += num_accepted + + if print_log: + print("\n") + total_tokens = sum(decode_accepted_counts) + logger.info(f"--Number of forward calls (decode): {len(decode_accepted_counts)}") + logger.info(f"--Total tokens generated: {total_tokens}") + if len(decode_accepted_counts) > 0: + avg_accepted = sum(decode_accepted_counts) / len(decode_accepted_counts) + min_accepted = min(decode_accepted_counts) + max_accepted = max(decode_accepted_counts) + logger.info( + f"--Accepted tokens per call: mean={avg_accepted:.2f}, " + f"min={min_accepted}, max={max_accepted}" + ) + + if decode_time_list: + total_decode_time = sum(decode_time_list) + effective_tps = total_tokens / total_decode_time if total_decode_time > 0 else 0 + avg_time_ms = total_decode_time / len(decode_time_list) * 1000 + logger.info(f"--Avg forward time: {avg_time_ms:.2f}ms") + logger.info(f"--Effective TPS (with MTP): {effective_tps:.2f} tokens/s") + + print("\n") + + self.decode_layer.reset_sequence() + + completion_tokens = [] + for _, toks in enumerate(tokens.tolist()): + toks = toks[prompt_len : prompt_len + self.max_new_tokens] + toks = [t for t in toks if t != -1] + if self.eos_id in toks: + toks = toks[: toks.index(self.eos_id)] + completion_tokens.append(toks) + + decoded_tokens = self.tokenizer.batch_decode(completion_tokens, skip_special_tokens=True) + + return ( + f"{decoded_tokens[0]}\n" if decoded_tokens else "", + decode_time_list, + decode_accepted_counts, + prompt_len, + ) + + def inject_cache( + self, + layer_caches: list[tuple[torch.Tensor, torch.Tensor, torch.Tensor]], + start_pos: int = 0, + end_pos: int | None = None, + ) -> None: + """Inject external cache data into TileRT. 
+ + This API allows injecting pre-computed KI/KV/PE cache data from an external + prefill system, enabling prefill-decode disaggregation. + + Args: + layer_caches: List of (ki, kv, pe) tuples for each layer (0 to NUM_LAYERS-1). + Each tensor should be BF16 with shape [seqlen, dim] where: + - ki: [seqlen, 128] - compressed key + - kv: [seqlen, 512] - compressed key-value + - pe: [seqlen, 64] - position encoding cache + start_pos: Start position in cache to write (0-indexed). Defaults to 0. + end_pos: End position in cache (exclusive). If None, uses seqlen from tensors. + + Example: + >>> # Load cache from external prefill system + >>> layer_caches = [] # List of 61 (ki, kv, pe) tuples + >>> for layer_id in range(61): + ... ki = load_ki_for_layer(layer_id) # [seqlen, 128] bf16 + ... kv = load_kv_for_layer(layer_id) # [seqlen, 512] bf16 + ... pe = load_pe_for_layer(layer_id) # [seqlen, 64] bf16 + ... layer_caches.append((ki, kv, pe)) + >>> generator.inject_cache(layer_caches, start_pos=0) + >>> generator.set_cur_pos(seqlen) # Set RoPE position + >>> # Continue generation from cache + """ + num_layers = len(layer_caches) + if num_layers == 0: + logger.warning("inject_cache called with empty layer_caches") + return + + first_ki, _, _ = layer_caches[0] + seqlen = first_ki.size(0) + if end_pos is None: + end_pos = start_pos + seqlen + + cache_len = end_pos - start_pos + logger.info(f"Injecting cache: {num_layers} layers, positions [{start_pos}, {end_pos})") + + num_devices = self.decode_layer.num_devices + + for device_id in range(num_devices): + _, caches, _, _ = self.decode_layer._get_device_result(device_id) + + for layer_id, (ki, kv, pe) in enumerate(layer_caches): + if layer_id >= num_layers: + logger.warning(f"Layer index {layer_id} is out of bounds, skipping.") + break + + base_idx = layer_id * 3 + + ki_src = ki[:cache_len].to(f"cuda:{device_id}") + kv_src = kv[:cache_len].to(f"cuda:{device_id}") + pe_src = pe[:cache_len].to(f"cuda:{device_id}") + + caches[base_idx 
+ 0][0, start_pos:end_pos, :].copy_(ki_src) + caches[base_idx + 1][0, start_pos:end_pos, :].copy_(kv_src) + caches[base_idx + 2][0, start_pos:end_pos, :].copy_(pe_src) + + logger.info(f"Cache injection completed for {num_devices} devices") + + def set_cur_pos(self, cur_pos: int) -> None: + """Set the current position for RoPE. + + This should be called after inject_cache() to ensure the runtime position + matches the injected cache length, for correct RoPE position encoding + during continued generation. + + Args: + cur_pos: The current sequence position (typically the length of prefilled tokens). + + Example: + >>> generator.inject_cache(layer_caches, start_pos=0) + >>> generator.set_cur_pos(prefill_len) # Set position to prefill length + >>> # Now generate continues from the correct position + """ + if self.with_mtp: + num_devices = self.decode_layer.num_devices + for device_id in range(num_devices): + intermediates, _, _, _ = self.decode_layer._get_device_result(device_id) + cur_pos_tensor = intermediates[Idx.CUR_POS] + cur_pos_tensor.fill_(cur_pos) + else: + torch.ops.tilert.dsa_show_hands_set_cur_pos(cur_pos) + + def inject_last_hidden_state(self, last_hidden_state: torch.Tensor) -> None: + """Inject the last hidden state for MTP mode. + + For MTP (Multi-Token Prediction), the MTP preprocess layer needs the + last hidden state from the main model's last token. + + Args: + last_hidden_state: [hidden_size] or [1, hidden_size] BF16 tensor. + The hidden state of the last token from prefill. 
@dataclass
class ModelArgs:
    """
    Data class for defining model arguments and hyperparameters.

    Note:
        ``arch_name`` carries no type annotation, so it is a plain class
        attribute rather than a dataclass field: it is not an ``__init__``
        parameter and does not appear in ``dataclasses.fields``.

    Attributes:
        arch_name (str): Architecture name (class attribute, see note above).
        max_batch_size (int): Maximum batch size.
        max_seq_len (int): Maximum sequence length.
        dtype (Literal["bf16", "fp8"]): Data type for computations.
        scale_fmt (Optional[str]): Format for quantization scale.
        vocab_size (int): Vocabulary size.
        dim (int): Model dimension.
        inter_dim (int): Intermediate dimension for MLP layers.
        moe_inter_dim (int): Intermediate dimension for MoE layers.
        n_layers (int): Number of transformer layers.
        n_dense_layers (int): Number of dense layers in the model.
        n_heads (int): Number of attention heads.
        n_routed_experts (int): Number of routed experts for MoE layers.
        n_shared_experts (int): Number of shared experts for MoE layers.
        n_activated_experts (int): Number of activated experts in MoE layers.
        n_expert_groups (int): Number of expert groups.
        n_limited_groups (int): Number of limited groups for MoE routing.
        score_func (Literal["softmax", "sigmoid", "sqrtsoftplus"]): Scoring function
            for MoE routing.
        route_scale (float): Scaling factor for routing scores.
        q_lora_rank (int): LoRA rank for query projections.
        kv_lora_rank (int): LoRA rank for key-value projections.
        qk_nope_head_dim (int): Dimension for query-key projections without positional embeddings.
        qk_rope_head_dim (int): Dimension for query-key projections with rotary embeddings.
        v_head_dim (int): Dimension for value projections.
        original_seq_len (Optional[int]): Original sequence length.
        rope_theta (float): Base for rotary positional encoding.
        rope_factor (Optional[float]): Scaling factor for extended sequence lengths.
        beta_fast (Optional[int]): Fast beta correction factor.
        beta_slow (Optional[int]): Slow beta correction factor.
        mscale (float): Scaling factor for extended attention.
        index_n_heads (int): Number of index heads.
        index_head_dim (int): Dimension for index head.
        index_topk (int): Top-k for index head.
        kv_cache_pad (int): Extra positions added on top of ``max_seq_len``
            (presumably padding for cache-sized buffers; confirm against the backend).
        block_size (int): Block size (presumably the quantization group width;
            confirm against the quantization ops).
        eps (float): Small epsilon constant (presumably for normalization layers).
    """

    # Un-annotated on purpose or by omission: stays a class attribute, not a field.
    arch_name = "deepseek_v3_2"

    max_batch_size: int = 1
    max_seq_len: int = 160 * 1024
    dtype: Literal["bf16", "fp8"] = "fp8"
    scale_fmt: str | None = None

    vocab_size: int = 129280
    dim: int = 7168
    inter_dim: int = 18432
    moe_inter_dim: int = 2048
    n_layers: int = 61
    n_dense_layers: int = 3
    n_heads: int = 128

    # Mixture-of-experts routing.
    n_routed_experts: int = 256
    n_shared_experts: int = 1
    n_activated_experts: int = 8
    n_expert_groups: int = 8
    n_limited_groups: int = 4
    score_func: Literal["softmax", "sigmoid", "sqrtsoftplus"] = "softmax"
    route_scale: float = 2.5

    # Attention projection sizes.
    q_lora_rank: int = 1536
    kv_lora_rank: int = 512
    qk_nope_head_dim: int = 128
    qk_rope_head_dim: int = 64
    v_head_dim: int = 128

    # Rotary position embedding settings.
    original_seq_len: int | None = 4096
    rope_theta: float = 10000.0
    rope_factor: float | None = 40
    beta_fast: int | None = 32
    beta_slow: int | None = 1
    mscale: float = 1.0

    # Index head settings.
    index_n_heads: int = 64
    index_head_dim: int = 128
    index_topk: int = 2048

    kv_cache_pad: int = 8

    block_size: int = 128

    eps: float = 1e-6
class Dsa(SerializableTileRTModule):
    """DSA module.

    Registers one block per layer (``MlpBlock`` for the first
    ``n_dense_layers`` layers, ``MoeBlock`` for the rest) followed by an
    ``RMSNormHeadProj`` op, and allocates the temporary tensors indexed by
    ``DsaTempVarIdx``. Device 0 uses a different MLA class and different
    scratch buffers than the other devices.
    """

    def __init__(
        self,
        model_args: ModelArgs,
        device_id: int,
        num_devices: int,
        cached_ffn_ops: list | None = None,
    ):
        """Build the per-device op list.

        Args:
            model_args: Model hyperparameters.
            device_id: Index of the CUDA device this instance lives on.
            num_devices: Total number of devices.
            cached_ffn_ops: Optional list with one pre-built FFN op per layer;
                entry ``i`` is passed to layer ``i``'s block as ``mlp``/``moe``.
                Must have exactly ``model_args.n_layers`` entries when given.
        """
        super().__init__(
            model_args=model_args,
            device_id=device_id,
            num_devices=num_devices,
            remove_selected=True,
        )
        # Imported here rather than at module level (presumably to avoid an
        # import cycle or eager loading; confirm before moving).
        from tilert.models.glm_5._dsa_v32.modules.mla_v2 import (
            PureMlaV2,
            SparseSelectMlaV2,
        )

        # Device 0 gets the sparse-select MLA variant; all others the pure one.
        mla_cls = SparseSelectMlaV2 if device_id == 0 else PureMlaV2
        mla_kwargs: dict = {}

        dev = f"cuda:{device_id}"
        n_peers = num_devices - 1
        if device_id == 0:
            # One int64 slot per peer device plus a bf16 buffer of shape
            # (max_batch_size, 4, dim); both are handed to the MLA op.
            self.v2_peer_bufs = torch.zeros(n_peers, dtype=torch.int64, device=dev)
            self.v2_partial_buf = torch.zeros(
                model_args.max_batch_size, 4, model_args.dim, dtype=torch.bfloat16, device=dev
            )
            mla_kwargs = {
                "peer_bufs": self.v2_peer_bufs,
                "partial_buf": self.v2_partial_buf,
            }
        else:
            # Local name only: this is num_mtp + 1 (default 3 + 1), not
            # model_args.max_seq_len.
            max_seq_len = getattr(model_args, "num_mtp", 3) + 1
            topk = model_args.index_topk
            self.v2_ll_buf = torch.zeros(max_seq_len * topk * 2, dtype=torch.int32, device=dev)
            mla_kwargs = {"ll_buf": self.v2_ll_buf}

        # Non-zero devices tell the MLA op about num_devices - 1 devices;
        # device 0 passes None.
        mla_num_devices: int | None = None
        if device_id != 0:
            mla_num_devices = num_devices - 1

        if cached_ffn_ops is not None:
            assert (
                len(cached_ffn_ops) == model_args.n_layers
            ), f"Expected {model_args.n_layers} cached FFN ops, got {len(cached_ffn_ops)}"

        for layer_idx in range(model_args.n_layers):
            ffn_op = cached_ffn_ops[layer_idx] if cached_ffn_ops else None
            if layer_idx < model_args.n_dense_layers:
                block = MlpBlock(
                    model_args=model_args,
                    device_id=device_id,
                    num_devices=num_devices,
                    mla_cls=mla_cls,
                    mla_num_devices=mla_num_devices,
                    mla_kwargs=mla_kwargs,
                    mlp=ffn_op,
                )
            else:
                block = MoeBlock(
                    model_args=model_args,
                    device_id=device_id,
                    num_devices=num_devices,
                    mla_cls=mla_cls,
                    mla_num_devices=mla_num_devices,
                    mla_kwargs=mla_kwargs,
                    moe=ffn_op,
                )
            self.register_op(block, prefix=f"layer_{layer_idx}_", suffix=f"_dev_{device_id}")

        # The head projection is registered under the index one past the last layer.
        self.register_op(
            RMSNormHeadProj(model_args=model_args, device_id=device_id, num_devices=num_devices),
            prefix=f"layer_{model_args.n_layers}_",
            suffix=f"_dev_{device_id}",
            retain_weights=True,
        )

        # Filled in by init_tilert_weights().
        self.embed_tokens_weight = None
        self.freqs_cis = None

    def init_tilert_weights(self, state_dicts: dict[str, torch.Tensor]) -> None:
        """Initialize op weights, then keep the embedding table and ``freqs_cis``.

        Args:
            state_dicts: Mapping that must contain ``"model.embed_tokens.weight"``
                and ``"freqs_cis"`` in addition to whatever the base class reads.
        """
        super().init_tilert_weights(state_dicts)
        self.embed_tokens_weight = state_dicts["model.embed_tokens.weight"]
        self.freqs_cis = state_dicts["freqs_cis"]

    def get_weights_list(self) -> list[torch.Tensor]:
        """Return the base-class weights followed by the embedding table and ``freqs_cis``.

        The last two entries are ``None`` until ``init_tilert_weights`` has run.
        """
        return [*super().get_weights_list(), self.embed_tokens_weight, self.freqs_cis]

    def get_temp_vars(
        self, batch_size: int, seq_len: int, extra_args: dict[str, Any] | None = None
    ) -> list[torch.Tensor]:
        """Allocate every temporary tensor, ordered by ``DsaTempVarIdx``.

        Args:
            batch_size: Leading dimension of the per-token buffers.
            seq_len: Second dimension of the per-token buffers.
            extra_args: Required despite the ``None`` default; must provide
                ``"temperature"``, ``"top_p"``, ``"top_k"`` and ``"use_topp"``.

        Returns:
            A list of ``TEMP_VARS_SIZE`` tensors on ``cuda:{self.device_id}``.

        Raises:
            RuntimeError: If any index in the list was left unassigned.
        """
        # dtype/device keyword bundles reused for every allocation below.
        bf16_desc = {"dtype": torch.bfloat16, "device": f"cuda:{self.device_id}"}
        fp32_desc = {"dtype": torch.float32, "device": f"cuda:{self.device_id}"}
        int32_desc = {"dtype": torch.int32, "device": f"cuda:{self.device_id}"}
        int64_desc = {"dtype": torch.int64, "device": f"cuda:{self.device_id}"}
        fp8_desc = {"dtype": torch.float8_e4m3fn, "device": f"cuda:{self.device_id}"}

        assert extra_args is not None
        temperature = extra_args["temperature"]
        top_p = extra_args["top_p"]
        top_k = extra_args["top_k"]
        use_topp = extra_args["use_topp"]

        dim = self.model_args.dim
        batch_seq = (batch_size, seq_len)
        q_lora_rank = self.model_args.q_lora_rank
        kv_lora_rank = self.model_args.kv_lora_rank
        qk_nope_head_dim = self.model_args.qk_nope_head_dim
        # Head partitioning differs by device: non-zero devices ask the weights
        # converter for a split over num_devices - 1 devices, device 0 uses an
        # even split over all devices.
        if self.device_id != 0:
            from tilert.models.glm_5._dsa_v32.ops.rmsnorm_projq_wqb import (
                RmsnormProjqWqbWeightsConverter,
            )

            qk_head_dim = self.model_args.qk_nope_head_dim + self.model_args.qk_rope_head_dim
            n_local_heads = RmsnormProjqWqbWeightsConverter._compute_n_local_heads(
                self.model_args.n_heads, self.num_devices - 1, qk_head_dim
            )
        else:
            n_local_heads = self.model_args.n_heads // self.num_devices
        qk_rope_head_dim = self.model_args.qk_rope_head_dim
        index_head_dim = self.model_args.index_head_dim
        v_head_dim = self.model_args.v_head_dim
        n_index_heads = self.model_args.index_n_heads
        max_seq_len = self.model_args.max_seq_len
        index_topk = self.model_args.index_topk
        n_routed_experts = self.model_args.n_routed_experts
        n_activated_experts = self.model_args.n_activated_experts
        n_total_experts = self.model_args.n_activated_experts + self.model_args.n_shared_experts
        # Per-device shares of the MoE intermediate width and the vocabulary.
        moe_inter_dim = self.model_args.moe_inter_dim // self.num_devices
        vocab_size = self.model_args.vocab_size // self.num_devices

        # Start with placeholders so the final loop can detect any missed index.
        temp_vars: list[torch.Tensor | None] = [None] * TEMP_VARS_SIZE

        temp_vars[Idx.Q] = torch.zeros(*batch_seq, q_lora_rank, **bf16_desc)
        temp_vars[Idx.KV] = torch.zeros(*batch_seq, kv_lora_rank, **bf16_desc)
        temp_vars[Idx.KI] = torch.zeros(*batch_seq, index_head_dim, **bf16_desc)
        temp_vars[Idx.Q_NOPE_DOWN] = torch.zeros(
            *batch_seq, n_local_heads, qk_nope_head_dim, **bf16_desc
        )
        temp_vars[Idx.Q_PE] = torch.zeros(*batch_seq, n_local_heads, qk_rope_head_dim, **bf16_desc)
        temp_vars[Idx.IQ] = torch.zeros(*batch_seq, n_index_heads, index_head_dim, **bf16_desc)
        temp_vars[Idx.IQ_RT] = torch.zeros(*batch_seq, n_index_heads, index_head_dim, **bf16_desc)
        temp_vars[Idx.IDX_SCORES] = torch.zeros(*batch_seq, n_index_heads, **bf16_desc)
        temp_vars[Idx.IDX_LOGITS] = torch.zeros(
            *batch_seq, max_seq_len + self.model_args.kv_cache_pad, **fp32_desc
        )
        temp_vars[Idx.IDX_SELECTS] = torch.zeros(*batch_seq, index_topk, **int32_desc)
        temp_vars[Idx.Q_NOPE] = torch.zeros(*batch_seq, n_local_heads, kv_lora_rank, **bf16_desc)
        temp_vars[Idx.O] = torch.zeros(*batch_seq, n_local_heads, kv_lora_rank, **bf16_desc)
        # NOTE(review): the constant 32 in O_ACC / O_LSE_ACC is not explained in
        # this file; presumably a partial-result count expected by the backend.
        temp_vars[Idx.O_ACC] = torch.zeros(*batch_seq, n_local_heads, 32, kv_lora_rank, **fp32_desc)
        # O_LSE and O_LSE_ACC use torch.empty, so their contents start uninitialized.
        temp_vars[Idx.O_LSE] = torch.empty(*batch_seq, n_local_heads, **fp32_desc)
        temp_vars[Idx.O_LSE_ACC] = torch.empty(*batch_seq, n_local_heads, 32, **fp32_desc)
        temp_vars[Idx.PROJ_O] = torch.zeros(*batch_seq, n_local_heads, v_head_dim, **bf16_desc)
        temp_vars[Idx.UNPROJ_O] = torch.zeros(*batch_seq, dim, **bf16_desc)
        temp_vars[Idx.SCORES] = torch.zeros(*batch_seq, n_routed_experts, **fp32_desc)
        temp_vars[Idx.X_MLP_IN] = torch.zeros(*batch_seq, dim, **bf16_desc)
        exp_up_gate = torch.zeros(*batch_seq, n_total_experts, moe_inter_dim, **bf16_desc)
        temp_vars[Idx.UP_GATE] = exp_up_gate
        temp_vars[Idx.SEL_PROBS] = torch.zeros(*batch_seq, n_activated_experts, **fp32_desc)
        temp_vars[Idx.SEL_INDICES] = torch.zeros(*batch_seq, n_activated_experts, **int32_desc)
        temp_vars[Idx.EXP_OUT] = torch.zeros(*batch_seq, dim, **bf16_desc)
        temp_vars[Idx.X_RMSNORM] = torch.zeros(*batch_seq, dim, **bf16_desc)
        temp_vars[Idx.LOGITS_OUT] = torch.zeros(*batch_seq, vocab_size, **fp32_desc)
        temp_vars[Idx.TOKEN_OUT] = torch.zeros(*batch_seq, 1, **int32_desc)

        temp_vars[Idx.EMBEDDING_RMSNORM] = torch.zeros(*batch_seq, dim, **bf16_desc)
        temp_vars[Idx.HIDDEN_RMSNORM] = torch.zeros(*batch_seq, dim, **bf16_desc)
        temp_vars[Idx.EH_PROJ] = torch.zeros(*batch_seq, dim, **bf16_desc)
        temp_vars[Idx.X_TENSOR] = torch.zeros(*batch_seq, dim, **bf16_desc)
        temp_vars[Idx.ROPE_FREQS] = torch.zeros(*batch_seq, qk_rope_head_dim, **fp32_desc)
        temp_vars[Idx.CUR_POS] = torch.zeros(batch_size, **int32_desc)
        temp_vars[Idx.TOKEN_ID] = torch.zeros(*batch_seq, 1, **int32_desc)
        temp_vars[Idx.LAST_HIDDEN_STATES] = torch.zeros(*batch_seq, dim, **bf16_desc)

        temp_vars[Idx.DRAFT_TOKENS] = torch.zeros(*batch_seq, **int32_desc)
        temp_vars[Idx.PREDICTED_TOKENS] = torch.zeros(*batch_seq, 1, **int32_desc)
        temp_vars[Idx.PREDICTED_HIDDEN] = torch.zeros(*batch_seq, dim, **bf16_desc)
        temp_vars[Idx.ACCEPTED_TOKENS] = torch.zeros(batch_size, **int32_desc)
        temp_vars[Idx.NEXT_DRAFT_TOKENS] = torch.zeros(*batch_seq, **int32_desc)

        temp_vars[Idx.X_QUANT] = torch.zeros(*batch_seq, dim, **fp8_desc)
        temp_vars[Idx.X_SCALE] = torch.zeros(
            *batch_seq, dim // self.model_args.block_size, **fp32_desc
        )
        # A separate tensor with the same shape and dtype as UP_GATE.
        temp_vars[Idx.MOE_UP_GATE] = torch.zeros_like(exp_up_gate)

        # NOTE(review): workspace size 200 * 1024 + 260 is a backend-defined
        # constant not derived from model_args here; confirm before changing.
        temp_vars[Idx.IDX_SEL_WS] = torch.zeros(*batch_seq, (200 * 1024 + 260), **int32_desc)

        temp_vars[Idx.MTP0_TOKEN_OUT] = torch.zeros(*batch_seq, 1, **int32_desc)
        temp_vars[Idx.MTP1_TOKEN_OUT] = torch.zeros(*batch_seq, 1, **int32_desc)
        temp_vars[Idx.MTP0_EXP_OUT] = torch.zeros(*batch_seq, dim, **bf16_desc)

        temp_vars[Idx.SAMPLING_SEED] = torch.zeros(*batch_seq, **int64_desc)
        temp_vars[Idx.SAMPLING_POSITIONS] = torch.zeros(*batch_seq, **int64_desc)
        # All four sampling settings are packed into one float32 vector, so
        # top_k and use_topp are stored as floats.
        temp_vars[Idx.SAMPLING_CONFIG] = torch.tensor(
            [temperature, top_p, top_k, use_topp], **fp32_desc
        )
        temp_vars[Idx.TOP_P_SCORES] = torch.zeros(*batch_seq, **fp32_desc)
        temp_vars[Idx.TOP_P_DEBUG] = torch.zeros(*batch_seq, vocab_size, **fp32_desc)

        temp_vars[Idx.LORA_SLOT_ID] = torch.zeros(1, **int32_desc)
        temp_vars[Idx.LORA_RANK] = torch.zeros(1, **int32_desc)

        # Width of the top-n log-prob buffers.
        max_top_n = 256
        temp_vars[Idx.TOP_N_LOG_PROBS] = torch.zeros(*batch_seq, max_top_n, **fp32_desc)
        temp_vars[Idx.TOP_N_INDICES] = torch.zeros(*batch_seq, max_top_n, **int32_desc)
        temp_vars[Idx.LOGPROBS_FLAG] = torch.zeros(1, **int32_desc)

        # Fail loudly if a new enum member was added without an allocation above.
        for i, t in enumerate(temp_vars):
            if t is None:
                raise RuntimeError(f"temp_vars[{i}] ({Idx(i).name}) was not initialized")

        return temp_vars  # type: ignore[return-value]
def _mark_weights_initialized(module: TileRTModule) -> None:
    """Flag ``module`` and everything reachable via ``exec_seq`` as weight-initialized.

    Sets ``is_tilert_weights_init = True`` on ``module`` and then recurses into
    each entry of ``module.exec_seq``. Objects without an ``exec_seq``
    attribute are treated as leaves.
    """
    module.is_tilert_weights_init = True
    # Leaves expose no ``exec_seq``; the empty default turns the loop into a no-op.
    for child in getattr(module, "exec_seq", ()):
        _mark_weights_initialized(child)
def dsa_show_hands_prepare_money(
    params: list[torch.Tensor],
    temp_vars: list[torch.Tensor],
    cache_vars: list[torch.Tensor],
    profile_logs: torch.Tensor,
    forward_max_seq_len: int,
    with_mtp: bool = False,
    is_glm5: bool = False,
) -> Any:
    """Call the ``torch.ops.tilert`` prepare-money op for the selected variant.

    The op name is ``dsa[_mtp_e2e]_show_hands_prepare_money[_glm5]``; the
    bracketed parts are included when ``with_mtp`` / ``is_glm5`` are truthy.

    Args:
        params: Weight tensors handed to the op.
        temp_vars: Intermediate tensors handed to the op.
        cache_vars: Cache tensors handed to the op.
        profile_logs: Profiling log tensor handed to the op.
        forward_max_seq_len: Passed as a fifth argument to the non-MTP op only;
            the MTP op is called with the first four arguments.
        with_mtp: Select the ``_mtp_e2e`` op.
        is_glm5: Select the ``_glm5`` op.

    Returns:
        Whatever the selected op returns.
    """
    variant = "_mtp_e2e" if with_mtp else ""
    arch = "_glm5" if is_glm5 else ""
    native_op = getattr(torch.ops.tilert, f"dsa{variant}_show_hands_prepare_money{arch}")

    shared_args = (params, temp_vars, cache_vars, profile_logs)
    if not variant:
        return native_op(*shared_args, forward_max_seq_len)
    return native_op(*shared_args)
def dsa_show_hands_set_sampling_seed(
    seed: int, with_mtp: bool = False, is_glm5: bool = False
) -> Any:
    """Forward a request-level sampling seed to the matching ``torch.ops.tilert`` op.

    The op name is ``dsa[_mtp_e2e]_show_hands_set_sampling_seed[_glm5]``; the
    bracketed parts are included when ``with_mtp`` / ``is_glm5`` are truthy.

    Args:
        seed: The sampling seed value.
        with_mtp: Select the ``_mtp_e2e`` op.
        is_glm5: Select the ``_glm5`` op.

    Returns:
        Whatever the selected op returns.
    """
    variant = "_mtp_e2e" if with_mtp else ""
    arch = "_glm5" if is_glm5 else ""
    native_op = getattr(torch.ops.tilert, f"dsa{variant}_show_hands_set_sampling_seed{arch}")
    return native_op(seed)
def load_device_weights(
    self,
    model_path: str,
    device_id: int,
    extra_keys: list,
    skip_keys: set[str] | None = None,
) -> dict[str, torch.Tensor]:
    """Load the safetensors weights needed by device ``device_id``.

    The checkpoint index (``model.safetensors.index.json``) is read to collect
    every key ending in ``dev_{device_id}``. ``extra_keys`` are appended and
    ``skip_keys`` (if any) are removed. Only shards that hold one of the
    remaining keys are opened.

    Args:
        model_path: Directory containing the index file and the weight shards.
        device_id: CUDA device index; tensors are placed on ``cuda:{device_id}``.
        extra_keys: Additional index keys to load regardless of their suffix.
        skip_keys: Keys to leave out. When non-empty, shards are read key by
            key so only wanted tensors are fetched; otherwise each shard is
            loaded whole with ``load_file``.

    Returns:
        Mapping from key to tensor, plus a ``"freqs_cis"`` entry built by
        ``self._gen_freqs_cis()``. In the whole-shard path the mapping is
        updated with everything ``load_file`` returns for each opened shard.

    Raises:
        KeyError: If a requested key is absent from the index ``weight_map``.
    """
    index_file = "model.safetensors.index.json"
    with open(os.path.join(model_path, index_file), encoding="utf-8") as f:
        weight_file_map = json.load(f)["weight_map"]

    device_suffix = f"dev_{device_id}"
    weights_list = [key for key in weight_file_map if key.endswith(device_suffix)]
    weights_list.extend(extra_keys)
    if skip_keys:
        weights_list = [key for key in weights_list if key not in skip_keys]

    # Report every missing key at once, with enough context to find the
    # offending checkpoint, instead of a bare KeyError on the first one.
    missing = [key for key in weights_list if key not in weight_file_map]
    if missing:
        raise KeyError(
            f"{len(missing)} weight key(s) not found in {index_file} "
            f"under {model_path}: {missing}"
        )

    weights_set = set(weights_list)
    # Sorted so shards are visited (and logged) in the same order on every run.
    target_files = sorted({weight_file_map[key] for key in weights_set})

    device = f"cuda:{device_id}"
    state_dicts: dict[str, torch.Tensor] = {}
    for weight_file in target_files:
        filepath = os.path.join(model_path, weight_file)
        if skip_keys:
            logger.info(
                f"Selectively loading weights from {weight_file} for device {device_id}"
            )
            with safe_open(filepath, framework="pt", device=device) as f:
                for key in f.keys():
                    if key in weights_set:
                        state_dicts[key] = f.get_tensor(key)
        else:
            logger.info(f"Loading weights from {weight_file} for device {device_id}")
            state_dict = load_file(filepath, device=device)
            state_dicts.update(state_dict)
            del state_dict
        # Hand freed blocks back to the CUDA allocator after each shard.
        torch.cuda.empty_cache()

    state_dicts["freqs_cis"] = self._gen_freqs_cis().to(device_id)
    return state_dicts
@staticmethod
def tot_size_in_bytes_aligned(temp_vars: list[torch.Tensor], aligned_size: int) -> int:
    """Return the summed byte size of ``temp_vars`` with per-tensor padding.

    Each entry's ``nbytes`` is rounded up to the next multiple of
    ``aligned_size`` before being added to the total.
    """
    return sum(
        (tensor.nbytes + aligned_size - 1) // aligned_size * aligned_size
        for tensor in temp_vars
    )
cached_ffn_ops_per_device: dict[int, list] | None = None, + skip_keys_per_device: dict[int, set[str]] | None = None, + ) -> None: + """Load the model weights from the given path or generate random weights. + + Args: + model_path: Path to the model weights directory. + cached_ffn_ops_per_device: Optional dict mapping device_id to cached FFN ops. + When provided, these ops are injected into the Dsa and their weights + are not re-loaded from disk. + skip_keys_per_device: Optional dict mapping device_id to safetensors keys + to skip during loading. Used together with cached_ffn_ops_per_device. + """ + self._v2_p2p: dict = {} + + def __load_weights(device_id: int, model_path: str | None) -> None: + intermediates: list[torch.Tensor] = [] + caches: list[torch.Tensor] = [] + params: list[torch.Tensor] = [] + state_dicts = {} + start_time = time.time() + with torch.cuda.device(device_id): + assert model_path is not None + skip_keys = ( + skip_keys_per_device.get(device_id) + if skip_keys_per_device is not None + else None + ) + state_dicts = self.load_device_weights( + model_path, + device_id, + [ + "model.embed_tokens.weight", + f"layer_{self.model_args.n_layers}_lm_head.weight_dev_{device_id}", + f"layer_{self.model_args.n_layers}_model.norm.weight_dev_{device_id}", + ], + skip_keys=skip_keys, + ) + + cached_ffn_ops = ( + cached_ffn_ops_per_device.get(device_id) + if cached_ffn_ops_per_device is not None + else None + ) + dsa = Dsa( + self.model_args, + device_id, + self.num_devices, + cached_ffn_ops=cached_ffn_ops, + ) + dsa.init_tilert_weights(state_dicts) + self._dsa_objects[device_id] = dsa + params.extend(dsa.get_weights_list()) + caches.extend(dsa.get_cache_vars()) + + if device_id == 0: + self._v2_p2p[device_id] = { + "peer_bufs": dsa.v2_peer_bufs, + } + else: + self._v2_p2p[device_id] = { + "ll_buf": dsa.v2_ll_buf, + } + intermediates.extend( + self.generate_params_with_continuous_storage( + dsa.get_temp_vars( + 1, + self.forward_max_seq_len, + { + "temperature": 
self.temperature, + "top_p": self.top_p, + "top_k": self.top_k, + "use_topp": self.use_topp, + }, + ), + device_id, + ) + ) + + sampling_config = intermediates[Idx.SAMPLING_CONFIG] + sampling_config.copy_( + torch.tensor( + [ + self.temperature, + self.top_p, + float(self.top_k), + 1.0 if self.use_topp else 0.0, + ], + dtype=torch.float32, + device=device_id, + ) + ) + + base_params_count = len(params) + base_caches_count = len(caches) + + if self.with_mtp: + from tilert.models.glm_5._dsa_v32.modules.mla_v2 import ( + PureMlaV2, + SparseSelectMlaV2, + ) + + mtp_kwargs: dict = {} + mtp_kwargs["mla_cls"] = SparseSelectMlaV2 if device_id == 0 else PureMlaV2 + mtp_kwargs["mla_num_devices"] = 1 if device_id == 0 else self.num_devices - 1 + if device_id == 0: + mtp_kwargs["mla_kwargs"] = { + "peer_bufs": dsa.v2_peer_bufs, + } + else: + mtp_kwargs["mla_kwargs"] = {"ll_buf": dsa.v2_ll_buf} + mtp = MTP(self.model_args, device_id, self.num_devices, **mtp_kwargs) + mtp.init_tilert_weights(state_dicts) + params.extend(mtp.get_weights_list()) + caches.extend(mtp.get_cache_vars()) + logger.info(f"Loaded real MTP weights for device {device_id}") + + profile_logs = get_profile_log_tensor(device=device_id, num_max_insts=65536) + result = (intermediates, caches, params, profile_logs) + self.multi_devices_results[device_id] = result + self._base_params_count = base_params_count + self._base_caches_count = base_caches_count + + del state_dicts + torch.cuda.empty_cache() + elapsed_time = time.time() - start_time + minutes = int(elapsed_time // 60) + seconds = int(elapsed_time % 60) + time_str = ( + f"{minutes} minutes {seconds} seconds" if minutes > 0 else f"{seconds} seconds" + ) + logger.info(f"Completed loading weights for device {device_id} in {time_str}") + + threads = [] + exceptions: list[Exception | None] = [None] * self.num_devices + for device_id in range(self.num_devices): + + def _runner(dev_id: int) -> None: + try: + __load_weights(dev_id, model_path) + except Exception as 
def from_pretrained(self, model_path: str) -> None:
    """Initialize all device weights from the checkpoint at ``model_path``.

    Raises:
        ValueError: If ``model_path`` does not exist on disk.
    """
    path_found = os.path.exists(model_path)
    if path_found:
        self._init_weights(model_path)
        return
    raise ValueError(f"Model weights directory {model_path} does not exist")
def init_random_weights(self) -> None:
    """Random-weight initialization entry point (not supported by this layer).

    The per-device loader inside ``_init_weights`` asserts that a model path is
    supplied. Delegating with ``None`` could therefore only fail, and only
    after one loader thread per device had been started. Fail fast instead,
    with an explicit message.

    Raises:
        NotImplementedError: Always. It is a ``RuntimeError`` subclass, the
            type the previous delegation surfaced, so existing handlers still
            match.
    """
    raise NotImplementedError(
        "Random weight initialization is not supported by ShowHandsDSALayer; "
        "load real weights with from_pretrained() instead."
    )
def get_num_accepted(self, device_id: int = 0) -> int:
    """Read the accepted-token count stored on a device.

    Args:
        device_id: Device whose intermediates are inspected.

    Returns:
        The first element of the ``Idx.ACCEPTED_TOKENS`` intermediate as a
        Python ``int``.
    """
    temp_vars, _caches, _params, _profile_logs = self._get_device_result(device_id)
    accepted = temp_vars[Idx.ACCEPTED_TOKENS]
    first_entry = accepted[0]
    return int(first_entry.item())
def get_top_n_logprobs(self, device_id: int = 0) -> tuple[torch.Tensor, torch.Tensor]:
    """Return the top-N log-probability and token-id buffers of a device.

    Args:
        device_id: Device whose intermediates are inspected.

    Returns:
        ``(log_probs, token_ids)``: the ``Idx.TOP_N_LOG_PROBS`` and
        ``Idx.TOP_N_INDICES`` intermediates. The tensors are returned as
        stored, not copied.
    """
    temp_vars, _caches, _params, _profile_logs = self._get_device_result(device_id)
    log_probs = temp_vars[Idx.TOP_N_LOG_PROBS]
    token_ids = temp_vars[Idx.TOP_N_INDICES]
    return (log_probs, token_ids)
+ """ + flag_val = 1 if enabled else 0 + for device_id in range(self.num_devices): + intermediates, _, _, _ = self._get_device_result(device_id) + intermediates[Idx.LOGPROBS_FLAG].fill_(flag_val) diff --git a/tilert/models/glm_5/_dsa_v32/modules/mla_v2.py b/tilert/models/glm_5/_dsa_v32/modules/mla_v2.py new file mode 100644 index 0000000..d9a9dd1 --- /dev/null +++ b/tilert/models/glm_5/_dsa_v32/modules/mla_v2.py @@ -0,0 +1,248 @@ +"""MLA weight generator classes for device-group-specific pipelines.""" + +import torch + +from tilert.models.base import SerializableTileRTModule +from tilert.models.glm_5._dsa_v32.model_args import ModelArgs +from tilert.models.glm_5._dsa_v32.ops.layernorm_rope_rotate import LayerNormRoPERotate +from tilert.models.glm_5._dsa_v32.ops.projo_wkvb import ProjoWKVb +from tilert.models.glm_5._dsa_v32.ops.projq_wqb import ProjqWqb +from tilert.models.glm_5._dsa_v32.ops.projx_wis import ProjxWis +from tilert.models.glm_5._dsa_v32.ops.rmsnorm_kv import KVRMSNorm +from tilert.models.glm_5._dsa_v32.ops.rmsnorm_projq_wqb import ( + RmsnormProjqWqb, + RmsnormProjqWqbAlgorithm, +) +from tilert.models.glm_5._dsa_v32.ops.rmsnorm_projq_wqi import ( + RmsnormProjqWqi, + RmsnormProjqWqiAlgorithm, +) +from tilert.models.glm_5._dsa_v32.ops.rmsnorm_projx_wqakis import ( + RMSNormProjxWqakis, +) +from tilert.models.glm_5._dsa_v32.ops.rmsnorm_projx_wqkva import ( + RMSNormProjxWqkva, + RMSNormProjxWqkvaAlgorithm, +) +from tilert.models.glm_5._dsa_v32.ops.unproj_o_allreduce import ( + UnProjOAllReduce, + UnProjOAllReduceAlgorithm, +) + + +class SparseSelectMlaV2(SerializableTileRTModule): + def __init__( + self, + model_args: ModelArgs, + device_id: int, + num_devices: int, + peer_bufs: torch.Tensor | None = None, + partial_buf: torch.Tensor | None = None, + ): + super().__init__(model_args=model_args, device_id=device_id, num_devices=num_devices) + + self.rmsnorm_projx_wqakis = RMSNormProjxWqakis( + model_args=model_args, device_id=device_id, 
def get_weights_list(self) -> list[torch.Tensor]:
    """Return the base-class weights followed by ``peer_bufs`` and ``partial_buf``.

    Either buffer is allocated here, zero-filled on ``cuda:{device_id}``, if
    it was not supplied to the constructor.
    """
    weights = super().get_weights_list()
    target = f"cuda:{self.device_id}"

    if self.peer_bufs is None:
        # One int64 slot per other device.
        peer_count = self.num_devices - 1
        self.peer_bufs = torch.zeros(peer_count, dtype=torch.int64, device=target)

    if self.partial_buf is None:
        # NOTE(review): the middle extent 4 looks like the forward max
        # sequence length used by the end-to-end layer; confirm before changing.
        partial_shape = (self.model_args.max_batch_size, 4, self.model_args.dim)
        self.partial_buf = torch.zeros(*partial_shape, dtype=torch.bfloat16, device=target)

    weights.extend((self.peer_bufs, self.partial_buf))
    return weights
def __init__(
    self,
    model_args: ModelArgs,
    device_id: int,
    num_devices: int,
    ll_buf: torch.Tensor | None = None,
):
    """Create and register the ops of this MLA pipeline, in execution order.

    Args:
        model_args: Model hyper-parameters forwarded to every op.
        device_id: Device index forwarded to every op.
        num_devices: Device count forwarded to every op.
        ll_buf: Optional pre-allocated buffer; stored as ``self.ll_buf``.
    """
    # Every op is built from the same three keyword arguments.
    common = {"model_args": model_args, "device_id": device_id, "num_devices": num_devices}
    super().__init__(**common)

    self.rmsnorm_projx_wqkva = RMSNormProjxWqkva(**common)
    self.rmsnorm_projx_wqkva.algorithm = RMSNormProjxWqkvaAlgorithm.DECOUPLED
    self.register_op(self.rmsnorm_projx_wqkva)

    self.rmsnorm_projq_wqb = RmsnormProjqWqb(**common)
    self.rmsnorm_projq_wqb.algorithm = RmsnormProjqWqbAlgorithm.FP16MMA
    self.register_op(self.rmsnorm_projq_wqb)

    self.rmsnorm_kv = KVRMSNorm(**common)
    self.register_op(self.rmsnorm_kv)

    self.projq_wqb = ProjqWqb(**common)
    self.register_op(self.projq_wqb)

    self.projo_wkvb = ProjoWKVb(**common)
    self.register_op(self.projo_wkvb)

    self.unproj_o_allreduce = UnProjOAllReduce(
        algorithm=UnProjOAllReduceAlgorithm.FP16MMA, **common
    )
    self.register_op(self.unproj_o_allreduce)

    self.ll_buf = ll_buf

    # Cache tensors start unset; nothing in this constructor allocates them.
    self.ki_cache: torch.Tensor | None = None
    self.kv_cache: torch.Tensor | None = None
    self.pe_cache: torch.Tensor | None = None
def get_weights_list(self) -> list[torch.Tensor]:
    """Return the base-class weights followed by ``ll_buf``.

    If no ``ll_buf`` was supplied to the constructor, one is allocated here:
    a zero-filled int32 vector on ``cuda:{device_id}`` holding
    ``(num_mtp + 1) * index_topk * 2`` elements. ``num_mtp`` falls back to 3
    when ``model_args`` has no such attribute.
    """
    weights = super().get_weights_list()

    if self.ll_buf is None:
        seq_slots = getattr(self.model_args, "num_mtp", 3) + 1
        selected_per_slot = self.model_args.index_topk
        element_count = seq_slots * selected_per_slot * 2
        self.ll_buf = torch.zeros(
            element_count, dtype=torch.int32, device=f"cuda:{self.device_id}"
        )

    weights.append(self.ll_buf)
    return weights
self.model_args.index_head_dim + self.ki_cache = torch.zeros( + *bs_args, ki_dim, dtype=torch.bfloat16, device=f"cuda:{self.device_id}" + ) + if self.kv_cache is None: + kv_dim = self.model_args.kv_lora_rank + self.kv_cache = torch.zeros( + *bs_args, kv_dim, dtype=torch.bfloat16, device=f"cuda:{self.device_id}" + ) + if self.pe_cache is None: + pe_dim = self.model_args.qk_rope_head_dim + self.pe_cache = torch.zeros( + *bs_args, pe_dim, dtype=torch.bfloat16, device=f"cuda:{self.device_id}" + ) + return [*super().get_cache_vars(), self.ki_cache, self.kv_cache, self.pe_cache] diff --git a/tilert/models/glm_5/_dsa_v32/modules/mlp.py b/tilert/models/glm_5/_dsa_v32/modules/mlp.py new file mode 100644 index 0000000..85fec25 --- /dev/null +++ b/tilert/models/glm_5/_dsa_v32/modules/mlp.py @@ -0,0 +1,74 @@ +from tilert.models.base import SerializableTileRTModule +from tilert.models.glm_5._dsa_v32.model_args import ModelArgs +from tilert.models.glm_5._dsa_v32.modules.mla_v2 import PureMlaV2 as Mla +from tilert.models.glm_5._dsa_v32.ops.down_allreduce import ( + DownAllReduce, +) +from tilert.models.glm_5._dsa_v32.ops.rmsnorm_up_gate_silu import ( + RMSNormUpGateSiLU, + RMSNormUpGateSiLUAlgorithm, +) + + +class Mlp(SerializableTileRTModule): + """Implement the MLP operations.""" + + def __init__( + self, + model_args: ModelArgs, + device_id: int, + num_devices: int, + ): + super().__init__(model_args=model_args, device_id=device_id, num_devices=num_devices) + + self.rmsnorm_mlp_up_gate_silu = RMSNormUpGateSiLU( + model_args=model_args, + device_id=device_id, + num_devices=num_devices, + ) + self.rmsnorm_mlp_up_gate_silu.algorithm = RMSNormUpGateSiLUAlgorithm.FP16MMA + self.register_op(self.rmsnorm_mlp_up_gate_silu) + + self.rmsnorm_mlp_down = DownAllReduce( + model_args=model_args, device_id=device_id, num_devices=num_devices + ) + self.register_op(self.rmsnorm_mlp_down) + + +class MlpBlock(SerializableTileRTModule): + """Implement the MOE block operations.""" + + def 
__init__( + self, + model_args: ModelArgs, + device_id: int, + num_devices: int, + remove_selected: bool = False, + mla_cls: type | None = None, + mla_num_devices: int | None = None, + mla_kwargs: dict | None = None, + mlp: "Mlp | None" = None, + ): + super().__init__( + model_args=model_args, + device_id=device_id, + num_devices=num_devices, + remove_selected=remove_selected, + ) + + mla_class = mla_cls or Mla + mla_nd = mla_num_devices if mla_num_devices is not None else num_devices + self.mla = mla_class( + model_args=model_args, device_id=device_id, num_devices=mla_nd, **(mla_kwargs or {}) + ) + self.register_op(self.mla) + self.mlp = ( + mlp + if mlp is not None + else Mlp( + model_args=model_args, + device_id=device_id, + num_devices=num_devices, + ) + ) + self.register_op(self.mlp) diff --git a/tilert/models/glm_5/_dsa_v32/modules/moe.py b/tilert/models/glm_5/_dsa_v32/modules/moe.py new file mode 100644 index 0000000..5410284 --- /dev/null +++ b/tilert/models/glm_5/_dsa_v32/modules/moe.py @@ -0,0 +1,80 @@ +import torch + +from tilert.models.base import SerializableTileRTModule +from tilert.models.glm_5._dsa_v32.model_args import ModelArgs +from tilert.models.glm_5._dsa_v32.modules.mla_v2 import PureMlaV2 as Mla +from tilert.models.glm_5._dsa_v32.ops.expert_down_allreduce import ( + ExpertDownAllReduce, +) +from tilert.models.glm_5._dsa_v32.ops.expert_sel_up_gate_silu import ( + ExpertSelectUpGateSiLU, + ExpertSelectUpGateSiLUAlgorithm, +) +from tilert.models.glm_5._dsa_v32.ops.rmsnorm_expert_proj import ( + RMSNormExpertProj, +) + + +class Moe(SerializableTileRTModule): + """Implement the MOE operations.""" + + rmsnorm_expert_proj: RMSNormExpertProj + + def __init__(self, model_args: ModelArgs, device_id: int, num_devices: int): + super().__init__(model_args=model_args, device_id=device_id, num_devices=num_devices) + + self.rmsnorm_expert_proj = RMSNormExpertProj( + model_args=model_args, device_id=device_id, num_devices=num_devices + ) + 
self.register_op(self.rmsnorm_expert_proj) + + self.exp_sel_up_gate_silu = ExpertSelectUpGateSiLU( + model_args=model_args, + device_id=device_id, + num_devices=num_devices, + algorithm=ExpertSelectUpGateSiLUAlgorithm.FP16MMA, + ) + self.register_op(self.exp_sel_up_gate_silu) + + self.expert_down_allreduce = ExpertDownAllReduce( + model_args=model_args, device_id=device_id, num_devices=num_devices + ) + self.register_op(self.expert_down_allreduce) + + def get_weights_list(self) -> list[torch.Tensor]: + return super().get_weights_list() + + +class MoeBlock(SerializableTileRTModule): + """Implement the MOE block operations.""" + + def __init__( + self, + model_args: ModelArgs, + device_id: int, + num_devices: int, + remove_selected: bool = False, + mla_cls: type | None = None, + mla_num_devices: int | None = None, + mla_kwargs: dict | None = None, + moe: "Moe | None" = None, + ): + super().__init__( + model_args=model_args, + device_id=device_id, + num_devices=num_devices, + remove_selected=remove_selected, + ) + + mla_class = mla_cls or Mla + mla_nd = mla_num_devices if mla_num_devices is not None else num_devices + self.mla = mla_class( + model_args=model_args, device_id=device_id, num_devices=mla_nd, **(mla_kwargs or {}) + ) + self.register_op(self.mla) + self.moe = ( + moe + if moe is not None + else Moe(model_args=model_args, device_id=device_id, num_devices=num_devices) + ) + self.register_op(self.moe) diff --git a/tilert/models/glm_5/_dsa_v32/modules/mtp.py b/tilert/models/glm_5/_dsa_v32/modules/mtp.py new file mode 100644 index 0000000..ccfbdc8 --- /dev/null +++ b/tilert/models/glm_5/_dsa_v32/modules/mtp.py @@ -0,0 +1,62 @@ +import torch + +from tilert.models.base import SerializableTileRTModule +from tilert.models.glm_5._dsa_v32.model_args import ModelArgs +from tilert.models.glm_5._dsa_v32.modules.moe import MoeBlock +from tilert.models.glm_5._dsa_v32.modules.mtp_preprocess import MTPPreprocessLayer +from tilert.models.glm_5._dsa_v32.ops import 
RMSNormHeadProj + + +class MTP(SerializableTileRTModule): + """MTP module.""" + + def __init__( + self, + model_args: ModelArgs, + device_id: int, + num_devices: int, + mla_cls: type | None = None, + mla_num_devices: int | None = None, + mla_kwargs: dict | None = None, + ): + super().__init__(model_args=model_args, device_id=device_id, num_devices=num_devices) + + self.embed_tokens_weight = None + self.freqs_cis = None + + mtp_layer_id = self.model_args.n_layers + self.register_op( + MTPPreprocessLayer(self.model_args, self.num_devices, device_id), + prefix=f"layer_{mtp_layer_id}_", + suffix=f"_dev_{device_id}", + ) + self.register_op( + MoeBlock( + model_args=model_args, + device_id=device_id, + num_devices=num_devices, + mla_cls=mla_cls, + mla_num_devices=mla_num_devices, + mla_kwargs=mla_kwargs, + ), + prefix=f"layer_{mtp_layer_id}_", + suffix=f"_dev_{device_id}", + ) + self.register_op( + RMSNormHeadProj(model_args=model_args, device_id=device_id, num_devices=num_devices), + prefix=f"layer_{mtp_layer_id}_", + suffix=f"_dev_{device_id}", + retain_weights=True, + ) + + def init_tilert_weights(self, state_dicts: dict[str, torch.Tensor]) -> None: + self.embed_tokens_weight = state_dicts["model.embed_tokens.weight"] + self.freqs_cis = state_dicts["freqs_cis"] + super().init_tilert_weights(state_dicts) + + def get_weights_list(self) -> list[torch.Tensor]: + return [ + self.embed_tokens_weight, + self.freqs_cis, + *super().get_weights_list(), + ] diff --git a/tilert/models/glm_5/_dsa_v32/modules/mtp_preprocess.py b/tilert/models/glm_5/_dsa_v32/modules/mtp_preprocess.py new file mode 100644 index 0000000..debd75d --- /dev/null +++ b/tilert/models/glm_5/_dsa_v32/modules/mtp_preprocess.py @@ -0,0 +1,238 @@ +"""MTP preprocess layer for DeepSeek v3.""" + +from dataclasses import dataclass + +import torch + +from tilert.models.base import TileRTModule, TilertWeightsConverter +from tilert.models.common import init_func, linear +from tilert.models.glm_5._dsa_v32.model_args 
import ModelArgs + +__all__ = [ + "mtp_preprocess_layer", + "MTPPreprocessLayer", + "MTPPreprocessRefWeightsAlias", + "MTPPreprocessTilertWeightsAlias", + "MTPPreprocessWeightsConverter", +] + + +def mtp_preprocess_layer( + params: list[torch.Tensor], + temp_vars: list[torch.Tensor], + profile_logs: torch.Tensor, +) -> torch.Tensor: + """MTP preprocess layer op for DeepSeek v3.""" + return torch.ops.tilert.mtp_preprocess_layer(params, temp_vars, profile_logs) + + +@dataclass +class MTPPreprocessRefWeightsAlias: + """Reference (golden/PyTorch) weight keys for MTP preprocess.""" + + embedding_rmsnorm = "enorm.weight" + hidden_rmsnorm = "hnorm.weight" + eh_proj = "eh_proj.weight" + + @property + def ref_tensor_alias(self) -> list[str]: + return [ + self.embedding_rmsnorm, + self.hidden_rmsnorm, + self.eh_proj, + ] + + def __call__(self) -> list[str]: + return self.ref_tensor_alias + + +@dataclass +class MTPPreprocessTilertWeightsAlias: + """TileRT weight keys for MTP preprocess.""" + + embedding_rmsnorm_gamma = "embedding_rmsnorm_gamma" + hidden_rmsnorm_gamma = "hidden_rmsnorm_gamma" + eh_proj_weights = "eh_proj_weights" + + @property + def tilert_tensor_alias(self) -> list[str]: + return [ + self.embedding_rmsnorm_gamma, + self.hidden_rmsnorm_gamma, + self.eh_proj_weights, + ] + + def __call__(self) -> list[str]: + return self.tilert_tensor_alias + + +class MTPPreprocessWeightsConverter(TilertWeightsConverter): + """Converts ref-format weights to TileRT format for MTP preprocess.""" + + def convert_to_tilert(self, weights: list[torch.Tensor], device_id: int) -> list[torch.Tensor]: + """ + Convert ref weights to TileRT format for a specific device. + + Args: + weights: [embedding_rmsnorm_gamma, hidden_rmsnorm_gamma, eh_proj.weight] + Ref format: enorm.weight [7168], hnorm.weight [7168], + eh_proj.weight [7168, 14336]. + device_id: Target device ID for weight placement. + + Returns: + MTPPreprocessParams with converted weights for device_id. 
+ """ + device = torch.device(f"cuda:{device_id}") + embedding_rmsnorm_gamma, hidden_rmsnorm_gamma, eh_proj_weight = weights + + embedding_rmsnorm_gamma = embedding_rmsnorm_gamma.to(device=device, dtype=torch.float32) + hidden_rmsnorm_gamma = hidden_rmsnorm_gamma.to(device=device, dtype=torch.float32) + eh_proj_weights = ( + eh_proj_weight.reshape( + 128, self.model_args.dim // 128, self.model_args.dim * 2 // 256 // 8, 256 + ) + .transpose(1, 2) + .contiguous() + .to(device=device, dtype=torch.bfloat16) + ) + return [embedding_rmsnorm_gamma, hidden_rmsnorm_gamma, eh_proj_weights] + + +class MTPPreprocessLayer(TileRTModule): + """MTP preprocess layer: RMSNorm(embedding), RMSNorm(hidden), concat & project.""" + + def __init__( + self, + model_args: ModelArgs, + num_devices: int, + device_id: int, + ref_weights_alias: MTPPreprocessRefWeightsAlias | None = None, + ) -> None: + super().__init__( + self.__class__.__name__, + model_args=model_args, + num_devices=num_devices, + device_id=device_id, + ) + self.tilert_weights_alias = MTPPreprocessTilertWeightsAlias() + self.ref_weights_alias = ( + ref_weights_alias if ref_weights_alias is not None else MTPPreprocessRefWeightsAlias() + ) + self.hidden_size = model_args.dim + + self.tilert_embedding_rmsnorm_gamma: torch.Tensor | None = None + self.tilert_hidden_rmsnorm_gamma: torch.Tensor | None = None + self.tilert_eh_proj_weights: torch.Tensor | None = None + + self.ref_embedding_rmsnorm_gamma: torch.Tensor | None = None + self.ref_hidden_rmsnorm_gamma: torch.Tensor | None = None + self.ref_eh_proj_weight: torch.Tensor | None = None + + def get_weights_list(self) -> list[torch.Tensor]: + return [ + self.tilert_embedding_rmsnorm_gamma, + self.tilert_hidden_rmsnorm_gamma, + self.tilert_eh_proj_weights, + ] + + def device_sharding(self, weights_map: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]: + """Repeat ref weights for each device (for init_tilert_weights from ref).""" + embedding_gamma = 
weights_map[self.ref_weights_alias.embedding_rmsnorm] + hidden_gamma = weights_map[self.ref_weights_alias.hidden_rmsnorm] + eh_proj_weights = weights_map[self.ref_weights_alias.eh_proj] + return { + self.tilert_weights_alias.embedding_rmsnorm_gamma: ( + embedding_gamma[None, ...].repeat(self.num_devices, 1) + ), + self.tilert_weights_alias.hidden_rmsnorm_gamma: ( + hidden_gamma[None, ...].repeat(self.num_devices, 1) + ), + self.tilert_weights_alias.eh_proj_weights: ( + eh_proj_weights[None, ...] + .reshape( + self.model_args.dim, + self.num_devices, + self.model_args.dim * 2 // self.num_devices, + ) + .transpose(0, 1) + ), + } + + def init_reference_weights(self, state_dict: dict[str, torch.Tensor]) -> None: + """Load ref-format weights (enorm.weight, hnorm.weight, eh_proj.weight).""" + self.ref_embedding_rmsnorm_gamma = state_dict[self.ref_weights_alias.embedding_rmsnorm] + self.ref_hidden_rmsnorm_gamma = state_dict[self.ref_weights_alias.hidden_rmsnorm] + self.ref_eh_proj_weight = state_dict[self.ref_weights_alias.eh_proj] + + def init_tilert_weights(self, state_dict: dict[str, torch.Tensor]) -> None: + """ + Load TileRT weights from state_dict. 
+ + state_dict may use: + - Full keys: layer_{layer_id}_{alias}_dev_{device_id} + - Short keys: embedding_rmsnorm_gamma, hidden_rmsnorm_gamma, eh_proj_weights + - Ref keys: enorm.weight, hnorm.weight, eh_proj.weight (then convert) + """ + converter = MTPPreprocessWeightsConverter(self.model_args, self.num_devices) + params = converter.convert_to_tilert( + [state_dict[k] for k in self.tilert_weights_alias()], self.device_id + ) + self.tilert_embedding_rmsnorm_gamma = params[0] + self.tilert_hidden_rmsnorm_gamma = params[1] + self.tilert_eh_proj_weights = params[2] + + def init_random_weights(self) -> dict[str, torch.Tensor]: + """Initialize random ref weights and convert to TileRT for this device.""" + embedding_gamma = init_func(torch.randn(self.hidden_size, dtype=torch.float32)) + hidden_gamma = init_func(torch.randn(self.hidden_size, dtype=torch.float32)) + eh_proj_weights = init_func( + torch.randn(self.hidden_size, self.hidden_size * 2, dtype=torch.bfloat16) + ) + return { + self.ref_weights_alias.embedding_rmsnorm: embedding_gamma, + self.ref_weights_alias.hidden_rmsnorm: hidden_gamma, + self.ref_weights_alias.eh_proj: eh_proj_weights, + } + + def golden_forward( + self, + x: torch.Tensor, + last_hidden_states: torch.Tensor, + ) -> torch.Tensor: + """ + Reference forward: enorm(x), hnorm(last_hidden), concat & eh_proj. 
+ + Args: + x: [batch, seq_len, hidden_size] embedded tokens + last_hidden_states: [batch, seq_len, hidden_size] previous hidden + + Returns: + [batch, seq_len, hidden_size] projected hidden + """ + assert self.ref_embedding_rmsnorm_gamma is not None + assert self.ref_hidden_rmsnorm_gamma is not None + assert self.ref_eh_proj_weight is not None + + future_norm = torch.nn.functional.rms_norm( + x.float(), + [x.size(-1)], + self.ref_embedding_rmsnorm_gamma, + 1e-6, + ) + prev_norm = torch.nn.functional.rms_norm( + last_hidden_states.float(), + [last_hidden_states.size(-1)], + self.ref_hidden_rmsnorm_gamma, + 1e-6, + ) + combined = torch.cat([future_norm, prev_norm], dim=-1) + return linear(combined, self.ref_eh_proj_weight) + + def tilert_forward( + self, + params: list[torch.Tensor], + temp_vars: list[torch.Tensor], + profile_logs: torch.Tensor, + ) -> torch.Tensor: + """Run TileRT mtp_preprocess_layer op.""" + return mtp_preprocess_layer(params, temp_vars, profile_logs) diff --git a/tilert/models/glm_5/_dsa_v32/ops/__init__.py b/tilert/models/glm_5/_dsa_v32/ops/__init__.py new file mode 100644 index 0000000..a58dab8 --- /dev/null +++ b/tilert/models/glm_5/_dsa_v32/ops/__init__.py @@ -0,0 +1,160 @@ +"""Core operations for deepseek v3.2.""" + +from tilert.models.glm_5._dsa_v32.ops.broadcast_selected_token_ids import ( + broadcast_selected_token_ids, +) +from tilert.models.glm_5._dsa_v32.ops.down_allreduce import ( + DownAllReduce, + DownAllReduceAlgorithm, + down_allreduce, +) +from tilert.models.glm_5._dsa_v32.ops.eh_proj_allreduce import ( + EHProjAllReduce, + EHProjAllReduceAlgorithm, + eh_proj_allreduce, +) +from tilert.models.glm_5._dsa_v32.ops.expert_down_allreduce import ( + ExpertDownAllReduce, + ExpertDownAllReduceAlgorithm, + expert_down_allreduce, +) +from tilert.models.glm_5._dsa_v32.ops.expert_sel_up_gate_silu import ( + ExpertSelectUpGateSiLU, + ExpertSelectUpGateSiLUAlgorithm, +) +from tilert.models.glm_5._dsa_v32.ops.flash_sparse_mla import ( + 
FlashSparseMLACombineAlgorithm, + flash_sparse_mla, +) +from tilert.models.glm_5._dsa_v32.ops.layernorm_rope_rotate import ( + LayerNormRoPERotateAlgorithm, + layernorm_rope_rotate, +) +from tilert.models.glm_5._dsa_v32.ops.padded_allreduce_add import ( + PaddedAllReduceAdd, + PaddedAllReduceAddAlgorithm, + padded_allreduce_add, +) +from tilert.models.glm_5._dsa_v32.ops.projo_wkvb import ProjoWKVbAlgorithm, projo_wkvb +from tilert.models.glm_5._dsa_v32.ops.projq_wqb import ProjqWqbAlgorithm, projq_wqb +from tilert.models.glm_5._dsa_v32.ops.projx_wis import ProjxWisAlgorithm, projx_wis +from tilert.models.glm_5._dsa_v32.ops.qkv_rope import ( + QKVRoPE, + QKVRoPEAlgorithm, + QKVRoPERefWeightsAlias, + QKVRoPETilertWeightsAlias, + qkv_rope, +) +from tilert.models.glm_5._dsa_v32.ops.receive_selected_token_ids import ( + receive_selected_token_ids, +) +from tilert.models.glm_5._dsa_v32.ops.rmsnorm_expert_proj import ( + RMSNormExpertProj, + RMSNormExpertProjAlgorithm, +) +from tilert.models.glm_5._dsa_v32.ops.rmsnorm_head_proj import ( + RMSNormHeadProj, + RMSNormHeadProjAlgorithm, +) +from tilert.models.glm_5._dsa_v32.ops.rmsnorm_kv import KVRMSNormAlgorithm, rmsnorm_kv +from tilert.models.glm_5._dsa_v32.ops.rmsnorm_projq_wqb import ( + RmsnormProjqWqb, + RmsnormProjqWqbAlgorithm, + RmsnormProjqWqbWeightsConverter, +) +from tilert.models.glm_5._dsa_v32.ops.rmsnorm_projq_wqi import ( + RmsnormProjqWqi, + RmsnormProjqWqiAlgorithm, + RmsnormProjqWqiWeightsConverter, +) +from tilert.models.glm_5._dsa_v32.ops.rmsnorm_projx_wqakis import ( + RMSNormProjxWqakis, + RMSNormProjxWqakisAlgorithm, +) +from tilert.models.glm_5._dsa_v32.ops.rmsnorm_projx_wqkva import ( + RMSNormProjxWqkva, + RMSNormProjxWqkvaAlgorithm, +) +from tilert.models.glm_5._dsa_v32.ops.rmsnorm_quant import rmsnorm_quant +from tilert.models.glm_5._dsa_v32.ops.rmsnorm_up_gate_silu import ( + RMSNormUpGateSiLU, + RMSNormUpGateSiLUAlgorithm, +) +from tilert.models.glm_5._dsa_v32.ops.rotate import ( + Rotate, + 
RotateAlgorithm, + RotateRefWeightsAlias, + RotateTilertWeightsAlias, + rotate, + rotate_activation, +) +from tilert.models.glm_5._dsa_v32.ops.sparse_index import sparse_index, sparse_index_topk +from tilert.models.glm_5._dsa_v32.ops.topk import TopK, topk_accurate, topk_approximate +from tilert.models.glm_5._dsa_v32.ops.unproj_o_allreduce import ( + UnProjOAllReduce, + UnProjOAllReduceAlgorithm, + unproj_o_allreduce, +) + +__all__ = [ + "down_allreduce", + "DownAllReduce", + "DownAllReduceAlgorithm", + "expert_down_allreduce", + "ExpertDownAllReduce", + "ExpertDownAllReduceAlgorithm", + "rmsnorm_kv", + "KVRMSNormAlgorithm", + "unproj_o_allreduce", + "projo_wkvb", + "ProjoWKVbAlgorithm", + "projq_wqb", + "ProjqWqbAlgorithm", + "rotate", + "rotate_activation", + "Rotate", + "RotateAlgorithm", + "RotateRefWeightsAlias", + "RotateTilertWeightsAlias", + "layernorm_rope_rotate", + "LayerNormRoPERotateAlgorithm", + "TopK", + "topk_approximate", + "topk_accurate", + "sparse_index", + "sparse_index_topk", + "flash_sparse_mla", + "FlashSparseMLACombineAlgorithm", + "projx_wis", + "ProjxWisAlgorithm", + "qkv_rope", + "QKVRoPE", + "QKVRoPEAlgorithm", + "QKVRoPERefWeightsAlias", + "QKVRoPETilertWeightsAlias", + "eh_proj_allreduce", + "EHProjAllReduceAlgorithm", + "rmsnorm_quant", + "RmsnormProjqWqi", + "RmsnormProjqWqiAlgorithm", + "RmsnormProjqWqiWeightsConverter", + "RMSNormExpertProj", + "RMSNormExpertProjAlgorithm", + "RMSNormProjxWqakis", + "RMSNormProjxWqakisAlgorithm", + "RMSNormProjxWqkva", + "RMSNormProjxWqkvaAlgorithm", + "RMSNormUpGateSiLU", + "RMSNormUpGateSiLUAlgorithm", + "UnProjOAllReduce", + "UnProjOAllReduceAlgorithm", + "RMSNormHeadProj", + "RMSNormHeadProjAlgorithm", + "ExpertSelectUpGateSiLU", + "ExpertSelectUpGateSiLUAlgorithm", + "PaddedAllReduceAdd", + "PaddedAllReduceAddAlgorithm", + "padded_allreduce_add", + "broadcast_selected_token_ids", + "receive_selected_token_ids", +] diff --git a/tilert/models/glm_5/_dsa_v32/ops/broadcast_selected_token_ids.py 
b/tilert/models/glm_5/_dsa_v32/ops/broadcast_selected_token_ids.py new file mode 100644 index 0000000..f6bf2a8 --- /dev/null +++ b/tilert/models/glm_5/_dsa_v32/ops/broadcast_selected_token_ids.py @@ -0,0 +1,36 @@ +"""BroadcastSelectedTokenIds — P2P broadcast of idx_selects from GPU 0 to peers.""" + +import torch + +__all__ = [ + "broadcast_selected_token_ids", +] + + +def broadcast_selected_token_ids( + idx_selects: torch.Tensor, + peer_bufs: torch.Tensor, + flag_val: int, + profile_logs: torch.Tensor, + model_arch: str, + compute_kernel_type: str = "bf16", +) -> None: + """Broadcast idx_selects [1,S,2048] int32 from GPU 0 to peer GPUs. + + Args: + idx_selects: Source tensor [1, S, 2048] int32 on GPU 0. + peer_bufs: Device pointer array [N] int64 — each entry is a peer + buffer address. + flag_val: Synchronization flag value. + profile_logs: Profile logs tensor. + model_arch: Model architecture ("deepseek_v3_2" or "glm_5"). + compute_kernel_type: Compute kernel type ("bf16"). + """ + torch.ops.tilert.broadcast_selected_token_ids_op( + idx_selects, + peer_bufs, + flag_val, + model_arch, + compute_kernel_type, + profile_logs, + ) diff --git a/tilert/models/glm_5/_dsa_v32/ops/down_allreduce.py b/tilert/models/glm_5/_dsa_v32/ops/down_allreduce.py new file mode 100644 index 0000000..38b305c --- /dev/null +++ b/tilert/models/glm_5/_dsa_v32/ops/down_allreduce.py @@ -0,0 +1,343 @@ +"""DownAllreduce operation module.""" + +from dataclasses import dataclass +from enum import Enum + +import torch + +from tilert.models.base import TileRTModule +from tilert.models.common import weight_dequant +from tilert.models.glm_5._dsa_v32.model_args import ModelArgs +from tilert.models.glm_5._dsa_v32.ops.expert_down_allreduce import ( + ExpertDownAllReduceWeightsConverter, +) +from tilert.utils import get_profile_log_tensor + +__all__ = [ + "down_allreduce", + "DownAllReduceAlgorithm", + "DownAllReduce", + "DownAllReduceTilertWeightsAlias", +] + + +def down_allreduce( + vec_in: 
torch.Tensor, + mat_in: torch.Tensor, + mat_scale: torch.Tensor, + x_in: torch.Tensor, + flag: int, + vec_out: torch.Tensor, + profile_logs: torch.Tensor, + model_arch: str, + compute_kernel_type: str = "bf16", +) -> None: + """ + Fused operation of down and allreduce. + + Args: + vec_in: Input tensor. + mat_in: Input tensor. + mat_scale: Input tensor. + x_in: Input tensor. + flag: Input flag. + vec_out: Output tensor. + profile_logs: Profile logs tensor (1D). + model_arch: Model architecture ("deepseek_v3_2" or "glm_5"). + compute_kernel_type: Compute kernel type ("bf16"). + """ + torch.ops.tilert.down_allreduce_op( + vec_in, + mat_in, + mat_scale, + x_in, + flag, + vec_out, + model_arch, + compute_kernel_type, + profile_logs, + ) + + +class DownAllReduceAlgorithm(Enum): + """DownAllReduce algorithm""" + + GENERAL = "general" + + +DownAllReduceWeightsConverter = ExpertDownAllReduceWeightsConverter + + +@dataclass +class DownAllReduceTilertWeightsAlias: + """TileRT weights alias for DownAllReduce.""" + + down_weights = "down_weights" + down_scales = "down_scales" + + @property + def tilert_tensor_alias(self) -> list[str]: + return [self.down_weights, self.down_scales] + + def __call__(self) -> list[str]: + return self.tilert_tensor_alias + + +class DownAllReduce(TileRTModule): + """DownAllReduce module""" + + _SUPPORTED_ALGORITHMS = { + "deepseek_v3_2": [DownAllReduceAlgorithm.GENERAL], + "glm_5": [DownAllReduceAlgorithm.GENERAL], + } + + def __init__( + self, + model_args: ModelArgs, + device_id: int, + num_devices: int, + algorithm: DownAllReduceAlgorithm = DownAllReduceAlgorithm.GENERAL, + ): + super().__init__( + self.__class__.__name__, + device_id=device_id, + model_args=model_args, + num_devices=num_devices, + ) + + self.arch_name = self.model_args.arch_name + self.dim = self.model_args.dim + + self.inter_dim = self.model_args.inter_dim + self.moe_inter_dim = self.model_args.moe_inter_dim + self.moe_inter_dim_per_device = self.moe_inter_dim // 
self.num_devices + self.inter_dim_per_device = self.inter_dim // self.num_devices + self.n_experts: int = self.inter_dim_per_device // self.moe_inter_dim_per_device + self.block_size = self.model_args.block_size + self.dim_scale_dim = self.dim // self.block_size + self.in_scale_dim = self.inter_dim // self.block_size + self.moe_inter_scale_dim_per_device = self.moe_inter_dim_per_device // self.block_size + self.algorithm = algorithm + + if self.arch_name in ("deepseek_v3_2", "glm_5"): + self.compute_kernel_type = "bf16" + else: + raise ValueError(f"Unsupported architecture: {self.arch_name}") + + self.model_arch = self.arch_name + + self.ref_down: torch.Tensor | None = None + + self.tilert_weights: torch.Tensor | None = None + self.tilert_scales: torch.Tensor | None = None + + self.hidden_out: torch.Tensor | None = None + + self.profile_logs: torch.Tensor | None = None + self.is_init = False + + self.tilert_weights_alias = DownAllReduceTilertWeightsAlias() + + self.tensor_alias: list[str] = [ + "down_weights", + "down_scales", + ] + + self.ref_tensor_alias: list[str] = [ + "mlp.down_proj.weight", + "mlp.down_proj.weight_scale_inv", + ] + + @property + def tilert_tensor_alias(self) -> list[str]: + return self.tilert_weights_alias.tilert_tensor_alias + + def get_weights_list(self) -> list[torch.Tensor]: + """ + Get the weights list. + + Returns: + List of weights. + """ + return [self.tilert_weights, self.tilert_scales] + + def device_sharding( + self, + weights_dict: dict[str, torch.Tensor], + key_prefix: str, + ) -> tuple[torch.Tensor, torch.Tensor]: + """ + Device sharding. + + Args: + weights_dict: Dictionary of weights. + key_prefix: Key prefix. + Returns: + Tuple of weights. 
+ """ + down_proj_weight_key = f"{key_prefix}.down_proj.weight" + down_proj_scale_key = f"{key_prefix}.down_proj.weight_scale_inv" + down_proj_weight = weights_dict[down_proj_weight_key] + down_proj_scale = weights_dict[down_proj_scale_key] + down_proj_weight = down_proj_weight.reshape( + self.dim, self.n_experts, self.num_devices, self.moe_inter_dim_per_device + ) + down_proj_weight_splited = torch.split(down_proj_weight, 1, dim=2) + + down_proj_weight_splited = [ + down_proj_weight_splited[i] + .reshape(self.dim, self.n_experts, self.moe_inter_dim_per_device) + .transpose(0, 1) + .contiguous() + for i in range(self.num_devices) + ] + + down_proj_scale = down_proj_scale.reshape( + self.dim_scale_dim, + self.n_experts, + self.num_devices, + self.moe_inter_scale_dim_per_device, + ) + down_proj_scale_splited = torch.split(down_proj_scale, 1, dim=2) + down_proj_scale_splited = [ + down_proj_scale_splited[i] + .reshape(self.dim_scale_dim, self.n_experts, self.moe_inter_scale_dim_per_device) + .transpose(0, 1) + .contiguous() + for i in range(self.num_devices) + ] + down_weights = torch.stack(down_proj_weight_splited, dim=0) + down_scales = torch.stack(down_proj_scale_splited, dim=0) + return down_weights.contiguous(), down_scales.contiguous() + + def init_reference_weights( + self, + state_dict: dict[str, torch.Tensor], + key_prefix: str, + device_id: int = 0, + ) -> None: + """ + Initialize the reference weights. + + Args: + state_dict: State dictionary. + device_id: Device ID. + """ + sharded_list = self.device_sharding(state_dict, key_prefix) + + down_weights = sharded_list[0][device_id] + down_scales = sharded_list[1][device_id] + + down_list = [ + weight_dequant(down_weight, down_scale) + for down_weight, down_scale in zip(down_weights, down_scales) + ] + self.ref_down = torch.stack(down_list, dim=0) + + def init_tilert_weights(self, state_dict: dict[str, torch.Tensor]) -> None: + """ + Initialize the tilert weights. + + Args: + state_dict: State dictionary. 
+ """ + assert self.algorithm is not None, "Algorithm is not set" + self.tilert_weights, self.tilert_scales = DownAllReduceWeightsConverter( + self.model_args, self.num_devices + ).dispatch(self.algorithm, [state_dict[alias] for alias in self.tensor_alias]) + + def init_tilert_vars(self, batch_size: int, seq_len: int, device_id: int = 0) -> None: + """ + Initialize the tilert variables. + + Args: + batch_size: Batch size. + seq_len: Sequence length. + """ + self.hidden_out = torch.zeros( + (batch_size, seq_len, self.dim), + dtype=torch.bfloat16, + device=f"cuda:{device_id}", + ) + self.profile_logs = get_profile_log_tensor(device=f"cuda:{device_id}") + self.is_init = True + + def init_random_weights(self, device_id: int = 0) -> None: + """Initialize the random weights.""" + scale_dtype = torch.float32 if self.arch_name == "glm_5" else torch.bfloat16 + down_weights = torch.randn( + self.dim, self.inter_dim, dtype=torch.bfloat16, device=f"cuda:{device_id}" + ).to(torch.float8_e4m3fn) + + inter_dim_scale_dim = self.inter_dim // self.block_size + dim_scale_dim = self.dim // self.block_size + down_scales = torch.randn( + dim_scale_dim, inter_dim_scale_dim, dtype=scale_dtype, device=f"cuda:{device_id}" + ) + tensor_list = [ + down_weights, + down_scales, + ] + state_dict = dict(zip(self.ref_tensor_alias, tensor_list)) + + self.init_reference_weights(state_dict, "mlp", device_id) + sharded_list = self.device_sharding(state_dict, "mlp") + + sharded_state_dict = { + alias: sharded_list[i][device_id] for i, alias in enumerate(self.tensor_alias) + } + self.init_tilert_weights(sharded_state_dict) + + def golden_forward( + self, + vec_in: torch.Tensor, + ) -> torch.Tensor: + """ + Forward pass for the down-project module. + + Args: + vec_in: Input vector. + + Returns: + Output tensor. 
+ """ + assert self.ref_down is not None + bsz = vec_in.shape[0] + assert bsz == 1 + seq_len = vec_in.shape[1] + hidden_out_list = [] + for s in range(seq_len): + hidden_out_w2_list = [] + for i in range(self.n_experts): + hidden_out_w2_sel = vec_in[0, s, i].float() @ self.ref_down[i].float().T + hidden_out_w2_list.append(hidden_out_w2_sel) + hidden_out_w2 = torch.stack(hidden_out_w2_list, dim=0).to(torch.bfloat16) + hidden_out_w2 = torch.sum(hidden_out_w2, dim=0) + hidden_out_list.append(hidden_out_w2) + return torch.stack(hidden_out_list, dim=0)[None, ...] + + def tilert_forward( + self, + vec_in: torch.Tensor, + x_in: torch.Tensor, + flag: int, + ) -> torch.Tensor: + assert self.hidden_out is not None + down_allreduce( + vec_in, + self.tilert_weights, + self.tilert_scales, + x_in, + flag, + self.hidden_out, + self.profile_logs, + self.model_arch, + self.compute_kernel_type, + ) + return self.hidden_out + + def __call__( + self, + x_in: torch.Tensor, + ) -> torch.Tensor: + return self.golden_forward(x_in) diff --git a/python/models/deepseek_v3_2/ops/eh_proj_allreduce.py b/tilert/models/glm_5/_dsa_v32/ops/eh_proj_allreduce.py similarity index 87% rename from python/models/deepseek_v3_2/ops/eh_proj_allreduce.py rename to tilert/models/glm_5/_dsa_v32/ops/eh_proj_allreduce.py index 309751a..fe0b71f 100644 --- a/python/models/deepseek_v3_2/ops/eh_proj_allreduce.py +++ b/tilert/models/glm_5/_dsa_v32/ops/eh_proj_allreduce.py @@ -3,11 +3,10 @@ from dataclasses import dataclass from enum import Enum -# import torch.nn.functional as F import torch from tilert.models.base import TileRTModule, TilertWeightsConverter -from tilert.models.deepseek_v3_2.model_args import ModelArgs +from tilert.models.glm_5._dsa_v32.model_args import ModelArgs from tilert.utils import get_profile_log_tensor __all__ = [ @@ -23,6 +22,7 @@ def eh_proj_allreduce( flag: int, vec_out: torch.Tensor, profile_logs: torch.Tensor, + model_arch: str, ) -> None: """ Fused operation of EHProj and allreduce. 
@@ -33,19 +33,21 @@ def eh_proj_allreduce( w_eh: Input tensor of shape (7168, 1792) or (128, 7, 56, 256). flag: Input tensor. vec_out: Output tensor of shape (1, seq_len, 7168). - profile_logs: Profile logs tensor. This is a 1D tensor of shape - (num_sms,) to store the profile logs of the eh_proj_allreduce - operation, where num_sms is the number of SMs on the - device. + profile_logs: Profile logs tensor (1D). + model_arch: Model architecture string. """ - dim = vec_in_enorm.shape[-1] - if dim == 7168: - func_call = torch.ops.tilert.eh_proj_allreduce_op - elif dim == 6144: - func_call = torch.ops.tilert.eh_proj_allreduce_glm5_op - else: - raise ValueError(f"Unsupported dimension: {dim}") - func_call(vec_in_enorm, vec_in_hnorm, w_eh, flag, vec_out, profile_logs) + compute_kernel_type = "bf16" + torch.ops.tilert.eh_proj_allreduce_op( + vec_in_enorm, + vec_in_hnorm, + w_eh, + flag, + vec_out, + profile_logs, + model_arch, + compute_kernel_type, + torch.empty(0, dtype=torch.int64, device=vec_in_enorm.device), + ) class EHProjAllReduceAlgorithm(Enum): @@ -100,6 +102,11 @@ def __call__(self) -> list[str]: class EHProjAllReduce(TileRTModule): """EHProjAllReduce module""" + _SUPPORTED_ALGORITHMS = { + "deepseek_v3_2": [EHProjAllReduceAlgorithm.GENERAL], + "glm_5": [EHProjAllReduceAlgorithm.GENERAL], + } + def __init__( self, model_args: ModelArgs, @@ -117,13 +124,10 @@ def __init__( self.algorithm = algorithm - # reference weights self.ref_proj: torch.Tensor | None = None - # tilert weights self.tilert_proj: torch.Tensor | None = None - # tilert vars self.hidden_out: torch.Tensor | None = None self.profile_logs: torch.Tensor | None = None @@ -131,13 +135,10 @@ def __init__( self.tilert_weights_alias = EHProjAllReduceTilertWeightsAlias() - # for device sharding, corresponding to the output of device_sharding - # and input of tilert_forward self.tensor_alias: list[str] = [ "eh_proj_weights", ] - # reference tensor aliases self.ref_tensor_alias: list[str] = [ 
"eh_proj.weight", ] @@ -158,7 +159,7 @@ def get_weights_list(self) -> list[torch.Tensor]: def device_sharding( self, weights_dict: dict[str, torch.Tensor], - key_prefix: str | None = None, # e.g. model.layers.{layer_id} + key_prefix: str | None = None, ) -> tuple[torch.Tensor]: """ Device sharding. @@ -220,7 +221,6 @@ def init_tilert_vars(self, batch_size: int, seq_len: int, device_id: int = 0) -> batch_size: Batch size. seq_len: Sequence length. """ - # tilert vars self.hidden_out = torch.zeros( (batch_size, seq_len, self.dim), dtype=torch.bfloat16, @@ -282,6 +282,12 @@ def tilert_forward( ) -> torch.Tensor: assert self.hidden_out is not None eh_proj_allreduce( - vec_in_enorm, vec_in_hnorm, self.tilert_proj, flag, self.hidden_out, self.profile_logs + vec_in_enorm, + vec_in_hnorm, + self.tilert_proj, + flag, + self.hidden_out, + self.profile_logs, + model_arch=self.model_args.arch_name, ) return self.hidden_out diff --git a/python/models/deepseek_v3_2/ops/expert_down_allreduce.py b/tilert/models/glm_5/_dsa_v32/ops/expert_down_allreduce.py similarity index 78% rename from python/models/deepseek_v3_2/ops/expert_down_allreduce.py rename to tilert/models/glm_5/_dsa_v32/ops/expert_down_allreduce.py index d49bc77..b0e6b24 100644 --- a/python/models/deepseek_v3_2/ops/expert_down_allreduce.py +++ b/tilert/models/glm_5/_dsa_v32/ops/expert_down_allreduce.py @@ -1,4 +1,5 @@ -from collections.abc import Callable +"""ExpertDownAllreduce operation module.""" + from dataclasses import dataclass from enum import Enum @@ -6,12 +7,11 @@ from tilert.models.base import TileRTModule, TilertWeightsConverter from tilert.models.common import weight_dequant -from tilert.models.deepseek_v3_2.model_args import ModelArgs +from tilert.models.glm_5._dsa_v32.model_args import ModelArgs from tilert.utils import get_profile_log_tensor __all__ = [ "expert_down_allreduce", - "expert_down_allreduce_glm5", "ExpertDownAllReduceAlgorithm", "ExpertDownAllReduce", "ExpertDownAllReduceTilertWeightsAlias", 
@@ -31,53 +31,36 @@ def expert_down_allreduce( flag: int, vec_out: torch.Tensor, profile_logs: torch.Tensor, + model_arch: str, + compute_kernel_type: str = "bf16", ) -> None: """ - Fused expert down + allreduce (deepseek_v3_2). + Fused expert down + allreduce (unified for DSv32 and GLM5). Args: vec_in: [1, seq_len, n_experts, 256], bfloat16. - mat_in: [n_experts, 6144, 256], float8_e4m3fn. - mat_scale: [n_experts, 1024, 2], bfloat16. + mat_in: [n_experts, dim, 256], float8_e4m3fn. + mat_scale: [n_experts, 1024, 2], bfloat16 (DSv32) or float32 (GLM5). indices: [1, seq_len, 8], int32. scores: [1, seq_len, 8], float32. - x_in: [1, seq_len, 6144], bfloat16. + x_in: [1, seq_len, dim], bfloat16. flag: User flag. - vec_out: [1, seq_len, 6144], bfloat16 (output). - profile_logs: 1D tensor (num_sms,) for profile logs. + vec_out: [1, seq_len, dim], bfloat16 (output). + profile_logs: 1D tensor for profile logs. + compute_kernel_type: "bf16". """ torch.ops.tilert.expert_down_allreduce_op( - vec_in, mat_in, mat_scale, indices, scores, x_in, flag, vec_out, profile_logs - ) - - -def expert_down_allreduce_glm5( - vec_in: torch.Tensor, - mat_in: torch.Tensor, - mat_scale: torch.Tensor, - indices: torch.Tensor, - scores: torch.Tensor, - x_in: torch.Tensor, - flag: int, - vec_out: torch.Tensor, - profile_logs: torch.Tensor, -) -> None: - """ - Fused expert down + allreduce (glm_5). - - Args: - vec_in: [1, seq_len, n_experts, 256], bfloat16. - mat_in: [n_experts, 6144, 256], float8_e4m3fn. - mat_scale: [n_experts, 1024, 2], bfloat16. - indices: [1, seq_len, 8], int32. - scores: [1, seq_len, 8], float32. - x_in: [1, seq_len, 6144], bfloat16. - flag: User flag. - vec_out: [1, seq_len, 6144], bfloat16 (output). - profile_logs: 1D tensor (num_sms,) for profile logs. 
- """ - torch.ops.tilert.expert_down_allreduce_glm5_op( - vec_in, mat_in, mat_scale, indices, scores, x_in, flag, vec_out, profile_logs + vec_in, + mat_in, + mat_scale, + indices, + scores, + x_in, + flag, + vec_out, + profile_logs, + model_arch, + compute_kernel_type, ) @@ -115,36 +98,51 @@ def convert_to_general( num_sms = 128 dim_per_sm = dim // num_sms dim_scale_dim = dim // args.block_size + expert_dim = args.moe_inter_dim // 8 + k_chunks = expert_dim // 32 + scale_cols = expert_dim // args.block_size with torch.inference_mode(): mat_in, scale_in = weights_list exp_num = mat_in.shape[0] - mat_in_s = mat_in.reshape(exp_num, num_sms, dim_per_sm, 256) - mat_in_0 = mat_in_s[:, :, :16].reshape(exp_num, num_sms, 16, 8, 32).transpose(2, 3) + mat_in_s = mat_in.reshape(exp_num, num_sms, dim_per_sm, expert_dim) + mat_in_0 = ( + mat_in_s[:, :, :16].reshape(exp_num, num_sms, 16, k_chunks, 32).transpose(2, 3) + ) mat_in_0 = self._swizzle_qmma_16x32(mat_in_0).reshape(exp_num, 128, -1) - mat_in_1 = mat_in_s[:, :, 16:32].reshape(exp_num, num_sms, 16, 8, 32).transpose(2, 3) + mat_in_1 = ( + mat_in_s[:, :, 16:32].reshape(exp_num, num_sms, 16, k_chunks, 32).transpose(2, 3) + ) mat_in_1 = self._swizzle_qmma_16x32(mat_in_1).reshape(exp_num, 128, -1) - mat_in_2 = mat_in_s[:, :, 32:48].reshape(exp_num, num_sms, 16, 8, 32).transpose(2, 3) + mat_in_2 = ( + mat_in_s[:, :, 32:48].reshape(exp_num, num_sms, 16, k_chunks, 32).transpose(2, 3) + ) mat_in_2 = self._swizzle_qmma_16x32(mat_in_2).reshape(exp_num, 128, -1) - mat_in_swizzled = torch.cat([mat_in_0, mat_in_1, mat_in_2], dim=2) + mats_to_cat = [mat_in_0, mat_in_1, mat_in_2] if arch_name == "deepseek_v3_2": - mat_in_3 = mat_in_s[:, :, 48:56].reshape(exp_num, num_sms, 8, 8, 32).transpose(2, 3) + mat_in_3 = ( + mat_in_s[:, :, 48:56].reshape(exp_num, num_sms, 8, k_chunks, 32).transpose(2, 3) + ) mat_in_3 = self._swizzle_qmma_8x32(mat_in_3).reshape(exp_num, 128, -1) - mat_in_swizzled = torch.cat([mat_in_0, mat_in_1, mat_in_2, mat_in_3], 
dim=2) - mat_in_swizzled = mat_in_swizzled.reshape(exp_num, dim, 256) + mats_to_cat.append(mat_in_3) + mat_in_swizzled = torch.cat(mats_to_cat, dim=2) + mat_in_swizzled = mat_in_swizzled.reshape(exp_num, dim, expert_dim) mat_scale_tilert = ( - scale_in.reshape(exp_num, dim_scale_dim, 1, 2) + scale_in.reshape(exp_num, dim_scale_dim, 1, scale_cols) .repeat(1, 1, 16, 1) .reshape(exp_num, num_sms, -1) ) - padding_zeros = torch.zeros( - (exp_num, num_sms, 16 - mat_scale_tilert.shape[-1]), - dtype=scale_in.dtype, - device=scale_in.device, - ) - mat_scale_tilert = torch.cat([mat_scale_tilert, padding_zeros], dim=2) - mat_scale_tilert = mat_scale_tilert.reshape(exp_num, 1024, 2) + target_cols_per_sm = 1024 * scale_cols // num_sms + pad_amount = target_cols_per_sm - mat_scale_tilert.shape[-1] + if pad_amount > 0: + padding_zeros = torch.zeros( + (exp_num, num_sms, pad_amount), + dtype=scale_in.dtype, + device=scale_in.device, + ) + mat_scale_tilert = torch.cat([mat_scale_tilert, padding_zeros], dim=2) + mat_scale_tilert = mat_scale_tilert.reshape(exp_num, 1024, scale_cols) if arch_name == "glm_5": if mat_scale_tilert.dtype != torch.float32: print( @@ -153,7 +151,7 @@ def convert_to_general( + "is not float32, convert to float32." 
) mat_scale_tilert = mat_scale_tilert.to(torch.float32) - else: # DS v3.2, use bfloat16 for mat_scale_tilert + else: mat_scale_tilert = mat_scale_tilert.to(torch.bfloat16) return mat_in_swizzled.contiguous(), mat_scale_tilert.contiguous() @@ -176,6 +174,11 @@ def __call__(self) -> list[str]: class ExpertDownAllReduce(TileRTModule): """ExpertDownAllReduce module.""" + _SUPPORTED_ALGORITHMS = { + "deepseek_v3_2": [ExpertDownAllReduceAlgorithm.GENERAL], + "glm_5": [ExpertDownAllReduceAlgorithm.GENERAL], + } + def __init__( self, model_args: ModelArgs, @@ -204,15 +207,14 @@ def __init__( self.hidden_out: torch.Tensor | None = None self.profile_logs: torch.Tensor | None = None self.is_init = False - self.exp_down_allreduce_func: Callable | None = None - if self.arch_name == "deepseek_v3_2": - self.exp_down_allreduce_func = expert_down_allreduce - elif self.arch_name == "glm_5": - self.exp_down_allreduce_func = expert_down_allreduce_glm5 + if self.arch_name in ("deepseek_v3_2", "glm_5"): + self.compute_kernel_type = "bf16" else: raise ValueError(f"Unsupported architecture: {self.arch_name}") + self.model_arch = self.arch_name + self.tilert_weights_alias = ExpertDownAllReduceTilertWeightsAlias() self.tensor_alias = ["exp_down_weights", "exp_down_scales"] self.ref_tensor_alias = ( @@ -316,24 +318,21 @@ def init_tilert_vars(self, batch_size: int, seq_len: int, device_id: int = 0) -> self.is_init = True def init_random_weights(self, device_id: int = 0) -> None: - down_weights = [ - torch.randn( - self.dim, self.moe_inter_dim, dtype=torch.bfloat16, device=f"cuda:{device_id}" - ).to(torch.float8_e4m3fn) - for _ in range(self.n_routed_experts + 1) - ] + n = self.n_routed_experts + 1 + dev = f"cuda:{device_id}" + down_weights = list( + torch.randn(n, self.dim, self.moe_inter_dim, dtype=torch.bfloat16, device=dev) + .to(torch.float8_e4m3fn) + .unbind(0) + ) dim_scale_dim = self.dim // self.block_size moe_inter_dim_scale_dim = self.moe_inter_dim // self.block_size scale_dtype = 
torch.float32 if self.arch_name == "glm_5" else torch.bfloat16 - down_scales = [ + down_scales = list( torch.randn( - dim_scale_dim, - moe_inter_dim_scale_dim, - dtype=scale_dtype, - device=f"cuda:{device_id}", - ) - for _ in range(self.n_routed_experts + 1) - ] + n, dim_scale_dim, moe_inter_dim_scale_dim, dtype=scale_dtype, device=dev + ).unbind(0) + ) state_dict = dict( zip( self.ref_tensor_alias, @@ -367,6 +366,7 @@ def golden_forward( hidden_out_w2_list.append(hidden_out_w2_sel * scores[0, s, i]) hidden_out_w2 = torch.stack(hidden_out_w2_list, dim=0).to(torch.bfloat16) hidden_out_w2 = torch.sum(hidden_out_w2, dim=0) + hidden_out_list.append(hidden_out_w2) hidden_out = torch.stack(hidden_out_list, dim=0) return hidden_out[None, ...] @@ -379,9 +379,8 @@ def tilert_forward( x_in: torch.Tensor, flag: int, ) -> torch.Tensor: - assert self.exp_down_allreduce_func is not None assert self.hidden_out is not None - self.exp_down_allreduce_func( + expert_down_allreduce( vec_in, self.tilert_weights, self.tilert_scales, @@ -391,6 +390,8 @@ def tilert_forward( flag, self.hidden_out, self.profile_logs, + self.model_arch, + self.compute_kernel_type, ) return self.hidden_out diff --git a/python/models/deepseek_v3_2/ops/expert_sel_up_gate_silu.py b/tilert/models/glm_5/_dsa_v32/ops/expert_sel_up_gate_silu.py similarity index 86% rename from python/models/deepseek_v3_2/ops/expert_sel_up_gate_silu.py rename to tilert/models/glm_5/_dsa_v32/ops/expert_sel_up_gate_silu.py index 50a0a67..e2d96eb 100644 --- a/python/models/deepseek_v3_2/ops/expert_sel_up_gate_silu.py +++ b/tilert/models/glm_5/_dsa_v32/ops/expert_sel_up_gate_silu.py @@ -4,15 +4,12 @@ from enum import Enum import numpy as np - -# from typing import Any import torch import torch.nn.functional as F from tilert.models.base import TileRTModule, TilertWeightsConverter from tilert.models.common import weight_dequant -from tilert.models.deepseek_v3_2.model_args import ModelArgs -from tilert.profiler.utils import 
parse_profile_log_tensor +from tilert.models.glm_5._dsa_v32.model_args import ModelArgs from tilert.utils import get_profile_log_tensor __all__ = [ @@ -34,9 +31,11 @@ def expert_select_up_gate_silu( expert_indices_out: torch.Tensor, profile_logs: torch.Tensor, algorithm: str = "fp8mma", + *, + model_arch: str, ) -> None: """Expert SelectUpGateSiLU operation.""" - args_list = [ + torch.ops.tilert.expert_select_up_gate_silu_op( hidden_in, scores_in, bias_in, @@ -45,9 +44,9 @@ def expert_select_up_gate_silu( expert_probs_out, expert_indices_out, profile_logs, + model_arch, algorithm, - ] - torch.ops.tilert.expert_select_up_gate_silu_op(*args_list) + ) @dataclass @@ -114,7 +113,6 @@ class ExpertSelectUpGateSiLUWeightsConverter(TilertWeightsConverter): def _swizzle_qmma_16x32(mat_in: torch.Tensor) -> torch.Tensor: assert mat_in.shape[-2] == 16 and mat_in.shape[-1] == 32 assert mat_in.dtype == torch.float8_e4m3fn - # PTX isa fig.88 pre_shape = mat_in.shape[:-2] mat_in = mat_in.reshape(*pre_shape, 2, 8, 2, 4, 4).transpose(-4, -3).transpose(-5, -4) return mat_in.reshape(*pre_shape, 2 * 2, 8 * 4, 4).transpose(-3, -2) @@ -122,7 +120,6 @@ def _swizzle_qmma_16x32(mat_in: torch.Tensor) -> torch.Tensor: @staticmethod def _swizzle_mma_16x32(mat_in: torch.Tensor) -> torch.Tensor: assert mat_in.shape[-2] == 16 and mat_in.shape[-1] == 32 - # PTX isa fig.88 pre_shape = mat_in.shape[:-2] mat_in = mat_in.reshape(*pre_shape, 2, 8, 2, 4, 4).transpose(-4, -3).transpose(-5, -4) return mat_in.reshape(*pre_shape, 2 * 2, 8 * 4, 4).transpose(-3, -2) @@ -130,7 +127,6 @@ def _swizzle_mma_16x32(mat_in: torch.Tensor) -> torch.Tensor: @staticmethod def _swizzle_mma_16x16(mat_in: torch.Tensor) -> torch.Tensor: assert mat_in.shape[-2] == 16 and mat_in.shape[-1] == 16 - # PTX isa fig.88 pre_shape = mat_in.shape[:-2] mat_in = mat_in.reshape(*pre_shape, 2, 8, 2, 4, 2).transpose(-4, -3).transpose(-5, -4) return mat_in.reshape(*pre_shape, 2 * 2, 8 * 4, 2).transpose(-3, -2) @@ -155,7 +151,6 @@ def 
tilert_to_tilert_144sm( weights_trt = mat_in.reshape(exp_num, 128, 4, 7168) weights_w1 = weights_trt[:, :, :2].reshape(exp_num, 256, 7168) weights_w3 = weights_trt[:, :, 2:].reshape(exp_num, 256, 7168) - # to 16x1024 blocks weights_w1 = weights_w1.reshape(exp_num, 16, 16, 7, 1024).transpose(2, 3) weights_w3 = weights_w3.reshape(exp_num, 16, 16, 7, 1024).transpose(2, 3) if mma_type == "16x32": @@ -177,7 +172,6 @@ def tilert_to_tilert_144sm( assert weights.shape == (exp_num, 16, 7, 32, 1024) weights = weights.reshape(exp_num, 16, 7, 32 * 1024) - # For scales, first unswizzle scales_unswizzled = torch.zeros(exp_num, 4, 56) for i in range(64): if ((i % 8) * 8 + i // 8) < 56: @@ -220,62 +214,63 @@ def tilert_to_tilert_144sm_mma( def convert_to_mma( self, weights_list: list[torch.Tensor], algorithm: str = "fp8mma" ) -> tuple[torch.Tensor, torch.Tensor]: - """ - Convert the weights to mma format. - - Args: - weights: List of weights. - - Returns: - Tuple of weights. - """ + """Convert the weights to mma format.""" args = self.model_args dim = args.dim - pages = dim // 1024 # 6 for GLM5, 7 for DS v3.2 + pages = dim // 1024 dim_scale_dim = dim // args.block_size with torch.inference_mode(): - # w1: gate, w3: up bias_or_gamma, weights_w1, scales_w1, weights_w3, scales_w3 = weights_list exp_num = weights_w1.shape[0] - # to 16x1024 blocks - weights_w1 = weights_w1.reshape(exp_num, 16, 16, pages, 1024).transpose(2, 3) - weights_w3 = weights_w3.reshape(exp_num, 16, 16, pages, 1024).transpose(2, 3) - # to 16x32 blocks and swizzle + moe_rows = weights_w1.shape[1] + n_row_groups = moe_rows // 16 + scale_m_dim = moe_rows // args.block_size + weights_w1 = weights_w1.reshape(exp_num, n_row_groups, 16, pages, 1024).transpose(2, 3) + weights_w3 = weights_w3.reshape(exp_num, n_row_groups, 16, pages, 1024).transpose(2, 3) if algorithm == "fp8mma": - weights_w1 = weights_w1.reshape(exp_num, 16, pages, 16, 32, 32).transpose(3, 4) + weights_w1 = weights_w1.reshape(exp_num, n_row_groups, 
pages, 16, 32, 32).transpose( + 3, 4 + ) weights_w1 = self._swizzle_qmma_16x32(weights_w1) - weights_w1 = weights_w1.reshape(exp_num, 16, pages, 16, 1024) - weights_w3 = weights_w3.reshape(exp_num, 16, pages, 16, 32, 32).transpose(3, 4) + weights_w1 = weights_w1.reshape(exp_num, n_row_groups, pages, 16, 1024) + weights_w3 = weights_w3.reshape(exp_num, n_row_groups, pages, 16, 32, 32).transpose( + 3, 4 + ) weights_w3 = self._swizzle_qmma_16x32(weights_w3) - weights_w3 = weights_w3.reshape(exp_num, 16, pages, 16, 1024) + weights_w3 = weights_w3.reshape(exp_num, n_row_groups, pages, 16, 1024) elif algorithm == "fp16mma": - weights_w1 = weights_w1.reshape(exp_num, 16, pages, 16, 64, 16).transpose(3, 4) + weights_w1 = weights_w1.reshape(exp_num, n_row_groups, pages, 16, 64, 16).transpose( + 3, 4 + ) weights_w1 = self._swizzle_mma_16x16(weights_w1) - weights_w1 = weights_w1.reshape(exp_num, 16, pages, 16, 1024) - weights_w3 = weights_w3.reshape(exp_num, 16, pages, 16, 64, 16).transpose(3, 4) + weights_w1 = weights_w1.reshape(exp_num, n_row_groups, pages, 16, 1024) + weights_w3 = weights_w3.reshape(exp_num, n_row_groups, pages, 16, 64, 16).transpose( + 3, 4 + ) weights_w3 = self._swizzle_mma_16x16(weights_w3) - weights_w3 = weights_w3.reshape(exp_num, 16, pages, 16, 1024) + weights_w3 = weights_w3.reshape(exp_num, n_row_groups, pages, 16, 1024) else: raise ValueError(f"Unsupported algorithm: {algorithm}") - # concat w1 and w3 weights: torch.Tensor = torch.cat([weights_w1, weights_w3], dim=3) - assert weights.shape == (exp_num, 16, pages, 32, 1024) - weights = weights.reshape(exp_num, 16, pages, 32 * 1024) + assert weights.shape == (exp_num, n_row_groups, pages, 32, 1024) + weights = weights.reshape(exp_num, n_row_groups, pages, 32 * 1024) + scales_per_page = 1024 // args.block_size + repeat_factor = n_row_groups // scale_m_dim scales_w1 = ( - scales_w1.reshape(exp_num, 2, 1, dim_scale_dim) - .repeat(1, 1, 8, 1) - .reshape(exp_num, 16, 1, pages, 8) + 
scales_w1.reshape(exp_num, scale_m_dim, 1, dim_scale_dim) + .repeat(1, 1, repeat_factor, 1) + .reshape(exp_num, n_row_groups, 1, pages, scales_per_page) ) scales_w1 = scales_w1.transpose(2, 3) scales_w3 = ( - scales_w3.reshape(exp_num, 2, 1, dim_scale_dim) - .repeat(1, 1, 8, 1) - .reshape(exp_num, 16, 1, pages, 8) + scales_w3.reshape(exp_num, scale_m_dim, 1, dim_scale_dim) + .repeat(1, 1, repeat_factor, 1) + .reshape(exp_num, n_row_groups, 1, pages, scales_per_page) ) scales_w3 = scales_w3.transpose(2, 3) scales = torch.cat([scales_w1, scales_w3], dim=3) - assert scales.shape == (exp_num, 16, pages, 2, 8) + assert scales.shape == (exp_num, n_row_groups, pages, 2, scales_per_page) if self.model_args.arch_name == "glm_5": if scales.dtype != torch.float32: @@ -285,14 +280,16 @@ def convert_to_mma( + "is not float32, convert to float32." ) scales = scales.to(torch.float32) - else: # DS v3.2, use bfloat16 for scales + else: scales = scales.to(torch.bfloat16) - scales = scales.reshape(exp_num, 16, pages, 2 * 8).view(dtype=torch.float8_e4m3fn) + scales = scales.reshape(exp_num, n_row_groups, pages, 2 * scales_per_page).view( + dtype=torch.float8_e4m3fn + ) weights_and_scales = torch.zeros( exp_num, - 16, + n_row_groups, pages, 32 * 1024 + 128, dtype=torch.float8_e4m3fn, @@ -335,6 +332,17 @@ def convert_to_fp16mma( class ExpertSelectUpGateSiLU(TileRTModule): """ExpertSelectUpGateSiLU module""" + _SUPPORTED_ALGORITHMS = { + "deepseek_v3_2": [ + ExpertSelectUpGateSiLUAlgorithm.FP8MMA, + ExpertSelectUpGateSiLUAlgorithm.FP16MMA, + ], + "glm_5": [ + ExpertSelectUpGateSiLUAlgorithm.FP8MMA, + ExpertSelectUpGateSiLUAlgorithm.FP16MMA, + ], + } + def __init__( self, model_args: ModelArgs, @@ -377,18 +385,14 @@ def __init__( ) ) - # reference weights self.ref_bias: torch.Tensor | None = None self.ref_gate: torch.Tensor | None = None self.ref_up: torch.Tensor | None = None - # tilert weights self.tilert_bias: torch.Tensor | None = None self.tilert_weights: torch.Tensor | None = None - 
# for compatibility, to be removed in the future self.tilert_scales = torch.zeros(1, dtype=torch.bfloat16, device=torch.device("cuda")) - # tilert vars self.hidden_out: torch.Tensor | None = None self.expert_probs: torch.Tensor | None = None self.expert_indices: torch.Tensor | None = None @@ -423,7 +427,7 @@ def get_weights_list(self) -> list[torch.Tensor]: @staticmethod def process_gate_up_weights( - key_prefix: str, # e.g. mlp.shared_experts or mlp.experts.{id} + key_prefix: str, weights_hf: dict[str, torch.Tensor], num_devices: int, ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: @@ -537,17 +541,11 @@ def init_reference_weights( self.ref_up = torch.stack(ref_up_list, dim=0) def init_tilert_weights(self, state_dict: dict[str, torch.Tensor]) -> None: - """ - Initialize the tilert weights. - - Args: - state_dict: State dict keyed by tilert_weights_alias() (per-device). - """ + """Initialize the tilert weights.""" assert self.algorithm is not None, "Algorithm is not set" weights_list = [state_dict[alias] for alias in self.tilert_weights_alias()] - self.tilert_bias, self.tilert_weights = ExpertSelectUpGateSiLUWeightsConverter( - self.model_args, self.num_devices - ).dispatch(self.algorithm, weights_list) + converter = ExpertSelectUpGateSiLUWeightsConverter(self.model_args, self.num_devices) + self.tilert_bias, self.tilert_weights = converter.dispatch(self.algorithm, weights_list) def init_tilert_vars(self, batch_size: int, seq_len: int, device: str = "cuda") -> None: """ @@ -557,7 +555,6 @@ def init_tilert_vars(self, batch_size: int, seq_len: int, device: str = "cuda") batch_size: Batch size. seq_len: Sequence length. 
""" - # tilert vars self.hidden_out = torch.zeros( ( batch_size, @@ -589,30 +586,31 @@ def init_random_weights(self, device: str = "cuda") -> None: Returns: None """ + n = self.n_routed_experts + 1 bias = torch.randn(self.n_routed_experts, dtype=torch.float32, device=device) - gate_weights = [ - torch.randn(self.moe_inter_dim, self.dim, dtype=torch.bfloat16, device=device).to( - torch.float8_e4m3fn - ) - for _ in range(self.n_routed_experts + 1) - ] - up_weights = [ - torch.randn(self.moe_inter_dim, self.dim, dtype=torch.bfloat16, device=device).to( - torch.float8_e4m3fn - ) - for _ in range(self.n_routed_experts + 1) - ] + gate_weights = list( + torch.randn(n, self.moe_inter_dim, self.dim, dtype=torch.bfloat16, device=device) + .to(torch.float8_e4m3fn) + .unbind(0) + ) + up_weights = list( + torch.randn(n, self.moe_inter_dim, self.dim, dtype=torch.bfloat16, device=device) + .to(torch.float8_e4m3fn) + .unbind(0) + ) moe_inter_dim_scale_dim = self.moe_inter_dim // self.block_size dim_scale_dim = self.dim // self.block_size scale_dtype = torch.float32 if self.arch_name == "glm_5" else torch.bfloat16 - gate_scales = [ - torch.randn(moe_inter_dim_scale_dim, dim_scale_dim, dtype=scale_dtype, device=device) - for _ in range(self.n_routed_experts + 1) - ] - up_scales = [ - torch.randn(moe_inter_dim_scale_dim, dim_scale_dim, dtype=scale_dtype, device=device) - for _ in range(self.n_routed_experts + 1) - ] + gate_scales = list( + torch.randn( + n, moe_inter_dim_scale_dim, dim_scale_dim, dtype=scale_dtype, device=device + ).unbind(0) + ) + up_scales = list( + torch.randn( + n, moe_inter_dim_scale_dim, dim_scale_dim, dtype=scale_dtype, device=device + ).unbind(0) + ) tensor_list = [ bias, *gate_weights, @@ -652,7 +650,6 @@ def _ref_expert_select_ds(self, scores: torch.Tensor) -> tuple[torch.Tensor, tor return weights, indices def _ref_expert_select_glm5(self, scores: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: - # flatten_dim = np.prod(scores.size()[:-1]) scores = 
scores.sigmoid() original_scores = scores if self.ref_bias is not None: @@ -682,7 +679,6 @@ def golden_forward( raise ValueError(f"Unsupported architecture: {self.arch_name}") hidden_out_list = [] for s in range(seq_len): - # ref up-gate silu hidden_out_w1_list = [] hidden_out_w3_list = [] hidden_out_w1_shared = x_in[0, s].float() @ self.ref_gate[0].float().T @@ -721,9 +717,6 @@ def tilert_forward( self.expert_indices, self.profile_logs, self.algorithm.value, + model_arch=self.model_args.arch_name, ) - if self.flag_enable_profiling_log: - parse_profile_log_tensor( - self.profile_logs, self.get_profile_log_path(), [(self.op_name, 0.0)] - ) return self.hidden_out, self.expert_probs, self.expert_indices diff --git a/tilert/models/glm_5/_dsa_v32/ops/flash_sparse_mla.py b/tilert/models/glm_5/_dsa_v32/ops/flash_sparse_mla.py new file mode 100644 index 0000000..1d4cc00 --- /dev/null +++ b/tilert/models/glm_5/_dsa_v32/ops/flash_sparse_mla.py @@ -0,0 +1,261 @@ +"""Flash Sparse MLA operation module.""" + +import math +from enum import Enum + +import torch + +from tilert.models.base import TileRTModule +from tilert.models.glm_5._dsa_v32.model_args import ModelArgs +from tilert.utils import get_profile_log_tensor + +__all__ = [ + "flash_sparse_mla", + "FlashSparseMLACombine", +] + + +def flash_sparse_mla( + query: torch.Tensor, + query_pe: torch.Tensor, + key_value: torch.Tensor, + key_pe: torch.Tensor, + indices: torch.Tensor, + cur_pos: torch.Tensor, + output: torch.Tensor, + profile_logs: torch.Tensor, + split_size: int = 64, + compute_kernel_type: str = "bf16mma", + *, + model_arch: str, +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Flash Sparse MLA operation for GLM5. + + Args: + query: Query tensor. (bs, seqlen, heads, dim) + query_pe: Query position embedding tensor. (bs, seqlen, heads, pe_dim) + key_value: Key-value tensor. (bs, seqlen_kv, dim) + key_pe: Key position embedding tensor. (bs, seqlen_kv, pe_dim) + indices: Indices tensor. 
(bs, seqlen, topk) + cur_pos: cur_pos tensor. (1) + output: Output tensor. + profile_logs: Profile logs tensor. + split_size: Number of splits. + """ + batch, seqlen, heads, hidden_dim = query.shape + if split_size != 64: + raise ValueError( + "The current implementation of flash_sparse_mla_op only supports split_size=64" + ) + if batch != 1: + raise ValueError("The current implementation of flash_sparse_mla_op only supports batch=1") + if seqlen > 4: + raise ValueError( + "The current implementation of flash_sparse_mla_op only supports seqlen<=4" + ) + + seqlen_kv = key_value.shape[1] + index_len = indices.shape[-1] + if index_len > seqlen_kv: + raise ValueError("index_len must be less than or equal to seqlen_kv") + + device = query.device + acc_type = torch.float32 + + dim = key_value.shape[-1] + max_num_splits = 32 + + lse = torch.empty((batch, seqlen, heads), device=device, dtype=acc_type) + lse_acc = torch.empty((batch, seqlen, heads, max_num_splits), device=device, dtype=acc_type) + output_acc = torch.empty( + batch, seqlen, heads, max_num_splits, dim, device=device, dtype=acc_type + ) + + if heads not in (8, 10, 16, 20): + raise ValueError(f"Unsupported heads: {heads}") + torch.ops.tilert.flash_sparse_mla_op( + query, + query_pe, + key_value, + key_pe, + indices, + cur_pos, + output, + output_acc, + lse, + lse_acc, + split_size, + model_arch, + compute_kernel_type, + profile_logs, + torch.empty(0, dtype=torch.int64, device=query.device), + ) + return lse, lse_acc, output_acc + + +class FlashSparseMLACombineAlgorithm(Enum): + """FlashSparseMLACombine algorithm.""" + + BF16MMA = "bf16mma" + + +class FlashSparseMLACombine(TileRTModule): + """Flash Sparse MLA combine module; no weights, uses model_args for scale and config.""" + + _SUPPORTED_ALGORITHMS = { + "deepseek_v3_2": [FlashSparseMLACombineAlgorithm.BF16MMA], + "glm_5": [FlashSparseMLACombineAlgorithm.BF16MMA], + } + + def __init__( + self, + model_args: ModelArgs, + num_devices: int, + layer_idx: int = 
0, + ): + super().__init__( + type(self).__name__, + model_args=model_args, + num_devices=num_devices, + layer_idx=layer_idx, + ) + self.tilert_tensor_alias: list[str] = [] + self.ref_tensor_alias: list[str] = [] + + scale = (model_args.qk_nope_head_dim + model_args.qk_rope_head_dim) ** -0.5 + if model_args.rope_factor is None: + mscale = 1.0 + else: + mscale = 0.1 * math.log(model_args.rope_factor) + 1.0 + self.softmax_scale = scale * mscale * mscale + + self.profile_logs = get_profile_log_tensor() + + def init_reference_weights( + self, state_dict: dict[str, torch.Tensor], device_id: int = 0 + ) -> None: + del state_dict, device_id + self.is_ref_weights_init = True + + def init_tilert_weights(self, state_dict: dict[str, torch.Tensor]) -> None: + del state_dict + self.is_tilert_weights_init = True + + def init_random_weights(self) -> None: + self.is_ref_weights_init = True + self.is_tilert_weights_init = True + + def init_tilert_vars(self, batch_size: int, seq_len: int) -> None: + del batch_size, seq_len + self.profile_logs = get_profile_log_tensor() + self.is_var_init = True + + def golden_forward( + self, + q_nope: torch.Tensor, + q_pe: torch.Tensor, + kv_cache: torch.Tensor, + pe_cache: torch.Tensor, + topk_indices: torch.Tensor, + cur_pos: torch.Tensor, + ) -> torch.Tensor: + """Flash Sparse MLA golden version. + + Args: + q_nope: Query tensor. (bs, seqlen, heads, dim) + q_pe: Query position embedding tensor. (bs, seqlen, heads, pe_dim) + kv_cache: Key-value tensor. (bs, seqlen_kv, dim) + pe_cache: Key position embedding tensor. (bs, seqlen_kv, pe_dim) + topk_indices: Indices tensor. (bs, seqlen, topk) + cur_pos: cur_pos tensor. 
(1) + """ + batch_size = q_nope.shape[0] + seqlen = q_nope.shape[1] + seqlen_kv = kv_cache.shape[1] + + start_pos = int(cur_pos.item()) + mask = ( + torch.full((seqlen, seqlen_kv), float("-inf")).triu_(start_pos + 1) + if seqlen > 1 + else None + ) + + scores = ( + torch.einsum("bshc,btc->bsht", q_nope.float(), kv_cache.float()) + + torch.einsum("bshr,btr->bsht", q_pe.float(), pe_cache.float()) + ) * self.softmax_scale + index_mask = torch.full( + (batch_size, seqlen, seqlen_kv), float("-inf"), device=q_nope.device + ).scatter_(-1, topk_indices, 0) + if mask is not None: + index_mask += mask + + scores += index_mask.unsqueeze(2) + scores = scores.softmax(dim=-1, dtype=torch.float32) + return torch.einsum("bsht,btc->bshc", scores.to(torch.bfloat16), kv_cache) + + def tilert_forward( + self, + q_nope: torch.Tensor, + q_pe: torch.Tensor, + kv_cache: torch.Tensor, + pe_cache: torch.Tensor, + topk_indices: torch.Tensor, + cur_pos: torch.Tensor, + ) -> torch.Tensor: + """Flash Sparse MLA tilert version. + + Args: + q_nope: Query tensor. (bs, seqlen, heads, dim) + q_pe: Query position embedding tensor. (bs, seqlen, heads, pe_dim) + kv_cache: Key-value tensor. (bs, seqlen_kv, dim) + pe_cache: Key position embedding tensor. (bs, seqlen_kv, pe_dim) + topk_indices: Indices tensor. (bs, seqlen, topk) + cur_pos: cur_pos tensor. 
(1) + """ + batch_size, seqlen, heads, dim = q_nope.shape + v_dim = kv_cache.shape[-1] + + topk_indices = topk_indices.to(torch.int32) + topk_indices = topk_indices[..., : kv_cache.shape[1]] + device = q_nope.device + if any(t.device != device for t in (q_pe, kv_cache, pe_cache, topk_indices, cur_pos)): + raise RuntimeError( + "flash_sparse_mla inputs must be on the same device: " + f"q_nope={device}, q_pe={q_pe.device}, kv_cache={kv_cache.device}, " + f"pe_cache={pe_cache.device}, topk_indices={topk_indices.device}, " + f"cur_pos={cur_pos.device}" + ) + if self.profile_logs is not None and self.profile_logs.device != device: + self.profile_logs = get_profile_log_tensor(device_index=device.index, device=device) + output = torch.zeros( + (batch_size, seqlen, heads, v_dim), dtype=torch.bfloat16, device=device + ) + flash_sparse_mla( + q_nope, + q_pe, + kv_cache, + pe_cache, + topk_indices, + cur_pos, + output, + self.profile_logs, + model_arch=self.model_args.arch_name, + ) + return output + + def to_tilert_weights(self) -> None: + raise NotImplementedError("to_tilert_weights not implemented") + + def __call__( + self, + q_nope: torch.Tensor, + q_pe: torch.Tensor, + kv_cache: torch.Tensor, + pe_cache: torch.Tensor, + topk_indices: torch.Tensor, + cur_pos: torch.Tensor, + ) -> torch.Tensor: + if self.flag_enable_tilert: + return self.tilert_forward(q_nope, q_pe, kv_cache, pe_cache, topk_indices, cur_pos) + return self.golden_forward(q_nope, q_pe, kv_cache, pe_cache, topk_indices, cur_pos) diff --git a/tilert/models/glm_5/_dsa_v32/ops/layernorm_rope_rotate.py b/tilert/models/glm_5/_dsa_v32/ops/layernorm_rope_rotate.py new file mode 100644 index 0000000..4fc8c0d --- /dev/null +++ b/tilert/models/glm_5/_dsa_v32/ops/layernorm_rope_rotate.py @@ -0,0 +1,243 @@ +"""Layernorm_rope_rotate operation module.""" + +from dataclasses import dataclass +from enum import Enum + +import torch +import torch.nn.functional as F + +from tilert.models.base import TileRTModule +from 
tilert.models.glm_5._dsa_v32.model_args import ModelArgs +from tilert.models.glm_5._dsa_v32.ops.rotate import rotate_activation +from tilert.models.utils import apply_rotary_emb +from tilert.utils import get_profile_log_tensor + +__all__ = [ + "layernorm_rope_rotate", + "LayerNormRoPERotate", + "LayerNormRoPERotateRefWeightsAlias", + "LayerNormRoPERotateTilertWeightsAlias", +] + + +def layernorm_rope_rotate( + input_raw: torch.Tensor, + cur_pos: torch.Tensor, + k_cache_raw: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + freqs_cis: torch.Tensor, + profile_logs: torch.Tensor, + model_arch: str, + compute_kernel_type: str = "general", +) -> None: + """ + Layernorm_rope_rotate operation. + + Layernorm_rope_rotate the input tensor `input_raw` and stores the result in `k_cache_raw`. + + Args: + input_raw (torch.Tensor): The input tensor. + cur_pos (torch.Tensor): The current position tensor. + k_cache_raw (torch.Tensor): The output tensor where the result will be stored. + weight (torch.Tensor): The weight tensor. + bias (torch.Tensor): The bias tensor. + freqs_cis (torch.Tensor): The frequency tensor. + profile_logs (torch.Tensor): Tensor for storing profiling logs. 
+ + Returns: + None + """ + if input_raw.dtype != torch.bfloat16: + raise ValueError("input must be a bfloat16 tensor.") + if cur_pos.dtype != torch.int32: + raise ValueError("cur_pos must be a int32 tensor.") + if k_cache_raw.dtype != torch.bfloat16: + raise ValueError("k_cache must be a bfloat16 tensor.") + + if weight.dtype != torch.float32: + raise ValueError("weight must be a float32 tensor.") + + if bias.dtype != torch.float32: + raise ValueError("bias must be a float32 tensor.") + + if freqs_cis.dtype != torch.float32: + raise ValueError("freqs_cis must be a float32 tensor.") + + batch, seq, dim = input_raw.shape + if dim != 128: + raise ValueError("dim must be 128, as we precompute scale inner kernel") + if batch != 1: + raise ValueError("batch must be 1 in this version") + + torch.ops.tilert.layernorm_rope_rotate_op( + input_raw, + cur_pos, + k_cache_raw, + weight, + bias, + freqs_cis, + model_arch, + compute_kernel_type, + profile_logs, + ) + + +@dataclass +class LayerNormRoPERotateRefWeightsAlias: + """Reference weights alias for LayerNormRoPERotate.""" + + k_weight = "self_attn.indexer.k_norm.weight" + k_bias = "self_attn.indexer.k_norm.bias" + + @property + def ref_tensor_alias(self) -> list[str]: + return [self.k_weight, self.k_bias] + + def __call__(self) -> list[str]: + return self.ref_tensor_alias + + +@dataclass +class LayerNormRoPERotateTilertWeightsAlias: + """TileRT weights alias for LayerNormRoPERotate.""" + + k_weight = "k_weights" + k_bias = "k_bias" + + @property + def tilert_tensor_alias(self) -> list[str]: + return [self.k_weight, self.k_bias] + + def __call__(self) -> list[str]: + return self.tilert_tensor_alias + + +class LayerNormRoPERotateAlgorithm(Enum): + """LayerNormRoPERotate algorithm.""" + + GENERAL = "general" + + +class LayerNormRoPERotate(TileRTModule): + """LayerNormRoPERotate module: LayerNorm + RoPE + rotate on K indexer output.""" + + _SUPPORTED_ALGORITHMS = { + "deepseek_v3_2": [LayerNormRoPERotateAlgorithm.GENERAL], + 
"glm_5": [LayerNormRoPERotateAlgorithm.GENERAL], + } + + def __init__( + self, + model_args: ModelArgs, + num_devices: int, + device_id: int, + ref_weights_alias: LayerNormRoPERotateRefWeightsAlias | None = None, + ): + super().__init__( + self.__class__.__name__, + model_args=model_args, + num_devices=num_devices, + device_id=device_id, + ) + + self.tilert_weights_alias = LayerNormRoPERotateTilertWeightsAlias() + self.ref_weights_alias = ( + ref_weights_alias + if ref_weights_alias is not None + else LayerNormRoPERotateRefWeightsAlias() + ) + + self.rope_head_dim = self.model_args.qk_rope_head_dim + self.head_dim = self.model_args.index_head_dim + + self.ref_weight: torch.Tensor | None = None + self.ref_bias: torch.Tensor | None = None + self.tilert_weight: torch.Tensor | None = None + self.tilert_bias: torch.Tensor | None = None + self.output: torch.Tensor | None = None + self.profile_logs: torch.Tensor | None = None + + def get_weights_list(self) -> list[torch.Tensor]: + return [self.tilert_weight, self.tilert_bias] + + def device_sharding(self, weights_map: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]: + """ + Device sharding: replicate weight and bias for each device. + + Args: + weights_map: Map from ref weight alias to tensor. + + Returns: + Map from tilert weight alias to (num_devices, ...) tensors. 
+ """ + k_weight = weights_map[self.ref_weights_alias.k_weight][None, ...].repeat( + self.num_devices, 1 + ) + k_bias = weights_map[self.ref_weights_alias.k_bias][None, ...].repeat(self.num_devices, 1) + return { + self.tilert_weights_alias.k_weight: k_weight, + self.tilert_weights_alias.k_bias: k_bias, + } + + def init_reference_weights(self, state_dict: dict[str, torch.Tensor]) -> None: + self.ref_weight = state_dict[self.ref_weights_alias.k_weight].contiguous().float() + self.ref_bias = state_dict[self.ref_weights_alias.k_bias].contiguous().float() + + def init_tilert_weights(self, state_dict: dict[str, torch.Tensor]) -> None: + self.tilert_weight = state_dict[self.tilert_weights_alias.k_weight].contiguous().float() + self.tilert_bias = state_dict[self.tilert_weights_alias.k_bias].contiguous().float() + + def init_random_weights(self) -> None: + ref_weight = torch.ones(self.head_dim, dtype=torch.float32) + ref_bias = torch.zeros(self.head_dim, dtype=torch.float32) + ref_state_dict = dict(zip(self.ref_weights_alias(), [ref_weight, ref_bias])) + self.init_reference_weights(ref_state_dict) + self.init_tilert_weights( + {_k: _v[self.device_id] for _k, _v in self.device_sharding(ref_state_dict).items()} + ) + + def init_tilert_vars(self, batch_size: int, seq_len: int) -> None: + self.cur_pos = torch.tensor([0], dtype=torch.int32) + self.output = torch.zeros((batch_size, seq_len, self.head_dim), dtype=torch.bfloat16) + self.profile_logs = get_profile_log_tensor() + self.is_var_init = True + + def golden_forward(self, idx_k: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor: + assert self.ref_weight is not None and self.ref_bias is not None + k = F.layer_norm( + idx_k.float(), + (self.head_dim,), + self.ref_weight, + self.ref_bias, + 1e-6, + ).to(idx_k.dtype) + k_pe, k_nope = torch.split( + k, [self.rope_head_dim, self.head_dim - self.rope_head_dim], dim=-1 + ) + k_pe = apply_rotary_emb(k_pe.unsqueeze(2), freqs_cis).squeeze(2) + k = torch.cat([k_pe, k_nope], 
dim=-1) + return rotate_activation(k) + + def tilert_forward(self, idx_k: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor: + assert self.tilert_weight is not None and self.tilert_bias is not None + assert self.output is not None and self.profile_logs is not None + rope_freqs = ( + torch.view_as_real(freqs_cis).reshape(*freqs_cis.shape[:-1], -1).float().unsqueeze(1) + ) + layernorm_rope_rotate( + idx_k, + self.cur_pos, + self.output, + self.tilert_weight, + self.tilert_bias, + rope_freqs, + self.profile_logs, + model_arch=self.model_args.arch_name, + ) + return self.output + + def __call__(self, idx_k: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor: + if self.flag_enable_tilert: + return self.tilert_forward(idx_k, freqs_cis) + return self.golden_forward(idx_k, freqs_cis) diff --git a/tilert/models/glm_5/_dsa_v32/ops/padded_allreduce_add.py b/tilert/models/glm_5/_dsa_v32/ops/padded_allreduce_add.py new file mode 100644 index 0000000..a6490c9 --- /dev/null +++ b/tilert/models/glm_5/_dsa_v32/ops/padded_allreduce_add.py @@ -0,0 +1,147 @@ +"""PaddedAllReduceAdd operation module.""" + +from enum import Enum + +import torch + +from tilert.models.base import TileRTModule +from tilert.models.glm_5._dsa_v32.model_args import ModelArgs +from tilert.utils import get_profile_log_tensor + +__all__ = [ + "padded_allreduce_add", + "PaddedAllReduceAdd", +] + + +def padded_allreduce_add( + partial_buf: torch.Tensor, + x_in: torch.Tensor, + flag: int, + vec_out: torch.Tensor, + profile_logs: torch.Tensor, + model_arch: str, + compute_kernel_type: str = "bf16", +) -> None: + """Padded AllReduce + residual add for Device Group A (GPU 0). + + GPU 0 contributes zeros to the 8-GPU AllReduce, then adds the residual. + + Args: + partial_buf: Zero-filled partial buffer [1, L, hidden_dim] bf16. + x_in: Residual input [1, L, hidden_dim] bf16. + flag: AllReduce sync flag. + vec_out: Output tensor [1, L, hidden_dim] bf16. + profile_logs: Profile logs tensor. 
+ model_arch: Model architecture ("deepseek_v3_2" or "glm_5"). + compute_kernel_type: Compute kernel type ("bf16"). + """ + torch.ops.tilert.padded_allreduce_add_op( + partial_buf, x_in, flag, vec_out, profile_logs, model_arch, compute_kernel_type + ) + + +class PaddedAllReduceAddAlgorithm(Enum): + """PaddedAllReduceAdd algorithm.""" + + BF16 = "bf16" + + +class PaddedAllReduceAdd(TileRTModule): + """PaddedAllReduceAdd module — zero-partial AllReduce + residual add.""" + + _SUPPORTED_ALGORITHMS = { + "deepseek_v3_2": [PaddedAllReduceAddAlgorithm.BF16], + "glm_5": [PaddedAllReduceAddAlgorithm.BF16], + } + + def __init__( + self, + model_args: ModelArgs, + num_devices: int, + device_id: int = 0, + ): + super().__init__( + self.__class__.__name__, + model_args=model_args, + num_devices=num_devices, + device_id=device_id, + ) + + self.dim = self.model_args.dim + + self.partial_buf: torch.Tensor | None = None + + self.hidden_out: torch.Tensor | None = None + + self.profile_logs: torch.Tensor | None = None + self.is_var_init = False + + def init_tilert_vars(self, batch_size: int, seq_len: int) -> None: + """Allocate output buffer and persistent zero-filled partial buffer. + + Args: + batch_size: Batch size. + seq_len: Sequence length. + """ + self.hidden_out = torch.zeros( + (batch_size, seq_len, self.dim), + dtype=torch.bfloat16, + device=f"cuda:{self.device_id}", + ) + self.partial_buf = torch.zeros( + (batch_size, seq_len, self.dim), + dtype=torch.bfloat16, + device=f"cuda:{self.device_id}", + ) + self.profile_logs = get_profile_log_tensor(device=f"cuda:{self.device_id}") + self.is_var_init = True + + def golden_forward( + self, + x_in: torch.Tensor, + ) -> torch.Tensor: + """Golden reference: allreduce(zeros) + x_in = x_in (single-GPU). + + On a single GPU, allreduce of zeros returns zeros, so output = x_in. + + Args: + x_in: Residual input [1, L, hidden_dim]. + + Returns: + Output tensor (copy of x_in). 
+ """ + return x_in.clone() + + def tilert_forward( + self, + x_in: torch.Tensor, + flag: int, + ) -> torch.Tensor: + """Run TileRT kernel forward. + + Args: + x_in: Residual input [1, L, hidden_dim]. + flag: AllReduce sync flag. + + Returns: + Output tensor [1, L, hidden_dim]. + """ + assert self.hidden_out is not None + assert self.partial_buf is not None + assert self.profile_logs is not None + padded_allreduce_add( + self.partial_buf, + x_in, + flag, + self.hidden_out, + self.profile_logs, + model_arch=self.model_args.arch_name, + ) + return self.hidden_out + + def __call__( + self, + x_in: torch.Tensor, + ) -> torch.Tensor: + return self.golden_forward(x_in) diff --git a/tilert/models/glm_5/_dsa_v32/ops/projo_wkvb.py b/tilert/models/glm_5/_dsa_v32/ops/projo_wkvb.py new file mode 100644 index 0000000..3e99f0e --- /dev/null +++ b/tilert/models/glm_5/_dsa_v32/ops/projo_wkvb.py @@ -0,0 +1,483 @@ +"""ProjOWkvb operation module.""" + +import math +from dataclasses import dataclass +from enum import Enum + +import torch + +from tilert.models.base import TileRTModule, TilertWeightsConverter +from tilert.models.common import init_func, weight_dequant +from tilert.models.glm_5._dsa_v32.model_args import ModelArgs +from tilert.utils import get_profile_log_tensor + +__all__ = [ + "projo_wkvb", + "ProjoWKVb", + "ProjoWKVbAlgorithm", + "ProjoWKVbWeightsConverter", + "ProjoWKVbRefWeightsAlias", + "ProjoWKVbTilertWeightsAlias", +] + + +def projo_wkvb( + o_in: torch.Tensor, + wkv_b_b: torch.Tensor, + wkv_b_scales: torch.Tensor, + output: torch.Tensor, + profile_logs: torch.Tensor, + model_arch: str, + compute_kernel_type: str = "fp16mma", +) -> None: + """ + Define the ProjOWkvb operation. + + Args: + o_in: Input tensor. + wkv_b_b: Weight tensor. + wkv_b_scales: Scale tensor. + output: Output tensor. + profile_logs: Profile logs tensor. + model_arch: Model architecture ("deepseek_v3_2" or "glm_5"). + compute_kernel_type: Kernel type ("fp16mma" for both DSv32 and GLM5). 
+ """ + torch.ops.tilert.projo_wkvb_op( + o_in, + wkv_b_b, + wkv_b_scales, + output, + model_arch, + compute_kernel_type, + profile_logs, + torch.empty(0, dtype=torch.int64, device=o_in.device), + ) + + +class ProjoWKVbAlgorithm(Enum): + """ProjoWKVb algorithm""" + + GENERAL = "general" + FP16MMA = "fp16mma" + BF16MMA = "bf16mma" + + +class ProjoWKVbWeightsConverter(TilertWeightsConverter): + def __init__(self, model_args: ModelArgs, num_devices: int): + super().__init__(model_args, num_devices) + + @staticmethod + def _swizzle_mma_16x16(mat_in: torch.Tensor) -> torch.Tensor: + """Swizzle a [*, 16, 16] block for the packed weight layout.""" + assert mat_in.shape[-2] == 16 and mat_in.shape[-1] == 16 + pre_shape = mat_in.shape[:-2] + mat_in = mat_in.reshape(*pre_shape, 2, 8, 2, 4, 2).transpose(-4, -3).transpose(-5, -4) + return mat_in.reshape(*pre_shape, 2 * 2, 8 * 4, 2).transpose(-3, -2) + + @staticmethod + def _swizzle_mma_16x16_for_pages(mat_in: torch.Tensor, k_dim: int, pages: int) -> torch.Tensor: + """Swizzle a [*, 16, K] matrix for the paged weight layout.""" + assert mat_in.shape[-2] == 16 and mat_in.shape[-1] == k_dim + pre_shape = mat_in.shape[:-2] + k_per_page = k_dim // pages + n_k_tiles = k_per_page // 16 + mat_in = mat_in.reshape(*pre_shape, 16, pages, k_per_page).transpose(-3, -2) + mat_in = mat_in.reshape(*pre_shape, pages, 16, n_k_tiles, 16).transpose(-3, -2) + mat_in = ProjoWKVbWeightsConverter._swizzle_mma_16x16(mat_in) + return mat_in.contiguous() + + def convert_to_fp16mma(self, weights: list[torch.Tensor]) -> torch.Tensor: + """Convert weights to the packed format expected by the kernel.""" + with torch.inference_mode(): + wkv_b_b, wkv_b_b_scales = self.convert_to_general(weights) + + n_heads = wkv_b_b.size(0) + v_head_dim = wkv_b_b.size(1) + kv_lora_rank = wkv_b_b.size(2) + num_ctas = 80 + rows_per_cta = (n_heads * v_head_dim) // num_ctas + + is_glm5 = self.model_args.arch_name == "glm_5" + + w_flat = wkv_b_b.reshape(num_ctas, rows_per_cta // 
16, 16, kv_lora_rank) + w_swizzled = ProjoWKVbWeightsConverter._swizzle_mma_16x16_for_pages( + w_flat, kv_lora_rank, pages=1 + ) + w_bytes = w_swizzled.reshape(num_ctas, -1) + + scale_k_block = 128 + n_scale_k = kv_lora_rank // scale_k_block + ctas_per_head = num_ctas // n_heads + + if is_glm5: + ctas_per_scale_row = 64 // rows_per_cta + scales_per_cta = wkv_b_b_scales.repeat_interleave(ctas_per_scale_row, dim=1) + scales_per_cta = scales_per_cta.reshape(num_ctas, n_scale_k) + else: + scales_per_cta = wkv_b_b_scales.squeeze(1).repeat_interleave(ctas_per_head, dim=0) + + scale_dtype = torch.float32 + scales_per_cta = scales_per_cta.to(scale_dtype) + + mat_bytes = rows_per_cta * kv_lora_rank + scale_bytes = n_scale_k * 4 + page_size = (mat_bytes + scale_bytes + 127) // 128 * 128 + + scales_raw = scales_per_cta.contiguous().view(torch.float8_e4m3fn) + padding_size = page_size - mat_bytes - scales_raw.shape[-1] + padding = torch.zeros( + num_ctas, padding_size, dtype=torch.float8_e4m3fn, device=wkv_b_b.device + ) + return torch.cat([w_bytes, scales_raw, padding], dim=-1).contiguous() + + def convert_to_bf16mma(self, weights: list[torch.Tensor]) -> torch.Tensor: + """Convert weights to the packed format expected by the BF16 kernel.""" + with torch.inference_mode(): + tilert_wkv_b_weights, tilert_wkv_b_scales = weights + + wkvb_head_dim = self.model_args.qk_nope_head_dim + self.model_args.v_head_dim + left_head_dim = wkvb_head_dim % self.model_args.block_size + hd_block = left_head_dim if left_head_dim != 0 else self.model_args.block_size + + if self.model_args.n_heads % self.num_devices == 0: + n_local_heads = self.model_args.n_heads // self.num_devices + else: + n_local_heads = math.ceil(self.model_args.n_heads / self.num_devices) + if n_local_heads % 2 != 0: + n_local_heads += 1 + + v_head_dim = self.model_args.v_head_dim + kv_lora_rank = self.model_args.kv_lora_rank + n_block = self.model_args.block_size + + w = tilert_wkv_b_weights + s = tilert_wkv_b_scales + if 
self.model_args.n_heads % self.num_devices != 0: + n_current = w.size(0) + if n_current < n_local_heads: + pad_w = torch.zeros( + n_local_heads - n_current, *w.shape[1:], dtype=w.dtype, device=w.device + ) + w = torch.cat([w, pad_w], dim=0) + pad_s = torch.zeros( + n_local_heads - n_current, *s.shape[1:], dtype=s.dtype, device=s.device + ) + s = torch.cat([s, pad_s], dim=0) + + s = s.float() + s = s.repeat_interleave(hd_block, dim=1).repeat_interleave(n_block, dim=2) + wkv_bf16 = (w.float() * s).to(torch.bfloat16) + n_heads = n_local_heads + + num_ctas = 80 + rows_per_cta = (n_heads * v_head_dim) // num_ctas + + w_flat = wkv_bf16.reshape(num_ctas, rows_per_cta // 16, 16, kv_lora_rank) + w_swizzled = ProjoWKVbWeightsConverter._swizzle_mma_16x16_for_pages( + w_flat, kv_lora_rank, pages=1 + ) + w_bytes = w_swizzled.reshape(num_ctas, -1).contiguous().view(torch.float8_e4m3fn) + + mat_bytes = rows_per_cta * kv_lora_rank * 2 + page_size = (mat_bytes + 127) // 128 * 128 + padding_size = page_size - w_bytes.shape[-1] + + if padding_size > 0: + padding = torch.zeros( + num_ctas, padding_size, dtype=torch.float8_e4m3fn, device=wkv_bf16.device + ) + return torch.cat([w_bytes, padding], dim=-1).contiguous() + return w_bytes.contiguous() + + def convert_to_general(self, weights: list[torch.Tensor]) -> tuple[torch.Tensor, torch.Tensor]: + with torch.inference_mode(): + tilert_wkv_b_weights, tilert_wkv_b_scales = weights + + wkv_b_b = tilert_wkv_b_weights.contiguous() + wkv_b_b_scales = tilert_wkv_b_scales.contiguous() + if self.model_args.arch_name == "glm_5": + if wkv_b_b_scales.dtype != torch.float32: + print( + "Warning: ProjoWKVbWeightsConverter: " + + f"wkv_b_b_scales.dtype: {wkv_b_b_scales.dtype} " + + "is not float32, convert to float32." 
+ ) + wkv_b_b_scales = wkv_b_b_scales.to(torch.float32) + else: + wkv_b_b_scales = wkv_b_b_scales.to(torch.bfloat16) + + wkv_b_b = wkv_b_b.detach() + wkv_b_b_scales = wkv_b_b_scales.detach() + + if self.model_args.n_heads % self.num_devices != 0: + n_target = math.ceil(self.model_args.n_heads / self.num_devices) + if n_target % 2 != 0: + n_target += 1 + n_current = wkv_b_b.size(0) + if n_current < n_target: + pad_b = torch.zeros( + n_target - n_current, + *wkv_b_b.shape[1:], + dtype=wkv_b_b.dtype, + device=wkv_b_b.device, + ) + wkv_b_b = torch.cat([wkv_b_b, pad_b], dim=0) + pad_s = torch.zeros( + n_target - n_current, + *wkv_b_b_scales.shape[1:], + dtype=wkv_b_b_scales.dtype, + device=wkv_b_b_scales.device, + ) + wkv_b_b_scales = torch.cat([wkv_b_b_scales, pad_s], dim=0) + wkv_b_b = wkv_b_b.contiguous() + wkv_b_b_scales = wkv_b_b_scales.contiguous() + + return wkv_b_b, wkv_b_b_scales + + +@dataclass +class ProjoWKVbRefWeightsAlias: + """Reference weights alias for ProjoWKVb.""" + + wkv_b_weights = "self_attn.kv_b_proj.weight" + wkv_b_scales = "self_attn.kv_b_proj.weight_scale_inv" + + @property + def ref_tensor_alias(self) -> list[str]: + return [self.wkv_b_weights, self.wkv_b_scales] + + def __call__(self) -> list[str]: + return self.ref_tensor_alias + + +@dataclass +class ProjoWKVbTilertWeightsAlias: + """TileRT weights alias for ProjoWKVb.""" + + wkv_b_weights = "wkv_b2_weights" + wkv_b_scales = "wkv_b2_scales" + + @property + def tilert_tensor_alias(self) -> list[str]: + return [self.wkv_b_weights, self.wkv_b_scales] + + def __call__(self) -> list[str]: + return self.tilert_tensor_alias + + +class ProjoWKVb(TileRTModule): + """ProjoWKVb module: O projection (wkv_b) for output.""" + + _SUPPORTED_ALGORITHMS = { + "deepseek_v3_2": [ProjoWKVbAlgorithm.FP16MMA], + "glm_5": [ProjoWKVbAlgorithm.FP16MMA], + } + + def __init__( + self, + model_args: ModelArgs, + num_devices: int, + device_id: int = 0, + ref_weights_alias: ProjoWKVbRefWeightsAlias | None = None, + ): + 
super().__init__( + self.__class__.__name__, + model_args=model_args, + num_devices=num_devices, + device_id=device_id, + ) + + self.tilert_weights_alias = ProjoWKVbTilertWeightsAlias() + self.ref_weights_alias = ( + ref_weights_alias if ref_weights_alias is not None else ProjoWKVbRefWeightsAlias() + ) + + self.ref_wkv_b: torch.Tensor | None = None + self.tilert_wkv_b_b: torch.Tensor | None = None + self.tilert_wkv_b_b_scales: torch.Tensor | None = None + self.output: torch.Tensor | None = None + self.profile_logs: torch.Tensor | None = None + + if self.model_args.n_heads % self.num_devices == 0: + self.num_local_heads = self.model_args.n_heads // self.num_devices + else: + n_local = math.ceil(self.model_args.n_heads / self.num_devices) + if n_local % 2 != 0: + n_local += 1 + self.num_local_heads = n_local + + self.wkvb_lora_rank = self.model_args.kv_lora_rank + self.wkvb_lora_rank_qsize = self.wkvb_lora_rank // self.model_args.block_size + + self.wkvb_head_dim = self.model_args.qk_nope_head_dim + self.model_args.v_head_dim + self.wkvb_v_head_dim = self.model_args.v_head_dim + left_head_dim = self.wkvb_head_dim % self.model_args.block_size + if left_head_dim != 0: + assert self.model_args.block_size % left_head_dim == 0 + self.head_dim_block_size = left_head_dim + self.head_dim_scale_repeat = self.model_args.block_size // self.head_dim_block_size + else: + self.head_dim_scale_repeat = 1 + self.head_dim_block_size = self.model_args.block_size + self.wkvb_head_qsize = self.wkvb_head_dim // self.head_dim_block_size + self.wkvb_v_head_qsize = self.wkvb_v_head_dim // self.head_dim_block_size + + self.compute_kernel_type = "fp16mma" + + def get_weights_list(self) -> list[torch.Tensor]: + return [self.tilert_wkv_b_b, self.tilert_wkv_b_b_scales] + + def device_sharding(self, weights_map: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]: + """ + Device sharding: split weights and scales per device. + + Args: + weights_map: Map from ref weight alias to tensor. 
+ + Returns: + Map from tilert weight alias to (num_devices, ...) tensors. + """ + kv_b_proj_weight = weights_map[self.ref_weights_alias.wkv_b_weights] + kv_b_proj_weight_scale = weights_map[self.ref_weights_alias.wkv_b_scales] + + if self.model_args.n_heads % self.num_devices == 0: + dev_weights = kv_b_proj_weight.view( + self.num_devices, self.num_local_heads, self.wkvb_head_dim, self.wkvb_lora_rank + ) + dev_scale_rows = self.num_local_heads * self.wkvb_head_dim // self.model_args.block_size + dev_scales = kv_b_proj_weight_scale.view( + self.num_devices, dev_scale_rows, 1, self.wkvb_lora_rank_qsize + ) + else: + from tilert.models.glm_5._dsa_v32.ops.rmsnorm_projq_wqb import ( + RmsnormProjqWqbWeightsConverter, + ) + + wq_b_list, scale_list = RmsnormProjqWqbWeightsConverter._redistribute_heads( + kv_b_proj_weight, + kv_b_proj_weight_scale, + n_total_heads=self.model_args.n_heads, + n_local_heads=self.num_local_heads, + num_devices=self.num_devices, + qk_head_dim=self.wkvb_head_dim, + block_size=self.model_args.block_size, + ) + dev_weights = torch.stack(wq_b_list, dim=0).view( + self.num_devices, self.num_local_heads, self.wkvb_head_dim, self.wkvb_lora_rank + ) + dev_scale_rows = self.num_local_heads * self.wkvb_head_dim // self.model_args.block_size + dev_scales = torch.stack(scale_list, dim=0).view( + self.num_devices, dev_scale_rows, 1, self.wkvb_lora_rank_qsize + ) + + wkvb = dev_weights[:, :, -self.wkvb_v_head_dim :] + wkvb_scales = ( + dev_scales.contiguous() + .repeat(1, 1, self.head_dim_scale_repeat, 1) + .view( + self.num_devices, + self.num_local_heads, + self.wkvb_head_qsize, + self.wkvb_lora_rank_qsize, + ) + .contiguous()[:, :, -self.wkvb_v_head_qsize :] + ) + return { + self.tilert_weights_alias.wkv_b_weights: wkvb.contiguous(), + self.tilert_weights_alias.wkv_b_scales: wkvb_scales.contiguous(), + } + + def init_reference_weights(self, state_dict: dict[str, torch.Tensor]) -> None: + sharding_size = self.num_local_heads * self.wkvb_head_dim + 
sharding_start = self.device_id * sharding_size + sharding_end = sharding_start + sharding_size + wkv_b = weight_dequant( + state_dict[self.ref_weights_alias.wkv_b_weights], + state_dict[self.ref_weights_alias.wkv_b_scales], + ) + wkv_b = wkv_b[sharding_start:sharding_end, :] + wkv_b = wkv_b.view(self.num_local_heads, self.wkvb_head_dim, self.wkvb_lora_rank) + self.ref_wkv_b = wkv_b[:, -self.wkvb_v_head_dim :] + + def init_tilert_weights(self, state_dict: dict[str, torch.Tensor]) -> None: + self.init_tilert_weights_hmma(state_dict) + + def init_tilert_weights_hmma(self, state_dict: dict[str, torch.Tensor]) -> None: + """Initialize with HMMA-packed weights.""" + packed = ProjoWKVbWeightsConverter(self.model_args, self.num_devices).dispatch( + ProjoWKVbAlgorithm.FP16MMA, + [ + state_dict[self.tilert_weights_alias.wkv_b_weights], + state_dict[self.tilert_weights_alias.wkv_b_scales], + ], + ) + self.tilert_wkv_b_b = packed + self.tilert_wkv_b_b_scales = torch.empty(1, dtype=torch.float8_e4m3fn, device=packed.device) + self.compute_kernel_type = "fp16mma" + + def init_tilert_weights_hmma_bf16(self, state_dict: dict[str, torch.Tensor]) -> None: + """Initialize with BF16 HMMA-packed weights (dequantized, no scales).""" + packed = ProjoWKVbWeightsConverter(self.model_args, self.num_devices).dispatch( + ProjoWKVbAlgorithm.BF16MMA, + [ + state_dict[self.tilert_weights_alias.wkv_b_weights], + state_dict[self.tilert_weights_alias.wkv_b_scales], + ], + ) + self.tilert_wkv_b_b = packed + self.tilert_wkv_b_b_scales = torch.empty(1, dtype=torch.float8_e4m3fn, device=packed.device) + self.compute_kernel_type = "bf16mma" + + def init_random_weights(self) -> None: + padded_total_heads = self.num_local_heads * self.num_devices + wkv_b = init_func( + torch.empty( + padded_total_heads * self.wkvb_head_dim, + self.wkvb_lora_rank, + dtype=torch.float8_e4m3fn, + ) + ) + wkv_b_scales = init_func( + torch.empty( + padded_total_heads * self.wkvb_head_dim // self.model_args.block_size, + 
self.wkvb_lora_rank_qsize, + dtype=torch.float32, + ) + ) + ref_state_dict = dict( + zip( + self.ref_weights_alias(), + [wkv_b, wkv_b_scales], + ) + ) + self.init_reference_weights(ref_state_dict) + sharded = self.device_sharding(ref_state_dict) + self.init_tilert_weights({k: v[self.device_id] for k, v in sharded.items()}) + + def init_tilert_vars(self, batch_size: int, seq_len: int) -> None: + self.output = torch.zeros( + (batch_size, seq_len, self.num_local_heads, self.wkvb_v_head_dim), + dtype=torch.bfloat16, + ) + self.profile_logs = get_profile_log_tensor() + self.is_var_init = True + + def golden_forward(self, x_out: torch.Tensor) -> torch.Tensor: + assert self.ref_wkv_b is not None + return torch.einsum("bshc,hdc->bshd", x_out, self.ref_wkv_b) + + def tilert_forward(self, x_out: torch.Tensor) -> torch.Tensor: + assert self.tilert_wkv_b_b is not None + assert self.tilert_wkv_b_b_scales is not None + assert self.output is not None + assert self.profile_logs is not None + projo_wkvb( + x_out, + self.tilert_wkv_b_b, + self.tilert_wkv_b_b_scales, + self.output, + self.profile_logs, + model_arch=self.model_args.arch_name, + compute_kernel_type=self.compute_kernel_type, + ) + return self.output diff --git a/tilert/models/glm_5/_dsa_v32/ops/projq_wqb.py b/tilert/models/glm_5/_dsa_v32/ops/projq_wqb.py new file mode 100644 index 0000000..c40ca51 --- /dev/null +++ b/tilert/models/glm_5/_dsa_v32/ops/projq_wqb.py @@ -0,0 +1,466 @@ +"""ProjQB operation module.""" + +import math +from dataclasses import dataclass +from enum import Enum + +import torch + +from tilert.models.base import TileRTModule, TilertWeightsConverter +from tilert.models.common import init_func, weight_dequant +from tilert.models.glm_5._dsa_v32.model_args import ModelArgs +from tilert.utils import get_profile_log_tensor + +__all__ = [ + "projq_wqb", + "ProjqWqb", + "ProjqWqbAlgorithm", + "ProjqWqbWeightsConverter", + "ProjqWqbRefWeightsAlias", + "ProjqWqbTilertWeightsAlias", +] + + +def projq_wqb( + 
q_nope_in: torch.Tensor, + wkv_b_a: torch.Tensor, + wkv_b_a_scales: torch.Tensor, + output: torch.Tensor, + profile_logs: torch.Tensor, + compute_kernel_type: str = "fp16mma", + *, + model_arch: str, +) -> None: + """ + Define the ProjqWqb operation. + + Args: + q_nope_in: Input tensor. + wkv_b_a: Weight tensor. + wkv_b_a_scales: Scale tensor. + output: Output tensor. + profile_logs: Profile logs tensor. + compute_kernel_type: Kernel type ("fp16mma"). + model_arch: Model architecture ("deepseek_v3_2" or "glm_5"). + """ + torch.ops.tilert.projq_wqb_op( + q_nope_in, + wkv_b_a, + wkv_b_a_scales, + output, + model_arch, + compute_kernel_type, + profile_logs, + ) + + +class ProjqWqbAlgorithm(Enum): + """ProjqWqb algorithm""" + + GENERAL = "general" + FP16MMA = "fp16mma" + BF16MMA = "bf16mma" + + +class ProjqWqbWeightsConverter(TilertWeightsConverter): + def __init__(self, model_args: ModelArgs, num_devices: int, head_dim_block_size: int): + super().__init__(model_args, num_devices) + self.head_dim_block_size = head_dim_block_size + self.impl_block_size = 64 + + @staticmethod + def _swizzle_mma_16x16(mat_in: torch.Tensor) -> torch.Tensor: + """Swizzle a [*, 16, 16] block for the packed weight layout.""" + assert mat_in.shape[-2] == 16 and mat_in.shape[-1] == 16 + pre_shape = mat_in.shape[:-2] + mat_in = mat_in.reshape(*pre_shape, 2, 8, 2, 4, 2).transpose(-4, -3).transpose(-5, -4) + return mat_in.reshape(*pre_shape, 2 * 2, 8 * 4, 2).transpose(-3, -2) + + @staticmethod + def _swizzle_mma_16x16_for_pages(mat_in: torch.Tensor, k_dim: int, pages: int) -> torch.Tensor: + """Swizzle a [*, 16, K] matrix for the paged weight layout.""" + assert mat_in.shape[-2] == 16 and mat_in.shape[-1] == k_dim + pre_shape = mat_in.shape[:-2] + k_per_page = k_dim // pages + n_k_tiles = k_per_page // 16 + mat_in = mat_in.reshape(*pre_shape, 16, pages, k_per_page).transpose(-3, -2) + mat_in = mat_in.reshape(*pre_shape, pages, 16, n_k_tiles, 16).transpose(-3, -2) + mat_in = 
ProjqWqbWeightsConverter._swizzle_mma_16x16(mat_in) + return mat_in.contiguous() + + def convert_to_fp16mma(self, weights: list[torch.Tensor]) -> torch.Tensor: + """Convert weights to the packed format expected by the kernel.""" + with torch.inference_mode(): + wkv_b_a, wkv_b_a_scales = self.convert_to_general(weights) + + n_heads = wkv_b_a.size(0) + head_dim = wkv_b_a.size(2) + kv_lora_rank = wkv_b_a.size(1) + num_ctas = 80 + rows_per_cta = (n_heads * kv_lora_rank) // num_ctas + + is_glm5 = self.model_args.arch_name == "glm_5" + + w_flat = wkv_b_a.reshape(num_ctas, rows_per_cta // 16, 16, head_dim) + w_swizzled = self._swizzle_mma_16x16_for_pages(w_flat, head_dim, pages=1) + w_bytes = w_swizzled.reshape(num_ctas, -1) + + kScalesPerPage = head_dim // 64 + + if is_glm5: + ctas_per_scale_row = 128 // rows_per_cta + scales_expanded = wkv_b_a_scales.repeat_interleave(ctas_per_scale_row, dim=1) + scales_per_cta = scales_expanded.reshape(num_ctas, kScalesPerPage) + scale_dtype = torch.float32 + else: + scales_per_cta = wkv_b_a_scales.reshape(num_ctas, kScalesPerPage) + scale_dtype = torch.bfloat16 + + mat_bytes = rows_per_cta * head_dim + scale_elem_bytes = 4 if scale_dtype == torch.float32 else 2 + scale_bytes = kScalesPerPage * scale_elem_bytes + page_size = (mat_bytes + scale_bytes + 127) // 128 * 128 + + scales_raw = scales_per_cta.to(scale_dtype).contiguous().view(torch.float8_e4m3fn) + padding_size = page_size - mat_bytes - scales_raw.shape[-1] + padding = torch.zeros( + num_ctas, padding_size, dtype=torch.float8_e4m3fn, device=wkv_b_a.device + ) + return torch.cat([w_bytes, scales_raw, padding], dim=-1).contiguous() + + def convert_to_bf16mma(self, weights: list[torch.Tensor]) -> torch.Tensor: + """Convert weights to the packed format expected by the BF16 kernel.""" + with torch.inference_mode(): + tilert_wkv_b_weights, tilert_wkv_b_scales = weights + + if self.model_args.n_heads % self.num_devices == 0: + n_local_heads = self.model_args.n_heads // 
self.num_devices + else: + n_local_heads = math.ceil(self.model_args.n_heads / self.num_devices) + if n_local_heads % 2 != 0: + n_local_heads += 1 + + nope_head_dim = self.model_args.qk_nope_head_dim + kv_lora_rank = self.model_args.kv_lora_rank + hd_block = self.head_dim_block_size + n_block = self.model_args.block_size + + s = tilert_wkv_b_scales.float() + s = s.repeat_interleave(hd_block, dim=1).repeat_interleave(n_block, dim=2) + wkv_bf16 = ( + (tilert_wkv_b_weights.float() * s).transpose(1, 2).contiguous().to(torch.bfloat16) + ) + n_heads = n_local_heads + head_dim = nope_head_dim + + num_ctas = 80 + rows_per_cta = (n_heads * kv_lora_rank) // num_ctas + + w_flat = wkv_bf16.reshape(num_ctas, rows_per_cta // 16, 16, head_dim) + w_swizzled = self._swizzle_mma_16x16_for_pages(w_flat, head_dim, pages=1) + w_bytes = w_swizzled.reshape(num_ctas, -1).contiguous().view(torch.float8_e4m3fn) + + mat_bytes = rows_per_cta * head_dim * 2 + page_size = (mat_bytes + 127) // 128 * 128 + padding_size = page_size - w_bytes.shape[-1] + + if padding_size > 0: + padding = torch.zeros( + num_ctas, padding_size, dtype=torch.float8_e4m3fn, device=wkv_bf16.device + ) + return torch.cat([w_bytes, padding], dim=-1).contiguous() + return w_bytes.contiguous() + + def convert_to_general(self, weights: list[torch.Tensor]) -> tuple[torch.Tensor, torch.Tensor]: + with torch.inference_mode(): + tilert_wkv_b_weights, tilert_wkv_b_scales = weights + + if self.model_args.n_heads % self.num_devices == 0: + n_local_heads = self.model_args.n_heads // self.num_devices + else: + n_local_heads = math.ceil(self.model_args.n_heads / self.num_devices) + if n_local_heads % 2 != 0: + n_local_heads += 1 + + wkv_b = tilert_wkv_b_weights + wkv_b_scales_raw = tilert_wkv_b_scales + wkv_b = wkv_b.view(n_local_heads, -1, self.model_args.kv_lora_rank) + assert self.model_args.kv_lora_rank % self.model_args.block_size == 0 + wkv_b_scales_raw = wkv_b_scales_raw.view( + n_local_heads, -1, self.model_args.kv_lora_rank 
// self.model_args.block_size + ) + wkv_b_a = wkv_b[:, : self.model_args.qk_nope_head_dim].transpose(1, 2).contiguous() + assert self.model_args.qk_nope_head_dim % self.head_dim_block_size == 0 + wkv_b_a_scales = ( + wkv_b_scales_raw[:, : self.model_args.qk_nope_head_dim // self.head_dim_block_size] + .transpose(1, 2) + .contiguous() + ) + if self.model_args.arch_name == "glm_5": + if wkv_b_a_scales.dtype != torch.float32: + print( + "Warning: ProjqWqbWeightsConverter: " + + f"wkv_b_a_scales.dtype: {wkv_b_a_scales.dtype} " + + "is not float32, convert to float32." + ) + wkv_b_a_scales = wkv_b_a_scales.to(torch.float32) + else: + wkv_b_a_scales = wkv_b_a_scales.to(torch.bfloat16) + if self.head_dim_block_size != self.impl_block_size: + repeats = self.head_dim_block_size // self.impl_block_size + wkv_b_a_scales = wkv_b_a_scales.repeat(1, 1, repeats).contiguous() + + wkv_b_a = wkv_b_a.detach() + wkv_b_a_scales = wkv_b_a_scales.detach() + + return wkv_b_a, wkv_b_a_scales + + +@dataclass +class ProjqWqbRefWeightsAlias: + """Reference weights alias for ProjqWqb.""" + + wkv_b_weights = "self_attn.kv_b_proj.weight" + wkv_b_scales = "self_attn.kv_b_proj.weight_scale_inv" + + @property + def ref_tensor_alias(self) -> list[str]: + return [self.wkv_b_weights, self.wkv_b_scales] + + def __call__(self) -> list[str]: + return self.ref_tensor_alias + + +@dataclass +class ProjqWqbTilertWeightsAlias: + """TileRT weights alias for ProjqWqb.""" + + wkv_b_weights = "wkv_b1_weights" + wkv_b_scales = "wkv_b1_scales" + + @property + def tilert_tensor_alias(self) -> list[str]: + return [self.wkv_b_weights, self.wkv_b_scales] + + def __call__(self) -> list[str]: + return self.tilert_tensor_alias + + +class ProjqWqb(TileRTModule): + """ProjqWqb module: Q projection (wkv_b) for KV LoRA.""" + + _SUPPORTED_ALGORITHMS = { + "deepseek_v3_2": [ProjqWqbAlgorithm.FP16MMA], + "glm_5": [ProjqWqbAlgorithm.FP16MMA], + } + + def __init__( + self, + model_args: ModelArgs, + num_devices: int, + device_id: 
int = 0, + ref_weights_alias: ProjqWqbRefWeightsAlias | None = None, + ): + super().__init__( + self.__class__.__name__, + model_args=model_args, + num_devices=num_devices, + device_id=device_id, + ) + + self.tilert_weights_alias = ProjqWqbTilertWeightsAlias() + self.ref_weights_alias = ( + ref_weights_alias if ref_weights_alias is not None else ProjqWqbRefWeightsAlias() + ) + + self.ref_wkv_b: torch.Tensor | None = None + self.tilert_wkv_b_a: torch.Tensor | None = None + self.tilert_wkv_b_a_scales: torch.Tensor | None = None + self.output: torch.Tensor | None = None + self.profile_logs: torch.Tensor | None = None + + self.compute_kernel_type = "fp16mma" + + if self.model_args.n_heads % self.num_devices == 0: + self.num_local_heads = self.model_args.n_heads // self.num_devices + else: + n_local = math.ceil(self.model_args.n_heads / self.num_devices) + if n_local % 2 != 0: + n_local += 1 + self.num_local_heads = n_local + + self.wkvb_lora_rank = self.model_args.kv_lora_rank + self.wkvb_lora_rank_qsize = self.wkvb_lora_rank // self.model_args.block_size + + self.wkvb_head_dim = self.model_args.qk_nope_head_dim + self.model_args.v_head_dim + self.wkvb_nope_head_dim = self.model_args.qk_nope_head_dim + left_head_dim = self.wkvb_head_dim % self.model_args.block_size + if left_head_dim != 0: + assert self.model_args.block_size % left_head_dim == 0 + self.head_dim_block_size = left_head_dim + self.head_dim_scale_repeat = self.model_args.block_size // self.head_dim_block_size + else: + self.head_dim_scale_repeat = 1 + self.head_dim_block_size = self.model_args.block_size + self.wkvb_head_qsize = self.wkvb_head_dim // self.head_dim_block_size + self.wkvb_nope_head_qsize = self.wkvb_nope_head_dim // self.head_dim_block_size + + @property + def tilert_tensor_alias(self) -> list[str]: + return self.tilert_weights_alias.tilert_tensor_alias + + def get_weights_list(self) -> list[torch.Tensor]: + return [self.tilert_wkv_b_a, self.tilert_wkv_b_a_scales] + + def 
device_sharding(self, weights_map: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]: + """ + Device sharding: split weights and scales per device. + + Args: + weights_map: Map from ref weight alias to tensor. + + Returns: + Map from tilert weight alias to (num_devices, ...) tensors. + """ + kv_b_proj_weight = weights_map[self.ref_weights_alias.wkv_b_weights] + kv_b_proj_weight_scale = weights_map[self.ref_weights_alias.wkv_b_scales] + + if self.model_args.n_heads % self.num_devices == 0: + dev_weights = kv_b_proj_weight.view( + self.num_devices, self.num_local_heads, self.wkvb_head_dim, self.wkvb_lora_rank + ) + dev_scale_rows = self.num_local_heads * self.wkvb_head_dim // self.model_args.block_size + dev_scales = kv_b_proj_weight_scale.view( + self.num_devices, dev_scale_rows, 1, self.wkvb_lora_rank_qsize + ) + else: + from tilert.models.glm_5._dsa_v32.ops.rmsnorm_projq_wqb import ( + RmsnormProjqWqbWeightsConverter, + ) + + wq_b_list, scale_list = RmsnormProjqWqbWeightsConverter._redistribute_heads( + kv_b_proj_weight, + kv_b_proj_weight_scale, + n_total_heads=self.model_args.n_heads, + n_local_heads=self.num_local_heads, + num_devices=self.num_devices, + qk_head_dim=self.wkvb_head_dim, + block_size=self.model_args.block_size, + ) + dev_weights = torch.stack(wq_b_list, dim=0).view( + self.num_devices, self.num_local_heads, self.wkvb_head_dim, self.wkvb_lora_rank + ) + dev_scale_rows = self.num_local_heads * self.wkvb_head_dim // self.model_args.block_size + dev_scales = torch.stack(scale_list, dim=0).view( + self.num_devices, dev_scale_rows, 1, self.wkvb_lora_rank_qsize + ) + + wkvb = dev_weights[:, :, : self.wkvb_nope_head_dim] + wkvb_scales = ( + dev_scales.contiguous() + .repeat(1, 1, self.head_dim_scale_repeat, 1) + .view( + self.num_devices, + self.num_local_heads, + self.wkvb_head_qsize, + self.wkvb_lora_rank_qsize, + ) + .contiguous()[:, :, : self.wkvb_nope_head_qsize] + ) + return { + self.tilert_weights_alias.wkv_b_weights: wkvb.contiguous(), + 
self.tilert_weights_alias.wkv_b_scales: wkvb_scales.contiguous(), + } + + def init_reference_weights(self, state_dict: dict[str, torch.Tensor]) -> None: + sharding_size = self.num_local_heads * self.wkvb_head_dim + sharding_start = self.device_id * sharding_size + sharding_end = sharding_start + sharding_size + wkv_b = weight_dequant( + state_dict[self.ref_weights_alias.wkv_b_weights], + state_dict[self.ref_weights_alias.wkv_b_scales], + ) + wkv_b = wkv_b[sharding_start:sharding_end, :] + wkv_b = wkv_b.view(self.num_local_heads, self.wkvb_head_dim, self.wkvb_lora_rank) + self.ref_wkv_b = wkv_b[:, : self.wkvb_nope_head_dim] + + def init_tilert_weights(self, state_dict: dict[str, torch.Tensor]) -> None: + self.init_tilert_weights_hmma(state_dict) + + def init_tilert_weights_hmma(self, state_dict: dict[str, torch.Tensor]) -> None: + """Initialize with HMMA-packed weights.""" + packed = ProjqWqbWeightsConverter( + self.model_args, self.num_devices, self.head_dim_block_size + ).dispatch( + ProjqWqbAlgorithm.FP16MMA, + [ + state_dict[self.tilert_weights_alias.wkv_b_weights], + state_dict[self.tilert_weights_alias.wkv_b_scales], + ], + ) + self.tilert_wkv_b_a = packed + self.tilert_wkv_b_a_scales = torch.empty(1, dtype=torch.float8_e4m3fn, device=packed.device) + self.compute_kernel_type = "fp16mma" + + def init_tilert_weights_hmma_bf16(self, state_dict: dict[str, torch.Tensor]) -> None: + """Initialize with BF16 HMMA-packed weights (dequantized, no scales).""" + packed = ProjqWqbWeightsConverter( + self.model_args, self.num_devices, self.head_dim_block_size + ).dispatch( + ProjqWqbAlgorithm.BF16MMA, + [ + state_dict[self.tilert_weights_alias.wkv_b_weights], + state_dict[self.tilert_weights_alias.wkv_b_scales], + ], + ) + self.tilert_wkv_b_a = packed + self.tilert_wkv_b_a_scales = torch.empty(1, dtype=torch.float8_e4m3fn, device=packed.device) + self.compute_kernel_type = "bf16mma" + + def init_random_weights(self) -> None: + padded_total_heads = self.num_local_heads * 
self.num_devices + wkv_b = init_func( + torch.empty( + padded_total_heads * self.wkvb_head_dim, + self.wkvb_lora_rank, + dtype=torch.float8_e4m3fn, + ) + ) + wkv_b_scales = init_func( + torch.empty( + padded_total_heads * self.wkvb_head_dim // self.model_args.block_size, + self.wkvb_lora_rank_qsize, + dtype=torch.float32, + ) + ) + ref_state_dict = dict(zip(self.ref_weights_alias(), [wkv_b, wkv_b_scales])) + self.init_reference_weights(ref_state_dict) + sharded = self.device_sharding(ref_state_dict) + self.init_tilert_weights({k: v[self.device_id] for k, v in sharded.items()}) + + def init_tilert_vars(self, batch_size: int, seq_len: int) -> None: + self.output = torch.zeros( + (batch_size, seq_len, self.num_local_heads, self.wkvb_lora_rank), dtype=torch.bfloat16 + ) + self.profile_logs = get_profile_log_tensor() + self.is_var_init = True + + def golden_forward(self, q_nope: torch.Tensor) -> torch.Tensor: + assert self.ref_wkv_b is not None + return torch.einsum("bshd,hdc->bshc", q_nope, self.ref_wkv_b) + + def tilert_forward(self, q_nope: torch.Tensor) -> torch.Tensor: + assert self.tilert_wkv_b_a is not None + assert self.tilert_wkv_b_a_scales is not None + assert self.output is not None + assert self.profile_logs is not None + projq_wqb( + q_nope, + self.tilert_wkv_b_a, + self.tilert_wkv_b_a_scales, + self.output, + self.profile_logs, + self.compute_kernel_type, + model_arch=self.model_args.arch_name, + ) + return self.output diff --git a/tilert/models/glm_5/_dsa_v32/ops/projx_wis.py b/tilert/models/glm_5/_dsa_v32/ops/projx_wis.py new file mode 100644 index 0000000..e13b4e0 --- /dev/null +++ b/tilert/models/glm_5/_dsa_v32/ops/projx_wis.py @@ -0,0 +1,211 @@ +"""ProjxWis operation module.""" + +from dataclasses import dataclass +from enum import Enum + +import torch + +from tilert.models.base import TileRTModule +from tilert.models.common import init_func +from tilert.models.glm_5._dsa_v32.model_args import ModelArgs +from tilert.utils import get_profile_log_tensor 
+ +__all__ = [ + "projx_wis", + "ProjxWis", + "ProjxWisRefWeightsAlias", + "ProjxWisTilertWeightsAlias", +] + + +def projx_wis( + x_in: torch.Tensor, + w: torch.Tensor, + output: torch.Tensor, + compute_kernel_type: str, + profile_logs: torch.Tensor, + model_arch: str, +) -> None: + """ + Define the ProjxWis operation. + + Args: + x_in: Input tensor. + w: Weight tensor. + output: Output tensor. + compute_kernel_type: Compute kernel type ("bf16" or "bf16mma"). + profile_logs: Profile logs tensor. + model_arch: Model architecture ("deepseek_v3_2" or "glm_5"). + """ + torch.ops.tilert.proj_w_op(x_in, w, output, model_arch, compute_kernel_type, profile_logs) + + +@dataclass +class ProjxWisRefWeightsAlias: + """Reference weights alias for ProjxWis.""" + + w_weights = "self_attn.indexer.weights_proj.weight" + + @property + def ref_tensor_alias(self) -> list[str]: + return [self.w_weights] + + def __call__(self) -> list[str]: + return self.ref_tensor_alias + + +@dataclass +class ProjxWisTilertWeightsAlias: + """TileRT weights alias for ProjxWis.""" + + w_weights = "id_score_weights" + + @property + def tilert_tensor_alias(self) -> list[str]: + return [self.w_weights] + + def __call__(self) -> list[str]: + return self.tilert_tensor_alias + + +class ProjxWisAlgorithm(Enum): + """ProjxWis algorithm.""" + + BF16 = "bf16" + BF16MMA = "bf16mma" + + +class ProjxWis(TileRTModule): + """ProjxWis module: linear projection for indexer score weights.""" + + _SUPPORTED_ALGORITHMS = { + "deepseek_v3_2": [ProjxWisAlgorithm.BF16, ProjxWisAlgorithm.BF16MMA], + "glm_5": [ProjxWisAlgorithm.BF16, ProjxWisAlgorithm.BF16MMA], + } + + _HMMA_CONFIGS = { + 7168: (4, 16, 7), + 6144: (2, 16, 6), + } + + def __init__( + self, + model_args: ModelArgs, + num_devices: int, + device_id: int = 0, + ref_weights_alias: ProjxWisRefWeightsAlias | None = None, + compute_kernel_type: str | None = None, + ): + super().__init__( + self.__class__.__name__, + model_args=model_args, + num_devices=num_devices, + 
device_id=device_id, + ) + + self.tilert_weights_alias = ProjxWisTilertWeightsAlias() + self.ref_weights_alias = ( + ref_weights_alias if ref_weights_alias is not None else ProjxWisRefWeightsAlias() + ) + + self.ref_tensor_alias = self.ref_weights_alias.ref_tensor_alias + + self.ref_w: torch.Tensor | None = None + self.tilert_w: torch.Tensor | None = None + self.output: torch.Tensor | None = None + self.profile_logs: torch.Tensor | None = None + + self.dim = model_args.dim + self.index_n_heads = model_args.index_n_heads + + if compute_kernel_type is not None: + self.compute_kernel_type = compute_kernel_type + else: + self.compute_kernel_type = "bf16" + + @staticmethod + def _swizzle_mma_16x16(mat_in: torch.Tensor) -> torch.Tensor: + """Swizzle each 16x16 BF16 tile for the packed weight layout.""" + assert mat_in.shape[-2] == 16 and mat_in.shape[-1] == 16 + pre_shape = mat_in.shape[:-2] + mat_in = mat_in.reshape(*pre_shape, 2, 8, 2, 4, 2).transpose(-4, -3).transpose(-5, -4) + return mat_in.reshape(*pre_shape, 2 * 2, 8 * 4, 2).transpose(-3, -2) + + @staticmethod + def _to_hmma_layout( + w_orig: torch.Tensor, n_ctas: int, rows_per_cta: int, x_dim: int, num_pages: int + ) -> torch.Tensor: + """Convert [output_dim, x_dim] BF16 weights to the packed kernel layout.""" + cols_per_page = x_dim // num_pages + n_k_tiles = cols_per_page // 16 + w = w_orig.reshape(n_ctas, rows_per_cta, num_pages, cols_per_page) + w = w.transpose(1, 2) + n_row_tiles = rows_per_cta // 16 + w = w.reshape(n_ctas, num_pages, n_row_tiles, 16, n_k_tiles, 16).transpose(-3, -2) + w = ProjxWis._swizzle_mma_16x16(w) + return w.reshape(n_ctas, -1).contiguous() + + @property + def tilert_tensor_alias(self) -> list[str]: + return self.tilert_weights_alias.tilert_tensor_alias + + def get_weights_list(self) -> list[torch.Tensor]: + return [self.tilert_w] + + def device_sharding(self, weights_map: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]: + """ + Device sharding: replicate weight for each device. 
+ + Args: + weights_map: Map from ref weight alias to tensor. + + Returns: + Map from tilert weight alias to (num_devices, ...) tensors. + """ + w = weights_map[self.ref_weights_alias.w_weights] + if self.compute_kernel_type == "bf16mma": + n_ctas, rows_per_cta, num_pages = self._HMMA_CONFIGS[self.dim] + w_hmma = self._to_hmma_layout(w, n_ctas, rows_per_cta, self.dim, num_pages) + w_out = w_hmma[None, ...].repeat(self.num_devices, 1, 1) + else: + w_out = w[None, ...].repeat(self.num_devices, 1, 1) + return {self.tilert_weights_alias.w_weights: w_out} + + def init_reference_weights(self, state_dict: dict[str, torch.Tensor]) -> None: + w = state_dict[self.ref_weights_alias.w_weights] + self.ref_w = w.detach().clone().to(torch.bfloat16) + self.is_ref_weights_init = True + + def init_tilert_weights(self, state_dict: dict[str, torch.Tensor]) -> None: + self.tilert_w = state_dict[self.tilert_weights_alias.w_weights].detach().clone() + self.is_tilert_weights_init = True + + def init_random_weights(self) -> None: + ref_w = init_func(torch.empty(self.index_n_heads, self.dim, dtype=torch.bfloat16)) + ref_state_dict = dict(zip(self.ref_weights_alias(), [ref_w])) + self.init_reference_weights(ref_state_dict) + sharded = self.device_sharding(ref_state_dict) + self.init_tilert_weights({k: v[self.device_id] for k, v in sharded.items()}) + + def init_tilert_vars(self, batch_size: int, seq_len: int) -> None: + self.output = torch.zeros((batch_size, seq_len, self.index_n_heads), dtype=torch.bfloat16) + self.profile_logs = get_profile_log_tensor() + self.is_var_init = True + + def golden_forward(self, x_norm: torch.Tensor) -> torch.Tensor: + assert self.ref_w is not None + return torch.nn.functional.linear(x_norm, self.ref_w) + + def tilert_forward(self, x_norm: torch.Tensor) -> torch.Tensor: + assert self.tilert_w is not None + assert self.output is not None + assert self.profile_logs is not None + projx_wis( + x_norm, + self.tilert_w, + self.output, + self.compute_kernel_type, + 
self.profile_logs, + model_arch=self.model_args.arch_name, + ) + return self.output diff --git a/tilert/models/glm_5/_dsa_v32/ops/projx_wqaki.py b/tilert/models/glm_5/_dsa_v32/ops/projx_wqaki.py new file mode 100644 index 0000000..367d5fe --- /dev/null +++ b/tilert/models/glm_5/_dsa_v32/ops/projx_wqaki.py @@ -0,0 +1,247 @@ +"""ProjxWqaki operation module.""" + +import torch + +__all__ = [ + "projx_wqaki", + "ProjxWqakiWeightsConverter", +] + + +def projx_wqaki( + x_quant: torch.Tensor, + x_scale: torch.Tensor, + wqaki: torch.Tensor, + out_q: torch.Tensor, + out_ki: torch.Tensor, + profile_logs: torch.Tensor, + compute_kernel_type: str = "fp8mma", + *, + model_arch: str, +) -> None: + """FP8 projection for q, ki. + + Args: + x_quant: FP8 quantized hidden states [1, seq_len, hidden_dim]. + x_scale: Scale factors for x_quant. + wqaki: Packed FP8 weights + scales for q, ki. + out_q: Output q tensor. + out_ki: Output ki tensor. + profile_logs: Profile logs tensor. + compute_kernel_type: Kernel type ("fp8mma", "fp8mma_68cta", "fp8mma_136cta"). + model_arch: Model architecture ("deepseek_v3_2" or "glm_5"). 
+ """ + torch.ops.tilert.projx_wqaki_op( + x_quant, + x_scale, + wqaki, + out_q, + out_ki, + model_arch, + compute_kernel_type, + profile_logs, + torch.empty(0, dtype=torch.int64, device=x_quant.device), + ) + + +class ProjxWqakiWeightsConverter: + """Weight converter for ProjxWqaki kernel.""" + + @staticmethod + def _swizzle_qmma_16x32(mat_in: torch.Tensor) -> torch.Tensor: + assert mat_in.shape[-2] == 16 and mat_in.shape[-1] == 32 + assert mat_in.dtype == torch.float8_e4m3fn + pre_shape = mat_in.shape[:-2] + mat_in = mat_in.reshape(*pre_shape, 2, 8, 2, 4, 4).transpose(-4, -3).transpose(-5, -4) + return mat_in.reshape(*pre_shape, 2 * 2, 8 * 4, 4).transpose(-3, -2) + + @staticmethod + def convert_dsv32( + wq_a: torch.Tensor, + wq_a_scale: torch.Tensor, + wki: torch.Tensor, + wki_scale: torch.Tensor, + ) -> torch.Tensor: + """Convert DSV3.2 weights to the packed format expected by the kernel.""" + with torch.inference_mode(): + wq_a_scale = wq_a_scale.to(torch.bfloat16) + wki_scale = wki_scale.to(torch.bfloat16) + + dim = 7168 + q_rows = 1536 + ki_rows = 128 + total_rows = q_rows + ki_rows + n_blocks = total_rows // 16 + scale_dim = dim // 128 + + n_q_blocks = q_rows // 16 + n_ki_blocks = ki_rows // 16 + wq_a = wq_a.reshape(n_q_blocks, 16, dim) + wq_a_scale = ( + wq_a_scale.reshape(wq_a_scale.shape[0], 1, scale_dim) + .repeat(1, n_q_blocks // wq_a_scale.shape[0], 1) + .reshape(n_q_blocks, scale_dim) + ) + wki = wki.reshape(n_ki_blocks, 16, dim) + wki_scale = ( + wki_scale.reshape(wki_scale.shape[0], 1, scale_dim) + .repeat(1, n_ki_blocks // wki_scale.shape[0], 1) + .reshape(n_ki_blocks, scale_dim) + ) + + wqaki = torch.cat([wq_a, wki], dim=0) + wqaki_scale = torch.cat([wq_a_scale, wki_scale], dim=0) + + swizzle = ProjxWqakiWeightsConverter._swizzle_qmma_16x32 + + wqaki_0 = wqaki[..., :2048] + wqaki_0_scale = wqaki_scale[..., :16].contiguous().view(torch.float8_e4m3fn) + wqaki_1 = wqaki[..., 2048:4096] + wqaki_1_scale = wqaki_scale[..., 
16:32].contiguous().view(torch.float8_e4m3fn) + wqaki_2 = wqaki[..., 4096:6144] + wqaki_2_scale = wqaki_scale[..., 32:48].contiguous().view(torch.float8_e4m3fn) + wqaki_3 = wqaki[..., 6144:7168] + wqaki_3_scale = wqaki_scale[..., 48:56].contiguous().view(torch.float8_e4m3fn) + + wqaki_0 = wqaki_0.reshape(n_blocks, 16, 64, 32).transpose(1, 2) + wqaki_0 = swizzle(wqaki_0).reshape(n_blocks, 16 * 2048) + + wqaki_1 = wqaki_1.reshape(n_blocks, 16, 64, 32).transpose(1, 2) + wqaki_1 = swizzle(wqaki_1).reshape(n_blocks, 16 * 2048) + + wqaki_2 = wqaki_2.reshape(n_blocks, 16, 64, 32).transpose(1, 2) + wqaki_2 = swizzle(wqaki_2).reshape(n_blocks, 16 * 2048) + + wqaki_3 = wqaki_3.reshape(n_blocks, 16, 32, 32).transpose(1, 2) + wqaki_3 = swizzle(wqaki_3).reshape(n_blocks, 16 * 1024) + + padding_scale0 = torch.zeros( + (n_blocks, 48), dtype=torch.bfloat16, device=wq_a.device + ).view(torch.float8_e4m3fn) + padding_scale1 = torch.zeros( + (n_blocks, 48), dtype=torch.bfloat16, device=wq_a.device + ).view(torch.float8_e4m3fn) + padding_scale2 = torch.zeros( + (n_blocks, 48), dtype=torch.bfloat16, device=wq_a.device + ).view(torch.float8_e4m3fn) + padding_scale3 = torch.zeros( + (n_blocks, 56), dtype=torch.bfloat16, device=wq_a.device + ).view(torch.float8_e4m3fn) + + return torch.cat( + [ + wqaki_0, + wqaki_0_scale, + padding_scale0, + wqaki_1, + wqaki_1_scale, + padding_scale1, + wqaki_2, + wqaki_2_scale, + padding_scale2, + wqaki_3, + wqaki_3_scale, + padding_scale3, + ], + dim=1, + ).contiguous() + + @staticmethod + def convert_glm5_68cta( + wq_a: torch.Tensor, + wq_a_scale: torch.Tensor, + wki: torch.Tensor, + wki_scale: torch.Tensor, + ) -> torch.Tensor: + """Convert GLM5 weights to the packed format expected by the kernel.""" + with torch.inference_mode(): + wq_a_scale = wq_a_scale.to(torch.float32) + wki_scale = wki_scale.to(torch.float32) + + dim = 6144 + q_rows = 2048 + ki_rows = 128 + total_rows = q_rows + ki_rows + n_blocks = total_rows // 32 + scale_dim = dim // 128 + + 
n_q_blocks = q_rows // 32 + n_ki_blocks = ki_rows // 32 + + wqaki_raw = torch.cat([wq_a, wki], dim=0).reshape(n_blocks, 32, dim) + + wq_a_scale = ( + wq_a_scale.reshape(wq_a_scale.shape[0], 1, scale_dim) + .repeat(1, n_q_blocks // wq_a_scale.shape[0], 1) + .reshape(n_q_blocks, scale_dim) + ) + wki_scale = ( + wki_scale.reshape(wki_scale.shape[0], 1, scale_dim) + .repeat(1, n_ki_blocks // wki_scale.shape[0], 1) + .reshape(n_ki_blocks, scale_dim) + ) + wqaki_scales = torch.cat([wq_a_scale, wki_scale], dim=0) + + swizzle = ProjxWqakiWeightsConverter._swizzle_qmma_16x32 + + wqaki_raw = wqaki_raw.reshape(n_blocks, 32, 6, 1024).transpose(1, 2) + wqaki_raw = wqaki_raw.reshape(n_blocks, 6, 2, 16, 32, 32).transpose(3, 4) + wqaki_raw = swizzle(wqaki_raw).reshape(n_blocks, 6, 32 * 1024) + wqaki_scales = wqaki_scales.reshape(n_blocks, 6, 8).view(torch.float8_e4m3fn) + wqaki_padding = torch.zeros( + (n_blocks, 6, 128 - wqaki_scales.shape[-1]), + dtype=torch.float8_e4m3fn, + device=wq_a.device, + ) + return torch.cat([wqaki_raw, wqaki_scales, wqaki_padding], dim=-1).contiguous() + + @staticmethod + def convert_glm5_136cta( + wq_a: torch.Tensor, + wq_a_scale: torch.Tensor, + wki: torch.Tensor, + wki_scale: torch.Tensor, + ) -> torch.Tensor: + """Convert GLM5 weights to the packed format expected by the kernel.""" + with torch.inference_mode(): + wq_a_scale = wq_a_scale.to(torch.float32) + wki_scale = wki_scale.to(torch.float32) + + dim = 6144 + q_rows = 2048 + ki_rows = 128 + total_rows = q_rows + ki_rows + n_blocks = total_rows // 16 + scale_dim = dim // 128 + + n_q_blocks = q_rows // 16 + n_ki_blocks = ki_rows // 16 + + wq_a = wq_a.reshape(n_q_blocks, 16, dim) + wq_a_scale = ( + wq_a_scale.reshape(wq_a_scale.shape[0], 1, scale_dim) + .repeat(1, n_q_blocks // wq_a_scale.shape[0], 1) + .reshape(n_q_blocks, scale_dim) + ) + wki = wki.reshape(n_ki_blocks, 16, dim) + wki_scale = ( + wki_scale.reshape(wki_scale.shape[0], 1, scale_dim) + .repeat(1, n_ki_blocks // wki_scale.shape[0], 
1) + .reshape(n_ki_blocks, scale_dim) + ) + + wqaki_raw = torch.cat([wq_a, wki], dim=0) + wqaki_scales = torch.cat([wq_a_scale, wki_scale], dim=0) + + swizzle = ProjxWqakiWeightsConverter._swizzle_qmma_16x32 + + wqaki_raw = wqaki_raw.reshape(n_blocks, 16, 3, 2048).transpose(1, 2) + wqaki_raw = wqaki_raw.reshape(n_blocks, 3, 1, 16, 64, 32).transpose(3, 4) + wqaki_raw = swizzle(wqaki_raw).reshape(n_blocks, 3, 16 * 2048) + wqaki_scales = wqaki_scales.reshape(n_blocks, 3, 16).view(torch.float8_e4m3fn) + wqaki_padding = torch.zeros( + (n_blocks, 3, 128 - wqaki_scales.shape[-1]), + dtype=torch.float8_e4m3fn, + device=wq_a.device, + ) + return torch.cat([wqaki_raw, wqaki_scales, wqaki_padding], dim=-1).contiguous() diff --git a/tilert/models/glm_5/_dsa_v32/ops/projx_wqkva.py b/tilert/models/glm_5/_dsa_v32/ops/projx_wqkva.py new file mode 100644 index 0000000..6ade7af --- /dev/null +++ b/tilert/models/glm_5/_dsa_v32/ops/projx_wqkva.py @@ -0,0 +1,330 @@ +"""ProjXWqkva operation module.""" + +from enum import Enum + +import torch + +from tilert.models.base import TileRTModule +from tilert.models.common import weight_dequant +from tilert.models.glm_5._dsa_v32.model_args import ModelArgs +from tilert.models.glm_5._dsa_v32.ops.rmsnorm_projx_wqkva import ( + RMSNormProjQKVAFP8MMAWeightsConverter, + RMSNormProjQKVAFP16MMAWeightsConverter, +) +from tilert.utils import get_profile_log_tensor + +__all__ = [ + "ProjXWqkva", + "projx_wqkva", +] + + +def projx_wqkva( + x_quant: torch.Tensor, + x_scale: torch.Tensor, + wqkva: torch.Tensor, + cur_pos: torch.Tensor, + q_out: torch.Tensor, + kv_out: torch.Tensor, + pe_cache_out: torch.Tensor, + profile_logs: torch.Tensor, + compute_kernel_type: str = "fp8mma", + *, + model_arch: str, +) -> None: + """FP8 MMA projection for q, kv, pe_cache (DSV3.2).""" + torch.ops.tilert.projx_wqkva_op( + x_quant, + x_scale, + wqkva, + cur_pos, + q_out, + kv_out, + pe_cache_out, + model_arch, + compute_kernel_type, + profile_logs, + torch.empty(0, 
dtype=torch.int64, device=x_quant.device), + ) + + +class ProjXWqkvaRefWeightsAlias: + """Reference weight aliases for ProjXWqkva.""" + + x_rmsnorm_gamma = "input_layernorm.weight" + q_a_weights = "self_attn.q_a_proj.weight" + q_a_scales = "self_attn.q_a_proj.weight_scale_inv" + kv_a_weights = "self_attn.kv_a_proj_with_mqa.weight" + kv_a_scales = "self_attn.kv_a_proj_with_mqa.weight_scale_inv" + + @property + def ref_tensor_alias(self) -> list[str]: + return [ + self.x_rmsnorm_gamma, + self.q_a_weights, + self.q_a_scales, + self.kv_a_weights, + self.kv_a_scales, + ] + + def __call__(self) -> list[str]: + return self.ref_tensor_alias + + +class ProjXWqkvaTilertWeightsAlias: + """Tilert weight aliases for ProjXWqkva.""" + + q_a_weights = "q_a_weights" + q_a_scales = "q_a_scales" + kv_a_weights = "kv_a_weights" + kv_a_scales = "kv_a_scales" + w_pe_weights = "w_pe_weights" + w_pe_scales = "w_pe_scales" + + @property + def tilert_tensor_alias(self) -> list[str]: + return [ + self.q_a_weights, + self.q_a_scales, + self.kv_a_weights, + self.kv_a_scales, + self.w_pe_weights, + self.w_pe_scales, + ] + + def __call__(self) -> list[str]: + return self.tilert_tensor_alias + + +class ProjXWqkvaAlgorithm(Enum): + """ProjXWqkva algorithm.""" + + FP8MMA = "fp8mma" + FP16MMA = "fp16mma" + + +class ProjXWqkva(TileRTModule): + """FP8 MMA projection module for q, kv, pe_cache.""" + + _SUPPORTED_ALGORITHMS = { + "deepseek_v3_2": [ProjXWqkvaAlgorithm.FP8MMA], + "glm_5": [ProjXWqkvaAlgorithm.FP8MMA, ProjXWqkvaAlgorithm.FP16MMA], + } + + def __init__( + self, + model_args: ModelArgs, + num_devices: int, + device_id: int, + ref_weights_alias: ProjXWqkvaRefWeightsAlias | None = None, + ): + super().__init__( + self.__class__.__name__, + model_args=model_args, + num_devices=num_devices, + device_id=device_id, + ) + + self.tilert_weights_alias = ProjXWqkvaTilertWeightsAlias() + self.ref_weights_alias = ( + ref_weights_alias if ref_weights_alias is not None else ProjXWqkvaRefWeightsAlias() + ) 
+ + self.dim = self.model_args.dim + self.q_lora_rank = self.model_args.q_lora_rank + self.kv_lora_rank = self.model_args.kv_lora_rank + self.qk_rope_head_dim = self.model_args.qk_rope_head_dim + self.block_size = self.model_args.block_size + self.eps = self.model_args.eps + + self.ref_wq_a: torch.Tensor | None = None + self.ref_wkv_a: torch.Tensor | None = None + self.ref_w_pe: torch.Tensor | None = None + + self.tilert_wqkva: torch.Tensor | None = None + + self.q_out: torch.Tensor | None = None + self.kv_out: torch.Tensor | None = None + self.pe_cache_out: torch.Tensor | None = None + self.cur_pos: torch.Tensor | None = None + self.profile_logs: torch.Tensor | None = None + self.is_init = False + + self.compute_kernel_type = "fp8mma" + + def set_algorithm(self, algorithm: Enum) -> None: + super().set_algorithm(algorithm) + if algorithm == ProjXWqkvaAlgorithm.FP16MMA: + self.compute_kernel_type = "fp16mma" + else: + self.compute_kernel_type = "fp8mma" + + def device_sharding(self, weights_map: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]: + """Repeat weights for device sharding.""" + q_a_proj_weight = weights_map[self.ref_weights_alias.q_a_weights][None, ...].repeat( + self.num_devices, 1, 1 + ) + q_a_proj_weight_scale = weights_map[self.ref_weights_alias.q_a_scales][None, ...].repeat( + self.num_devices, 1, 1 + ) + kv_a_mqa = weights_map[self.ref_weights_alias.kv_a_weights] + kv_a_proj_weight = kv_a_mqa[: self.kv_lora_rank, :][None, ...].repeat( + self.num_devices, 1, 1 + ) + w_pe_weight = kv_a_mqa[self.kv_lora_rank :, :][None, ...].repeat(self.num_devices, 1, 1) + kv_a_mqa_scale = weights_map[self.ref_weights_alias.kv_a_scales] + kv_scale_rows = (self.kv_lora_rank + self.block_size - 1) // self.block_size + kv_a_proj_weight_scale = kv_a_mqa_scale[:kv_scale_rows, :][None, ...].repeat( + self.num_devices, 1, 1 + ) + w_pe_weight_scale = kv_a_mqa_scale[kv_scale_rows:, :][None, ...].repeat( + self.num_devices, 1, 1 + ) + return { + 
self.tilert_weights_alias.q_a_weights: q_a_proj_weight, + self.tilert_weights_alias.q_a_scales: q_a_proj_weight_scale, + self.tilert_weights_alias.kv_a_weights: kv_a_proj_weight, + self.tilert_weights_alias.kv_a_scales: kv_a_proj_weight_scale, + self.tilert_weights_alias.w_pe_weights: w_pe_weight, + self.tilert_weights_alias.w_pe_scales: w_pe_weight_scale, + } + + def init_reference_weights(self, state_dict: dict[str, torch.Tensor]) -> None: + aliases = self.ref_weights_alias() + self.ref_wq_a = weight_dequant(state_dict[aliases[1]], state_dict[aliases[2]]) + kv_a_mqa = weight_dequant(state_dict[aliases[3]], state_dict[aliases[4]]) + self.ref_wkv_a = kv_a_mqa[: self.kv_lora_rank, :] + self.ref_w_pe = kv_a_mqa[self.kv_lora_rank :, :] + + assert self.ref_wq_a.shape == (self.q_lora_rank, self.dim) + assert self.ref_wkv_a.shape == (self.kv_lora_rank, self.dim) + assert self.ref_w_pe.shape == (self.qk_rope_head_dim, self.dim) + + def init_tilert_weights(self, state_dict: dict[str, torch.Tensor]) -> None: + tilert_aliases = self.tilert_weights_alias() + wq_a = state_dict[tilert_aliases[0]] + wq_a_scale = state_dict[tilert_aliases[1]] + wkv_a = state_dict[tilert_aliases[2]] + wkv_a_scale = state_dict[tilert_aliases[3]] + w_pe = state_dict[tilert_aliases[4]] + w_pe_scale = state_dict[tilert_aliases[5]] + dummy_gamma = torch.zeros(self.dim, dtype=torch.float32, device=wq_a.device) + + if self.algorithm == ProjXWqkvaAlgorithm.FP16MMA: + self.tilert_wqkva, _ = RMSNormProjQKVAFP16MMAWeightsConverter.convert_to_fp16_mma_gemv( + wq_a, + wq_a_scale, + wkv_a, + wkv_a_scale, + w_pe, + w_pe_scale, + dummy_gamma, + hidden_dim=self.dim, + q_lora_rank=self.q_lora_rank, + ) + else: + self.tilert_wqkva, _ = RMSNormProjQKVAFP8MMAWeightsConverter.convert_to_fp8_mma_gemv( + wq_a, + wq_a_scale, + wkv_a, + wkv_a_scale, + w_pe, + w_pe_scale, + dummy_gamma, + hidden_dim=self.dim, + q_lora_rank=self.q_lora_rank, + ) + + def init_tilert_vars(self, batch_size: int, seq_len: int, max_len: int = 
128) -> None: + self.q_out = torch.zeros((batch_size, seq_len, self.q_lora_rank), dtype=torch.bfloat16) + self.kv_out = torch.zeros((batch_size, seq_len, self.kv_lora_rank), dtype=torch.bfloat16) + self.pe_cache_out = torch.zeros( + (batch_size, max_len, self.qk_rope_head_dim), dtype=torch.bfloat16 + ) + self.cur_pos = torch.zeros((1,), dtype=torch.int32) + self.profile_logs = get_profile_log_tensor() + self.is_init = True + + def init_random_weights(self) -> None: + bs = self.block_size + dim_scale_dim = self.dim // bs + q_scale_dim = (self.q_lora_rank + bs - 1) // bs + kv_mqa_rows = self.kv_lora_rank + self.qk_rope_head_dim + kv_mqa_scale_dim = (kv_mqa_rows + bs - 1) // bs + scale_dtype = torch.bfloat16 + + tensor_list = [ + torch.randn(self.dim, dtype=torch.float32), + torch.randn(self.q_lora_rank, self.dim, dtype=torch.bfloat16).to(torch.float8_e4m3fn), + torch.randn(q_scale_dim, dim_scale_dim, dtype=scale_dtype), + torch.randn(kv_mqa_rows, self.dim, dtype=torch.bfloat16).to(torch.float8_e4m3fn), + torch.randn(kv_mqa_scale_dim, dim_scale_dim, dtype=scale_dtype), + ] + ref_state_dict = dict(zip(self.ref_weights_alias(), tensor_list)) + self.init_reference_weights(ref_state_dict) + self.init_tilert_weights( + {_k: _v[self.device_id] for _k, _v in self.device_sharding(ref_state_dict).items()} + ) + + def golden_forward( + self, + x_quant: torch.Tensor, + x_scale: torch.Tensor, + cur_pos: int = 0, # noqa: U100 + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Pure PyTorch reference: dequant FP8 -> matmul -> q, kv, pe.""" + assert self.ref_wq_a is not None + assert self.ref_wkv_a is not None + assert self.ref_w_pe is not None + + if self.algorithm == ProjXWqkvaAlgorithm.FP16MMA: + x_float = x_quant.float() + else: + x_fp8 = x_quant.to(torch.float32) + scale_expanded = x_scale.unsqueeze(-1).repeat(1, 1, 1, self.block_size) + scale_expanded = scale_expanded.reshape(x_quant.shape) + x_float = x_fp8 * scale_expanded + + q_out = torch.matmul(x_float, 
self.ref_wq_a.transpose(0, 1).float()) + kv_out = torch.matmul(x_float, self.ref_wkv_a.transpose(0, 1).float()) + pe_out = torch.matmul(x_float, self.ref_w_pe.transpose(0, 1).float()) + return ( + q_out.to(torch.bfloat16), + kv_out.to(torch.bfloat16), + pe_out.to(torch.bfloat16), + ) + + def tilert_forward( + self, + x_quant: torch.Tensor, + x_scale: torch.Tensor, + cur_pos: int = 0, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Run FP8 QMMA GEMV via TileRT CUDA kernel.""" + assert self.cur_pos is not None + assert self.pe_cache_out is not None + self.cur_pos.fill_(cur_pos) + projx_wqkva( + x_quant, + x_scale, + self.tilert_wqkva, + self.cur_pos, + self.q_out, + self.kv_out, + self.pe_cache_out, + self.profile_logs, + self.compute_kernel_type, + model_arch=self.model_args.arch_name, + ) + + seq_len = x_quant.size(-2) + pe_at_pos = self.pe_cache_out[:, cur_pos : cur_pos + seq_len, :] + return self.q_out, self.kv_out, pe_at_pos + + def __call__( + self, + x_quant: torch.Tensor, + x_scale: torch.Tensor, + cur_pos: int = 0, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + return self.golden_forward(x_quant, x_scale, cur_pos) diff --git a/tilert/models/glm_5/_dsa_v32/ops/qkv_rope.py b/tilert/models/glm_5/_dsa_v32/ops/qkv_rope.py new file mode 100644 index 0000000..7f16a1c --- /dev/null +++ b/tilert/models/glm_5/_dsa_v32/ops/qkv_rope.py @@ -0,0 +1,192 @@ +"""QKV Rope operation module.""" + +from dataclasses import dataclass +from enum import Enum + +import torch + +from tilert.models.base import TileRTModule +from tilert.models.glm_5._dsa_v32.model_args import ModelArgs +from tilert.models.utils import apply_rotary_emb +from tilert.utils import get_profile_log_tensor + +__all__ = [ + "qkv_rope", + "QKVRoPE", + "QKVRoPERefWeightsAlias", + "QKVRoPETilertWeightsAlias", +] + + +def qkv_rope( + pe_cache: torch.Tensor, + kv_cache: torch.Tensor, + rope_freqs: torch.Tensor, + cur_pos: torch.Tensor, + profile_logs: torch.Tensor, + model_arch: str, + 
compute_kernel_type: str = "general", +) -> None: + """ + Perform QKV Rope operation. + + Args: + pe_cache: Q PE tensor (bsz, seq, n_local_heads, qk_rope_head_dim). + kv_cache: K PE cache (bsz, seq, qk_rope_head_dim). + rope_freqs: Rope frequencies tensor. + cur_pos: Current position tensor. + profile_logs: Profile logs tensor. + model_arch: Model architecture string. + compute_kernel_type: Compute kernel type string. + """ + torch.ops.tilert.qkv_rope_op( + pe_cache, + kv_cache, + rope_freqs, + cur_pos, + model_arch, + compute_kernel_type, + profile_logs, + ) + + +@dataclass +class QKVRoPERefWeightsAlias: + """Reference weights alias for QKVRoPE (no weights).""" + + @property + def ref_tensor_alias(self) -> list[str]: + return [] + + def __call__(self) -> list[str]: + return self.ref_tensor_alias + + +@dataclass +class QKVRoPETilertWeightsAlias: + """TileRT weights alias for QKVRoPE (no weights).""" + + @property + def tilert_tensor_alias(self) -> list[str]: + return [] + + def __call__(self) -> list[str]: + return self.tilert_tensor_alias + + +class QKVRoPEAlgorithm(Enum): + """QKVRoPE algorithm.""" + + GENERAL = "general" + + +class QKVRoPE(TileRTModule): + """QKV RoPE module. 
Unified for deepseek_v3_2 and glm_5.""" + + _SUPPORTED_ALGORITHMS = { + "deepseek_v3_2": [QKVRoPEAlgorithm.GENERAL], + "glm_5": [QKVRoPEAlgorithm.GENERAL], + } + + def __init__( + self, + model_args: ModelArgs, + num_devices: int = 1, + device_id: int = 0, + layer_idx: int = 0, + ref_weights_alias: QKVRoPERefWeightsAlias | None = None, + ) -> None: + super().__init__( + self.__class__.__name__, + model_args=model_args, + num_devices=num_devices, + device_id=device_id, + layer_idx=layer_idx, + ) + self.tilert_weights_alias = QKVRoPETilertWeightsAlias() + self.ref_weights_alias = ( + ref_weights_alias if ref_weights_alias is not None else QKVRoPERefWeightsAlias() + ) + self.n_local_heads = model_args.n_heads // num_devices + self.qk_rope_head_dim = model_args.qk_rope_head_dim + self.profile_logs: torch.Tensor | None = None + + def get_weights_list(self) -> list[torch.Tensor]: + return [] + + def device_sharding(self, weights_map: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]: + del weights_map + return {} + + def init_reference_weights(self, state_dict: dict[str, torch.Tensor]) -> None: + del state_dict + pass + + def init_tilert_weights(self, state_dict: dict[str, torch.Tensor]) -> None: + del state_dict + pass + + def init_random_weights(self) -> None: + pass + + def init_tilert_vars(self, batch_size: int, seq_len: int) -> None: + del batch_size, seq_len + self.profile_logs = get_profile_log_tensor() + self.is_var_init = True + + def golden_forward( + self, + q_pe: torch.Tensor, + pe_cache: torch.Tensor, + start_pos: int, + freqs_cis: torch.Tensor, + bsz: int, + seqlen: int, + ) -> torch.Tensor: + end_pos = start_pos + seqlen + + k_pe = pe_cache[:bsz, start_pos:end_pos] + k_pe = apply_rotary_emb(k_pe.unsqueeze(2), freqs_cis) + pe_cache[:bsz, start_pos:end_pos] = k_pe.squeeze(2) + + return apply_rotary_emb(q_pe, freqs_cis) + + def tilert_forward( + self, + q_pe: torch.Tensor, + pe_cache: torch.Tensor, + start_pos: int, + freqs_cis: torch.Tensor, + bsz: int, + 
seqlen: int, + ) -> torch.Tensor: + assert self.profile_logs is not None + end_pos = start_pos + seqlen + + q_pe_rope = q_pe.clone() + rope_freqs = torch.view_as_real(freqs_cis).reshape(*freqs_cis.shape[:-1], -1) + cur_pos = torch.tensor([start_pos], dtype=torch.int32) + + qkv_rope( + q_pe_rope, + pe_cache[:bsz, start_pos:end_pos], + rope_freqs, + cur_pos, + self.profile_logs, + model_arch=self.model_args.arch_name, + ) + + return q_pe_rope + + def __call__( + self, + q_pe: torch.Tensor, + pe_cache: torch.Tensor, + start_pos: int, + freqs_cis: torch.Tensor, + bsz: int, + seqlen: int, + ) -> torch.Tensor: + if self.flag_enable_tilert: + return self.tilert_forward(q_pe, pe_cache, start_pos, freqs_cis, bsz, seqlen) + return self.golden_forward(q_pe, pe_cache, start_pos, freqs_cis, bsz, seqlen) diff --git a/tilert/models/glm_5/_dsa_v32/ops/receive_selected_token_ids.py b/tilert/models/glm_5/_dsa_v32/ops/receive_selected_token_ids.py new file mode 100644 index 0000000..508d13e --- /dev/null +++ b/tilert/models/glm_5/_dsa_v32/ops/receive_selected_token_ids.py @@ -0,0 +1,35 @@ +"""ReceiveSelectedTokenIds — receive idx_selects from GPU 0.""" + +import torch + +__all__ = [ + "receive_selected_token_ids", +] + + +def receive_selected_token_ids( + ll_buf: torch.Tensor, + dst: torch.Tensor, + expected_flag: int, + profile_logs: torch.Tensor, + model_arch: str, + compute_kernel_type: str = "bf16", +) -> None: + """Receive idx_selects from GPU 0. + + Args: + ll_buf: Receive buffer on this GPU (written by GPU 0). + dst: Destination idx_selects tensor [1, S, 2048] int32. + expected_flag: Expected synchronization flag value. + profile_logs: Profile logs tensor. + model_arch: Model architecture ("deepseek_v3_2" or "glm_5"). + compute_kernel_type: Compute kernel type ("bf16"). 
+ """ + torch.ops.tilert.receive_selected_token_ids_op( + ll_buf, + dst, + expected_flag, + model_arch, + compute_kernel_type, + profile_logs, + ) diff --git a/tilert/models/glm_5/_dsa_v32/ops/rmsnorm_expert_proj.py b/tilert/models/glm_5/_dsa_v32/ops/rmsnorm_expert_proj.py new file mode 100644 index 0000000..ae004fa --- /dev/null +++ b/tilert/models/glm_5/_dsa_v32/ops/rmsnorm_expert_proj.py @@ -0,0 +1,172 @@ +"""RMSNormExpertProj operation module.""" + +from dataclasses import dataclass +from enum import Enum + +import torch +from torch import nn + +from tilert.models.base import TileRTModule +from tilert.models.common import RMSNorm, init_func, linear +from tilert.models.glm_5._dsa_v32.model_args import ModelArgs +from tilert.utils import get_profile_log_tensor + +__all__ = [ + "RMSNormExpertProj", + "RMSNormExpertProjRefWeightsAlias", + "RMSNormExpertProjTilertWeightsAlias", +] + + +@dataclass +class RMSNormExpertProjRefWeightsAlias: + """Reference weights alias for RMSNormExpertProj.""" + + post_attention_layernorm_weight = "post_attention_layernorm.weight" + mlp_gate_weight = "mlp.gate.weight" + + def __call__(self) -> list[str]: + return [self.post_attention_layernorm_weight, self.mlp_gate_weight] + + +@dataclass +class RMSNormExpertProjTilertWeightsAlias: + """TileRT weights alias for RMSNormExpertProj.""" + + unproj_o_gamma = "unproj_o_gamma" + exp_proj_weights = "exp_proj_weights" + + def __call__(self) -> list[str]: + return [self.unproj_o_gamma, self.exp_proj_weights] + + +class RMSNormExpertProjAlgorithm(Enum): + """RMSNormExpertProj algorithm.""" + + GENERAL = "general" + + +class RMSNormExpertProj(TileRTModule): + """RMS Norm followed by expert projection.""" + + _SUPPORTED_ALGORITHMS = { + "deepseek_v3_2": [RMSNormExpertProjAlgorithm.GENERAL], + "glm_5": [RMSNormExpertProjAlgorithm.GENERAL], + } + + def __init__( + self, + model_args: ModelArgs, + num_devices: int, + device_id: int = 0, + ref_weights_alias: RMSNormExpertProjRefWeightsAlias | None = 
None, + tilert_weights_alias: RMSNormExpertProjTilertWeightsAlias | None = None, + ): + super().__init__( + type(self).__name__, + model_args=model_args, + num_devices=num_devices, + device_id=device_id, + ) + self.dim = model_args.dim + self.eps = model_args.eps + + self.ref_weights_alias = ( + ref_weights_alias + if ref_weights_alias is not None + else RMSNormExpertProjRefWeightsAlias() + ) + self.tilert_weights_alias = ( + tilert_weights_alias + if tilert_weights_alias is not None + else RMSNormExpertProjTilertWeightsAlias() + ) + + self.is_ref_weights_init = False + self.is_tilert_weights_init = False + + self.ref_rmsnorm: RMSNorm | None = None + self.ref_proj_weight: torch.Tensor | None = None + self.proj_weight = nn.Parameter( + init_func(torch.empty(model_args.n_routed_experts, model_args.dim)) + ) + self.n_routed_experts = model_args.n_routed_experts + + self.tilert_proj_weight: torch.Tensor | None = None + self.tilert_rms_norm_weight: torch.Tensor | None = None + + self.profile_logs = get_profile_log_tensor() + + def get_weights_list(self) -> list[torch.Tensor]: + return [self.tilert_rms_norm_weight, self.tilert_proj_weight] + + def device_sharding( + self, rms_norm_weight: torch.Tensor, proj_weight: torch.Tensor + ) -> tuple[torch.Tensor, torch.Tensor]: + return rms_norm_weight.float().contiguous(), proj_weight.contiguous() + + def init_reference_weights( + self, state_dict: dict[str, torch.Tensor], device_id: int | None = None + ) -> None: + del device_id + self.ref_rmsnorm = RMSNorm(self.dim, self.eps) + self.ref_rmsnorm.weight.data = state_dict[ + self.ref_weights_alias.post_attention_layernorm_weight + ] + self.ref_proj_weight = state_dict[self.ref_weights_alias.mlp_gate_weight] + self.is_ref_weights_init = True + + def init_tilert_weights(self, state_dict: dict[str, torch.Tensor]) -> None: + self.tilert_proj_weight = ( + state_dict[self.tilert_weights_alias.exp_proj_weights].detach().clone() + ) + self.tilert_rms_norm_weight = ( + 
state_dict[self.tilert_weights_alias.unproj_o_gamma].detach().clone() + ) + self.is_tilert_weights_init = True + + def init_random_weights(self) -> None: + proj_weight = torch.randn(self.n_routed_experts, self.dim) + rms_norm_weight = torch.randn(self.dim, dtype=torch.float32) + ref_state_dict = dict( + zip( + self.ref_weights_alias(), + [rms_norm_weight, proj_weight], + ) + ) + self.init_reference_weights(ref_state_dict) + assert self.ref_rmsnorm is not None and self.ref_proj_weight is not None + sharded_weights = self.device_sharding(self.ref_rmsnorm.weight, self.ref_proj_weight) + self.init_tilert_weights(dict(zip(self.tilert_weights_alias(), sharded_weights))) + + def golden_forward( + self, x_in: torch.Tensor, residual: torch.Tensor | None = None + ) -> tuple[torch.Tensor, torch.Tensor]: + assert self.is_ref_weights_init, "Reference weights must be initialized before forward pass" + assert self.ref_rmsnorm is not None and self.ref_proj_weight is not None + norm_x = self.ref_rmsnorm(x_in, residual) + scores = linear(norm_x.view(-1, self.dim).float(), self.ref_proj_weight.float()) + return norm_x, scores + + def tilert_forward(self, x_in: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + assert self.is_tilert_weights_init, "Tilert weights must be initialized before forward pass" + assert self.tilert_rms_norm_weight is not None and self.tilert_proj_weight is not None + x_in = x_in.to(torch.bfloat16) + hidden_out = torch.zeros_like(x_in) + scores_out = torch.zeros( + (x_in.shape[0], x_in.shape[1], self.n_routed_experts), dtype=torch.float32 + ) + torch.ops.tilert.rmsnorm_expert_proj_op( + x_in, + self.tilert_rms_norm_weight, + self.tilert_proj_weight, + scores_out, + hidden_out, + self.model_args.arch_name, + "bf16", + self.profile_logs, + ) + return hidden_out, scores_out + + def __call__(self, x_in: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + return self.tilert_forward(x_in) diff --git a/python/models/deepseek_v3_2/ops/rmsnorm_head_proj.py 
b/tilert/models/glm_5/_dsa_v32/ops/rmsnorm_head_proj.py similarity index 84% rename from python/models/deepseek_v3_2/ops/rmsnorm_head_proj.py rename to tilert/models/glm_5/_dsa_v32/ops/rmsnorm_head_proj.py index 6145b5b..fa2086d 100644 --- a/python/models/deepseek_v3_2/ops/rmsnorm_head_proj.py +++ b/tilert/models/glm_5/_dsa_v32/ops/rmsnorm_head_proj.py @@ -1,74 +1,40 @@ """RMSNormHeadProj operation module.""" -from collections.abc import Callable from dataclasses import dataclass from enum import Enum import torch from tilert.models.base import TileRTModule, TilertWeightsConverter -from tilert.models.deepseek_v3_2.model_args import ModelArgs +from tilert.models.glm_5._dsa_v32.model_args import ModelArgs from tilert.utils import get_profile_log_tensor __all__ = [ "rmsnorm_head_proj", - "rmsnorm_head_proj_glm5", "RMSNormHeadProj", "RMSNormHeadProjTilertWeightsAlias", ] def rmsnorm_head_proj( - hidden_in: torch.Tensor, - gamma_in: torch.Tensor, - weight_in: torch.Tensor, - logits_out: torch.Tensor, - profile_logs: torch.Tensor, -) -> None: - """RMS Norm Head Projection operation.""" - torch.ops.tilert.rmsnorm_head_proj_op( - hidden_in, - gamma_in, - weight_in, - logits_out, - profile_logs, - ) - - -def rmsnorm_head_proj_dsv32( hidden_in: torch.Tensor, gamma_in: torch.Tensor, weight_in: torch.Tensor, hidden_rmsnorm_out: torch.Tensor, logits_out: torch.Tensor, profile_logs: torch.Tensor, + model_arch: str, + compute_kernel_type: str = "general", ) -> None: """RMS Norm Head Projection operation.""" - del hidden_rmsnorm_out torch.ops.tilert.rmsnorm_head_proj_op( - hidden_in, - gamma_in, - weight_in, - logits_out, - profile_logs, - ) - - -def rmsnorm_head_proj_glm5( - hidden_in: torch.Tensor, - gamma_in: torch.Tensor, - weight_in: torch.Tensor, - hidden_rmsnorm_out: torch.Tensor, - logits_out: torch.Tensor, - profile_logs: torch.Tensor, -) -> None: - """RMS Norm Head Projection operation.""" - torch.ops.tilert.rmsnorm_head_proj_glm5_op( hidden_in, gamma_in, weight_in, 
hidden_rmsnorm_out, logits_out, + model_arch, + compute_kernel_type, profile_logs, ) @@ -135,6 +101,11 @@ def __call__(self) -> list[str]: class RMSNormHeadProj(TileRTModule): """RMSNormHeadProj module""" + _SUPPORTED_ALGORITHMS = { + "deepseek_v3_2": [RMSNormHeadProjAlgorithm.GENERAL], + "glm_5": [RMSNormHeadProjAlgorithm.GENERAL], + } + def __init__( self, model_args: ModelArgs, @@ -155,34 +126,20 @@ def __init__( self.algorithm = algorithm self.eps = self.model_args.eps - # reference weights self.ref_rmsnorm_gamma: torch.Tensor | None = None self.ref_head_proj: torch.Tensor | None = None - # tilert weights self.tilert_rmsnorm_gamma: torch.Tensor | None = None self.tilert_head_proj: torch.Tensor | None = None - # tilert vars self.hidden_rmsnorm_out: torch.Tensor | None = None self.hidden_out: torch.Tensor | None = None self.profile_logs: torch.Tensor | None = None self.is_init = False - # tilert_funcs - self.rmsnorm_head_proj_func: Callable | None = None - - if self.arch_name == "deepseek_v3_2": - self.rmsnorm_head_proj_func = rmsnorm_head_proj_dsv32 - elif self.arch_name == "glm_5": - self.rmsnorm_head_proj_func = rmsnorm_head_proj_glm5 - else: - raise ValueError(f"Unsupported architecture: {self.arch_name}") - self.tilert_weights_alias = RMSNormHeadProjTilertWeightsAlias() - # reference tensor aliases self.ref_tensor_alias: list[str] = [ "model.norm.weight", "lm_head.weight", @@ -217,7 +174,6 @@ def device_sharding( rmsnorm_gamma_key = "model.norm.weight" head_proj_key = "lm_head.weight" rmsnorm_gamma = weights_dict[rmsnorm_gamma_key][None, ...] - # repeat number of devices times rmsnorm_gamma = rmsnorm_gamma.repeat(self.num_devices, 1) head_proj = weights_dict[head_proj_key] @@ -258,7 +214,6 @@ def init_tilert_vars(self, batch_size: int, seq_len: int) -> None: batch_size: Batch size. seq_len: Sequence length. 
""" - # tilert vars self.hidden_rmsnorm_out = torch.zeros( (batch_size, seq_len, self.dim), dtype=torch.bfloat16, @@ -319,16 +274,16 @@ def tilert_forward( self, hidden_in: torch.Tensor, ) -> torch.Tensor: - assert self.rmsnorm_head_proj_func is not None assert self.hidden_out is not None - self.rmsnorm_head_proj_func( + rmsnorm_head_proj( hidden_in, self.tilert_rmsnorm_gamma, self.tilert_head_proj, self.hidden_rmsnorm_out, self.hidden_out, self.profile_logs, + model_arch=self.model_args.arch_name, ) return self.hidden_out diff --git a/tilert/models/glm_5/_dsa_v32/ops/rmsnorm_kv.py b/tilert/models/glm_5/_dsa_v32/ops/rmsnorm_kv.py new file mode 100644 index 0000000..81d161c --- /dev/null +++ b/tilert/models/glm_5/_dsa_v32/ops/rmsnorm_kv.py @@ -0,0 +1,204 @@ +"""RMSNormKV operation module.""" + +from dataclasses import dataclass +from enum import Enum + +import torch + +from tilert.models.base import TileRTModule +from tilert.models.glm_5._dsa_v32.model_args import ModelArgs +from tilert.utils import get_profile_log_tensor + +__all__ = [ + "rmsnorm_kv", + "KVRMSNorm", + "KVRMSNormRefWeightsAlias", + "KVRMSNormTilertWeightsAlias", +] + + +def rmsnorm_kv( + kv: torch.Tensor, + gamma: torch.Tensor, + cur_pos: torch.Tensor, + kv_cache: torch.Tensor, + profile_logs: torch.Tensor, + model_arch: str, + compute_kernel_type: str = "general", +) -> None: + """ + Define the RMSNormKV operation. + + Args: + kv: Input tensor. + gamma: Weight tensor. + cur_pos: Current position tensor. + kv_cache: Output tensor. + profile_logs: Profile logs tensor. + model_arch: Model architecture string. + compute_kernel_type: Compute kernel type string. 
+ """ + torch.ops.tilert.rmsnorm_kv_op( + kv, gamma, cur_pos, kv_cache, model_arch, compute_kernel_type, profile_logs + ) + + +@dataclass +class KVRMSNormRefWeightsAlias: + """Reference weights alias for KVRMSNorm.""" + + kv_norm_weight = "self_attn.kv_a_layernorm.weight" + + @property + def ref_tensor_alias(self) -> list[str]: + return [self.kv_norm_weight] + + def __call__(self) -> list[str]: + return self.ref_tensor_alias + + +@dataclass +class KVRMSNormTilertWeightsAlias: + """TileRT weights alias for KVRMSNorm.""" + + kv_norm_gamma = "kv_rmsnorm_gamma" + + @property + def tilert_tensor_alias(self) -> list[str]: + return [self.kv_norm_gamma] + + def __call__(self) -> list[str]: + return self.tilert_tensor_alias + + +class KVRMSNormAlgorithm(Enum): + """KVRMSNorm algorithm.""" + + GENERAL = "general" + + +class KVRMSNorm(TileRTModule): + """KVRMSNorm module: RMSNorm on KV tensor with in-place write to kv_cache.""" + + _SUPPORTED_ALGORITHMS = { + "deepseek_v3_2": [KVRMSNormAlgorithm.GENERAL], + "glm_5": [KVRMSNormAlgorithm.GENERAL], + } + + def __init__( + self, + model_args: ModelArgs, + num_devices: int, + device_id: int, + ref_weights_alias: KVRMSNormRefWeightsAlias | None = None, + tilert_weights_alias: KVRMSNormTilertWeightsAlias | None = None, + layer_idx: int = 0, + golden_weights_dir: str = "", + tilert_weights_dir: str = "", + ): + super().__init__( + self.__class__.__name__, + model_args=model_args, + num_devices=num_devices, + device_id=device_id, + layer_idx=layer_idx, + golden_weights_dir=golden_weights_dir, + tilert_weights_dir=tilert_weights_dir, + ) + + self.tilert_weights_alias = ( + tilert_weights_alias + if tilert_weights_alias is not None + else KVRMSNormTilertWeightsAlias() + ) + self.ref_weights_alias = ( + ref_weights_alias if ref_weights_alias is not None else KVRMSNormRefWeightsAlias() + ) + + self.kv_lora_rank = self.model_args.kv_lora_rank + self.eps = self.model_args.eps + + self.ref_norm_gamma: torch.Tensor | None = None + 
self.tilert_kv_norm_weight: torch.Tensor | None = None + self.profile_logs: torch.Tensor | None = None + + def get_weights_list(self) -> list[torch.Tensor]: + return [self.tilert_kv_norm_weight] + + def device_sharding(self, weights_map: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]: + """ + Device sharding: replicate gamma for each device. + + Args: + weights_map: Map from ref weight alias to tensor. + + Returns: + Map from tilert weight alias to (num_devices, ...) tensors. + """ + gamma = weights_map[self.ref_weights_alias.kv_norm_weight][None, ...].repeat( + self.num_devices, 1 + ) + return {self.tilert_weights_alias.kv_norm_gamma: gamma} + + def init_reference_weights(self, state_dict: dict[str, torch.Tensor]) -> None: + """Initialize reference weights from state dict.""" + self.ref_norm_gamma = state_dict[self.ref_weights_alias.kv_norm_weight].contiguous() + assert ( + self.ref_norm_gamma.shape[-1] == self.kv_lora_rank + ), f"kv_norm weight shape must be ({self.kv_lora_rank},), got {self.ref_norm_gamma.shape}" + + def init_tilert_weights(self, state_dict: dict[str, torch.Tensor]) -> None: + """Initialize TileRT weights from state dict.""" + gamma = state_dict[self.tilert_weights_alias.kv_norm_gamma] + self.tilert_kv_norm_weight = gamma.float().detach().clone().contiguous() + + def init_tilert_vars(self, batch_size: int, seq_len: int) -> None: + """Allocate TileRT profiling buffer.""" + del batch_size, seq_len + self.profile_logs = get_profile_log_tensor() + self.is_var_init = True + + def init_random_weights(self) -> None: + """Initialize random reference and TileRT weights for testing.""" + ref_state_dict = { + self.ref_weights_alias.kv_norm_weight: torch.randn( + self.kv_lora_rank, dtype=torch.float32 + ), + } + self.init_reference_weights(ref_state_dict) + sharded = self.device_sharding(ref_state_dict) + self.init_tilert_weights({k: v[self.device_id] for k, v in sharded.items()}) + + def golden_forward( + self, kv: torch.Tensor, kv_cache: 
torch.Tensor, start_pos: int, bsz: int, seqlen: int + ) -> None: + """Reference forward: RMSNorm and write to kv_cache.""" + assert self.ref_norm_gamma is not None + end_pos = start_pos + seqlen + out = torch.nn.functional.rms_norm( + kv.float(), [kv.size(-1)], self.ref_norm_gamma, self.eps + ).to(kv.dtype) + kv_cache[:bsz, start_pos:end_pos].copy_(out) + + def tilert_forward( + self, kv: torch.Tensor, kv_cache: torch.Tensor, start_pos: int, bsz: int, seqlen: int + ) -> None: + del seqlen + assert self.tilert_kv_norm_weight is not None + assert self.profile_logs is not None + cur_pos = torch.tensor([start_pos], dtype=torch.int32, device=kv.device) + rmsnorm_kv( + kv, + self.tilert_kv_norm_weight, + cur_pos, + kv_cache[:bsz], + self.profile_logs, + model_arch=self.model_args.arch_name, + ) + + def __call__( + self, kv: torch.Tensor, kv_cache: torch.Tensor, start_pos: int, bsz: int, seqlen: int + ) -> None: + if self.flag_enable_tilert: + return self.tilert_forward(kv, kv_cache, start_pos, bsz, seqlen) + return self.golden_forward(kv, kv_cache, start_pos, bsz, seqlen) diff --git a/tilert/models/glm_5/_dsa_v32/ops/rmsnorm_projq_wqb.py b/tilert/models/glm_5/_dsa_v32/ops/rmsnorm_projq_wqb.py new file mode 100644 index 0000000..92d7a99 --- /dev/null +++ b/tilert/models/glm_5/_dsa_v32/ops/rmsnorm_projq_wqb.py @@ -0,0 +1,530 @@ +"""RmsnormProjqWqb operation module.""" + +import math +from dataclasses import dataclass +from enum import Enum + +import torch + +from tilert.models.base import TileRTModule, TilertWeightsConverter +from tilert.models.common import weight_dequant +from tilert.models.glm_5._dsa_v32.model_args import ModelArgs +from tilert.utils import get_profile_log_tensor + +__all__ = [ + "RmsnormProjqWqb", + "RmsnormProjqWqbAlgorithm", + "RmsnormProjqWqbWeightsConverter", +] + + +def rmsnorm_projq_wqb_op( + q: torch.Tensor, + wq_b: torch.Tensor, + wq_b_scales: torch.Tensor, + q_norm_weight: torch.Tensor, + q_nope: torch.Tensor, + q_pe: torch.Tensor, + 
profile_logs: torch.Tensor, + algorithm: str, + model_arch: str, +) -> None: + torch.ops.tilert.rmsnorm_proj_qb_op( + q, + wq_b, + wq_b_scales, + q_norm_weight, + q_nope, + q_pe, + model_arch, + algorithm, + profile_logs, + torch.empty(0, dtype=torch.int64, device=q.device), + ) + + +class RmsnormProjqWqbAlgorithm(Enum): + """RmsnormProjqWqb algorithm.""" + + FP16MMA = "fp16mma" + + +class RmsnormProjqWqbWeightsConverter(TilertWeightsConverter): + """Weights converter for RmsnormProjqWqb. + + Supports configurations where n_heads is not evenly divisible by + num_devices; in that case n_local_heads is padded and padded head + weight rows are zero-filled. + """ + + kBf16NumCtas = 80 + kGemvPageSize = 8 + + def __init__(self, model_args: ModelArgs, num_devices: int): + super().__init__(model_args=model_args, num_devices=num_devices) + + self.proc_groups = 8 + self.repeat = 16 + + self.block_size = self.model_args.block_size + + self.qk_nope_head_dim = self.model_args.qk_nope_head_dim + self.qk_rope_head_dim = self.model_args.qk_rope_head_dim + self.qk_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim + + self.needs_padding = self.model_args.n_heads % num_devices != 0 + self.n_local_heads = self._compute_n_local_heads( + self.model_args.n_heads, num_devices, self.qk_head_dim + ) + + self.q_lora_dim = self.model_args.q_lora_rank + self.q_lora_qdim = self.q_lora_dim // self.block_size + + self.qk_dim = self.qk_head_dim * self.n_local_heads + self.qk_qdim = self.qk_dim // self.block_size + + assert self.qk_dim % (self.kBf16NumCtas * self.kGemvPageSize) == 0, ( + f"qk_dim ({self.qk_dim}) must be divisible by " + f"kBf16NumCtas * kGemvPageSize ({self.kBf16NumCtas * self.kGemvPageSize})" + ) + assert self.qk_dim % self.block_size == 0, ( + f"qk_dim ({self.qk_dim}) must be divisible by block_size ({self.block_size}) " + f"for scale alignment" + ) + + @classmethod + def _compute_n_local_heads(cls, n_total_heads: int, num_devices: int, qk_head_dim: int) -> int: + 
"""Compute padded n_local_heads per device.""" + if n_total_heads % num_devices == 0: + return n_total_heads // num_devices + + base = math.ceil(n_total_heads / num_devices) + align_unit = cls.kBf16NumCtas * cls.kGemvPageSize + g = math.gcd(qk_head_dim, align_unit) + head_align = align_unit // g + return math.ceil(base / head_align) * head_align + + @staticmethod + def _redistribute_heads( + wq_b_full: torch.Tensor, + wq_b_scale_full: torch.Tensor, + n_total_heads: int, + n_local_heads: int, + num_devices: int, + qk_head_dim: int, + block_size: int, + ) -> tuple[list[torch.Tensor], list[torch.Tensor]]: + """Redistribute heads across devices with padding. + + Args: + wq_b_full: [n_total_heads * qk_head_dim, q_lora_dim] full weight. + wq_b_scale_full: [n_total_heads * qk_head_dim // block_size, q_lora_qdim] full scale. + n_total_heads: Total number of heads (e.g. 128). + n_local_heads: Target heads per GPU (padded, e.g. 20). + num_devices: Number of devices (e.g. 7). + qk_head_dim: Head dimension (e.g. 192). + block_size: Quantization block size (e.g. 128). + + Returns: + Lists of per-device (wq_b, wq_b_scale) with shape + [n_local_heads * qk_head_dim, q_lora_dim] and + [n_local_heads * qk_head_dim // block_size, q_lora_qdim]. 
+ """ + total_rows = n_total_heads * qk_head_dim + rows_per_dev = n_local_heads * qk_head_dim + scale_rows_per_dev = rows_per_dev // block_size + total_scale_rows = total_rows // block_size + + q_lora_dim = wq_b_full.shape[-1] + q_lora_qdim = wq_b_scale_full.shape[-1] + + assert rows_per_dev % block_size == 0, ( + f"n_local_heads * qk_head_dim ({rows_per_dev}) must be " + f"divisible by block_size ({block_size})" + ) + + wq_b_list = [] + scale_list = [] + for dev in range(num_devices): + start_row = dev * rows_per_dev + end_row = min(total_rows, start_row + rows_per_dev) + real_rows = max(0, end_row - start_row) + + dev_wqb = torch.zeros( + rows_per_dev, q_lora_dim, dtype=wq_b_full.dtype, device=wq_b_full.device + ) + if real_rows > 0: + dev_wqb[:real_rows] = wq_b_full[start_row:end_row] + + start_scale = dev * scale_rows_per_dev + end_scale = min(total_scale_rows, start_scale + scale_rows_per_dev) + real_scale_rows = max(0, end_scale - start_scale) + + dev_scale = torch.zeros( + scale_rows_per_dev, + q_lora_qdim, + dtype=wq_b_scale_full.dtype, + device=wq_b_scale_full.device, + ) + if real_scale_rows > 0: + dev_scale[:real_scale_rows] = wq_b_scale_full[start_scale:end_scale] + + wq_b_list.append(dev_wqb) + scale_list.append(dev_scale) + + return wq_b_list, scale_list + + @staticmethod + def _swizzle_mma_16x16(mat_in: torch.Tensor) -> torch.Tensor: + assert mat_in.shape[-2] == 16 and mat_in.shape[-1] == 16 + pre_shape = mat_in.shape[:-2] + mat_in = mat_in.reshape(*pre_shape, 2, 8, 2, 4, 2).transpose(-4, -3).transpose(-5, -4) + return mat_in.reshape(*pre_shape, 2 * 2, 8 * 4, 2).transpose(-3, -2) + + @staticmethod + def _swizzle_mma_16x16_for_pages( + mat_in: torch.Tensor, q_lora_dim: int, pages: int + ) -> torch.Tensor: + """Swizzle a 16xK matrix for the paged weight layout (K divisible by 16).""" + assert mat_in.shape[-2] == 16 and mat_in.shape[-1] == q_lora_dim + k_per_page = q_lora_dim // pages + n_k_tiles = k_per_page // 16 + pre_shape = mat_in.shape[:-2] + 
mat_in = mat_in.reshape(*pre_shape, 16, pages, k_per_page).transpose(-3, -2) + mat_in = mat_in.reshape(*pre_shape, pages, 16, n_k_tiles, 16).transpose(-3, -2) + mat_in = RmsnormProjqWqbWeightsConverter._swizzle_mma_16x16(mat_in) + return mat_in.contiguous() + + def _common_to_tilert_fp16mma( + self, + wq_b: torch.Tensor, + wq_b_scales_raw: torch.Tensor, + rmsnorm_gamma: torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Convert common weights to the packed TileRT FP16 layout.""" + pages = 2 + rows_per_cta = 32 + + qk_nope_dim = self.n_local_heads * self.qk_nope_head_dim + qk_pe_dim = self.n_local_heads * self.qk_rope_head_dim + nope_ctas = qk_nope_dim // rows_per_cta + pe_ctas = qk_pe_dim // rows_per_cta + num_ctas = nope_ctas + pe_ctas + + wq_b_scales_f32 = wq_b_scales_raw.to(torch.float32) + wq_b_scales_f32 = ( + wq_b_scales_f32.reshape(self.qk_qdim, 1, self.q_lora_qdim) + .repeat(1, self.block_size, 1) + .reshape(self.qk_dim, self.q_lora_qdim) + ) + + wq_b_scales_f32 = wq_b_scales_f32.reshape( + self.n_local_heads, self.qk_head_dim, self.q_lora_qdim + ) + scale_nope = wq_b_scales_f32[:, : self.qk_nope_head_dim, :].reshape(-1, self.q_lora_qdim) + scale_pe = wq_b_scales_f32[:, self.qk_nope_head_dim :, :].reshape(-1, self.q_lora_qdim) + + scale_nope = scale_nope.reshape( + nope_ctas, rows_per_cta, pages, self.q_lora_qdim // pages + ).transpose(1, 2)[:, :, 0, :] + scale_pe = scale_pe.reshape( + pe_ctas, rows_per_cta, pages, self.q_lora_qdim // pages + ).transpose(1, 2)[:, :, 0, :] + + scales = torch.cat([scale_nope, scale_pe], dim=0) + scales_fp8 = scales.contiguous().view(torch.float8_e4m3fn) + + wq_b = wq_b.reshape(self.n_local_heads, self.qk_head_dim, self.q_lora_dim) + wq_b_nope = wq_b[:, : self.qk_nope_head_dim, :].reshape(-1, self.q_lora_dim) + wq_b_pe = wq_b[:, self.qk_nope_head_dim :, :].reshape(-1, self.q_lora_dim) + + wq_b_nope = wq_b_nope.reshape(nope_ctas, rows_per_cta // 16, 16, self.q_lora_dim) + wq_b_nope = 
RmsnormProjqWqbWeightsConverter._swizzle_mma_16x16_for_pages( + wq_b_nope, self.q_lora_dim, pages + ) + wq_b_nope = ( + wq_b_nope.reshape(nope_ctas, rows_per_cta // 16, pages, 16, -1) + .transpose(1, 2) + .reshape(nope_ctas, pages, rows_per_cta, -1) + ) + + wq_b_pe = wq_b_pe.reshape(pe_ctas, rows_per_cta // 16, 16, self.q_lora_dim) + wq_b_pe = RmsnormProjqWqbWeightsConverter._swizzle_mma_16x16_for_pages( + wq_b_pe, self.q_lora_dim, pages + ) + wq_b_pe = ( + wq_b_pe.reshape(pe_ctas, rows_per_cta // 16, pages, 16, -1) + .transpose(1, 2) + .reshape(pe_ctas, pages, rows_per_cta, -1) + ) + + weights = torch.cat([wq_b_nope, wq_b_pe], dim=0) + weights = weights.reshape(num_ctas, pages, -1) + + scale_padding_size = 128 - scales_fp8.shape[-1] + scale_padding = torch.zeros( + num_ctas, + pages, + scale_padding_size, + dtype=torch.float8_e4m3fn, + device=wq_b.device, + ) + tilert_wqb = torch.cat([weights, scales_fp8, scale_padding], dim=-1).contiguous() + + tilert_wqb_scales = torch.zeros(1, dtype=torch.bfloat16) + tilert_gamma = rmsnorm_gamma.float().detach().clone() + return tilert_wqb, tilert_wqb_scales, tilert_gamma + + def convert_to_fp16mma( + self, weights: list[torch.Tensor] + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Convert common-format weights to TileRT FP16 MMA layout.""" + with torch.inference_mode(): + wq_b, wq_b_scale, q_norm_weight = weights + return self._common_to_tilert_fp16mma(wq_b, wq_b_scale, q_norm_weight) + + +@dataclass +class RmsnormProjqWqbRefWeightsAlias: + """Reference weights alias for RmsnormProjqWqb.""" + + rmsnorm_gamma = "self_attn.q_a_layernorm.weight" + wqb_weights = "self_attn.q_b_proj.weight" + wqb_scales = "self_attn.q_b_proj.weight_scale_inv" + + @property + def ref_tensor_alias(self) -> list[str]: + return [ + self.rmsnorm_gamma, + self.wqb_weights, + self.wqb_scales, + ] + + def __call__(self) -> list[str]: + return self.ref_tensor_alias + + +@dataclass +class RmsnormProjqWqbTilertWeightsAlias: + """TileRT weights 
alias for RmsnormProjqWqb.""" + + rmsnorm_gamma = "q_rmsnorm_gamma" + wqb_weights = "wqb_weights" + wqb_scales = "wqb_scales" + + @property + def tilert_tensor_alias(self) -> list[str]: + return [ + self.rmsnorm_gamma, + self.wqb_weights, + self.wqb_scales, + ] + + def __call__(self) -> list[str]: + return self.tilert_tensor_alias + + +class RmsnormProjqWqb(TileRTModule): + """RmsnormProjqWqb module: RMSNorm + Q projection (wq_b only).""" + + _SUPPORTED_ALGORITHMS = { + "deepseek_v3_2": [RmsnormProjqWqbAlgorithm.FP16MMA], + "glm_5": [RmsnormProjqWqbAlgorithm.FP16MMA], + } + + def __init__( + self, + model_args: ModelArgs, + device_id: int, + num_devices: int = 7, + ref_weights_alias: RmsnormProjqWqbRefWeightsAlias | None = None, + ): + super().__init__( + self.__class__.__name__, + model_args=model_args, + device_id=device_id, + num_devices=num_devices, + ) + + self.tilert_weights_alias = RmsnormProjqWqbTilertWeightsAlias() + self.ref_weights_alias = ( + ref_weights_alias if ref_weights_alias is not None else RmsnormProjqWqbRefWeightsAlias() + ) + + self.n_local_heads = RmsnormProjqWqbWeightsConverter._compute_n_local_heads( + model_args.n_heads, + num_devices, + model_args.qk_nope_head_dim + model_args.qk_rope_head_dim, + ) + self.q_lora_rank = model_args.q_lora_rank + self.n_heads = model_args.n_heads + self.qk_nope_head_dim = model_args.qk_nope_head_dim + self.qk_rope_head_dim = model_args.qk_rope_head_dim + self.qk_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim + self.qk_local_dim = self.qk_head_dim * self.n_local_heads + + self.block_size = model_args.block_size + self.q_lora_qdim = self.q_lora_rank // self.block_size + self.qk_local_qdim = self.qk_local_dim // self.block_size + self.eps = model_args.eps + + self.ref_q_norm: torch.Tensor | None = None + self.ref_wq_b: torch.Tensor | None = None + + self.tilert_wq_b: torch.Tensor | None = None + self.tilert_wq_b_scales: torch.Tensor | None = None + self.tilert_q_norm_weight: torch.Tensor | None = None 
+ + self.q_nope: torch.Tensor | None = None + self.q_pe: torch.Tensor | None = None + + self.profile_logs: torch.Tensor | None = None + + def get_weights_list(self) -> list[torch.Tensor]: + return [self.tilert_q_norm_weight, self.tilert_wq_b, self.tilert_wq_b_scales] + + def device_sharding(self, weights_map: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]: + """Redistribute heads across devices with padding.""" + gamma = weights_map[self.ref_weights_alias.rmsnorm_gamma][None, ...].repeat( + self.num_devices, 1 + ) + + wq_b_full = weights_map[self.ref_weights_alias.wqb_weights] + wq_b_scale_full = weights_map[self.ref_weights_alias.wqb_scales] + + wq_b_list, scale_list = RmsnormProjqWqbWeightsConverter._redistribute_heads( + wq_b_full, + wq_b_scale_full, + n_total_heads=self.n_heads, + n_local_heads=self.n_local_heads, + num_devices=self.num_devices, + qk_head_dim=self.qk_head_dim, + block_size=self.block_size, + ) + + sharded_wqb_weights = torch.stack(wq_b_list, dim=0) + sharded_wqb_scales = torch.stack(scale_list, dim=0) + + return { + self.tilert_weights_alias.rmsnorm_gamma: gamma, + self.tilert_weights_alias.wqb_weights: sharded_wqb_weights, + self.tilert_weights_alias.wqb_scales: sharded_wqb_scales, + } + + def init_reference_weights(self, state_dict: dict[str, torch.Tensor]) -> None: + """Initialize reference weights from common-format state dict.""" + self.ref_q_norm = state_dict[self.ref_weights_alias.rmsnorm_gamma] + + wq_b_full = state_dict[self.ref_weights_alias.wqb_weights] + wq_b_scale_full = state_dict[self.ref_weights_alias.wqb_scales] + + wq_b_bf16_full = weight_dequant(wq_b_full, wq_b_scale_full) + + total_rows = self.n_heads * self.qk_head_dim + rows_per_dev = self.n_local_heads * self.qk_head_dim + start_row = self.device_id * rows_per_dev + end_row = min(total_rows, start_row + rows_per_dev) + real_rows = max(0, end_row - start_row) + + dev_wqb = torch.zeros( + rows_per_dev, + wq_b_bf16_full.shape[-1], + dtype=wq_b_bf16_full.dtype, + 
device=wq_b_bf16_full.device, + ) + if real_rows > 0: + dev_wqb[:real_rows] = wq_b_bf16_full[start_row:end_row] + + self.ref_wq_b = dev_wqb.contiguous() + + def init_tilert_weights(self, state_dict: dict[str, torch.Tensor]) -> None: + """Initialize TileRT weights from common-format state dict.""" + weights = [ + state_dict[self.tilert_weights_alias.wqb_weights], + state_dict[self.tilert_weights_alias.wqb_scales], + state_dict[self.tilert_weights_alias.rmsnorm_gamma], + ] + assert self.algorithm is not None, "Algorithm is not set" + self.tilert_wq_b, self.tilert_wq_b_scales, self.tilert_q_norm_weight = ( + RmsnormProjqWqbWeightsConverter(self.model_args, self.num_devices).dispatch( + self.algorithm, weights + ) + ) + + def init_random_weights(self) -> None: + """Initialize random reference and TileRT weights for testing.""" + q_norm = torch.randn(self.q_lora_rank, dtype=torch.float32) + + wq_b = torch.randn(self.qk_local_dim, self.q_lora_rank, dtype=torch.bfloat16).to( + torch.float8_e4m3fn + ) + scale_dtype = torch.float32 if self.model_args.arch_name == "glm_5" else torch.bfloat16 + wq_b_scale = torch.randn(self.qk_local_qdim, self.q_lora_qdim, dtype=scale_dtype) + + self.ref_q_norm = q_norm + self.ref_wq_b = weight_dequant(wq_b, wq_b_scale).contiguous() + + assert self.algorithm is not None, "Algorithm is not set" + weights = [wq_b, wq_b_scale, q_norm] + self.tilert_wq_b, self.tilert_wq_b_scales, self.tilert_q_norm_weight = ( + RmsnormProjqWqbWeightsConverter(self.model_args, self.num_devices).dispatch( + self.algorithm, weights + ) + ) + + def init_tilert_vars(self, batch_size: int, seq_len: int) -> None: + """Allocate TileRT output buffers.""" + self.q_nope = torch.zeros( + batch_size, seq_len, self.n_local_heads, self.qk_nope_head_dim, dtype=torch.bfloat16 + ) + self.q_pe = torch.zeros( + batch_size, seq_len, self.n_local_heads, self.qk_rope_head_dim, dtype=torch.bfloat16 + ) + self.profile_logs = get_profile_log_tensor() + self.is_var_init = True + + def 
golden_forward(self, q: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + """Reference forward: RMSNorm + linear projection (no iq).""" + assert self.ref_q_norm is not None + assert self.ref_wq_b is not None + + bsz, seqlen, _ = q.shape + if bsz != 1 or seqlen not in [1, 2, 4]: + raise ValueError(f"Invalid batch size or sequence length: bsz={bsz}, seqlen={seqlen}") + + qr = torch.nn.functional.rms_norm(q.float(), [q.size(-1)], self.ref_q_norm, self.eps).to( + q.dtype + ) + + q_out = torch.matmul(qr, self.ref_wq_b.T) + q_out = q_out.view(bsz, seqlen, self.n_local_heads, self.qk_head_dim) + q_nope, q_pe = torch.split(q_out, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1) + return q_nope, q_pe + + def tilert_forward(self, q: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + assert self.tilert_wq_b is not None + assert self.tilert_wq_b_scales is not None + assert self.tilert_q_norm_weight is not None + assert self.q_nope is not None + assert self.q_pe is not None + assert self.profile_logs is not None + + bsz, seqlen, _ = q.shape + if bsz != 1 or seqlen not in [1, 2, 4]: + raise ValueError(f"Invalid batch size or sequence length: bsz={bsz}, seqlen={seqlen}") + + assert self.algorithm is not None, "Algorithm is not set" + + rmsnorm_projq_wqb_op( + q, + self.tilert_wq_b, + self.tilert_wq_b_scales, + self.tilert_q_norm_weight, + self.q_nope, + self.q_pe, + self.profile_logs, + self.algorithm.value, + model_arch=self.model_args.arch_name, + ) + + return self.q_nope, self.q_pe diff --git a/tilert/models/glm_5/_dsa_v32/ops/rmsnorm_projq_wqi.py b/tilert/models/glm_5/_dsa_v32/ops/rmsnorm_projq_wqi.py new file mode 100644 index 0000000..4f4d07f --- /dev/null +++ b/tilert/models/glm_5/_dsa_v32/ops/rmsnorm_projq_wqi.py @@ -0,0 +1,330 @@ +"""RmsnormProjqWqi operation module (IQ-only projection).""" + +from dataclasses import dataclass +from enum import Enum + +import torch +from einops import rearrange + +from tilert.models.base import TileRTModule, 
TilertWeightsConverter +from tilert.models.common import weight_dequant +from tilert.models.glm_5._dsa_v32.model_args import ModelArgs +from tilert.utils import get_profile_log_tensor + +__all__ = [ + "RmsnormProjqWqi", + "RmsnormProjqWqiAlgorithm", + "RmsnormProjqWqiWeightsConverter", +] + + +def rmsnorm_projq_wqi_op( + q: torch.Tensor, + wqi: torch.Tensor, + wqi_scale: torch.Tensor, + rmsnorm_gamma: torch.Tensor, + iq: torch.Tensor, + profile_logs: torch.Tensor, + algorithm: str, + model_arch: str, +) -> None: + torch.ops.tilert.rmsnorm_proj_qi_op( + q, + wqi, + wqi_scale, + rmsnorm_gamma, + iq, + model_arch, + algorithm, + profile_logs, + ) + + +class RmsnormProjqWqiAlgorithm(Enum): + """RmsnormProjqWqi algorithm.""" + + FP16MMA = "fp16mma" + + +class RmsnormProjqWqiWeightsConverter(TilertWeightsConverter): + """Weights converter: common format to TileRT format (IQ only).""" + + def __init__(self, model_args: ModelArgs, num_devices: int): + super().__init__(model_args=model_args, num_devices=num_devices) + + self.block_size = self.model_args.block_size + self.q_lora_dim = self.model_args.q_lora_rank + self.q_lora_qdim = self.q_lora_dim // self.block_size + + self.index_n_heads = self.model_args.index_n_heads + self.index_head_dim = self.index_n_heads * self.model_args.index_head_dim + self.index_head_qdim = self.index_head_dim // self.block_size + + @staticmethod + def _swizzle_mma_16x16(mat_in: torch.Tensor) -> torch.Tensor: + assert mat_in.shape[-2] == 16 and mat_in.shape[-1] == 16 + pre_shape = mat_in.shape[:-2] + mat_in = mat_in.reshape(*pre_shape, 2, 8, 2, 4, 2).transpose(-4, -3).transpose(-5, -4) + return mat_in.reshape(*pre_shape, 2 * 2, 8 * 4, 2).transpose(-3, -2) + + @staticmethod + def _swizzle_mma_16x16_for_pages( + mat_in: torch.Tensor, q_lora_rank: int, pages: int + ) -> torch.Tensor: + """Swizzle a 16xK matrix for the paged weight layout (K divisible by 16).""" + assert mat_in.shape[-2] == 16 and mat_in.shape[-1] == q_lora_rank + pre_shape = 
mat_in.shape[:-2] + k_per_page = q_lora_rank // pages + n_k_tiles = k_per_page // 16 + mat_in = mat_in.reshape(*pre_shape, 16, pages, k_per_page).transpose(-3, -2) + mat_in = mat_in.reshape(*pre_shape, pages, 16, n_k_tiles, 16).transpose(-3, -2) + mat_in = RmsnormProjqWqiWeightsConverter._swizzle_mma_16x16(mat_in) + return mat_in.contiguous() + + def _common_to_tilert_fp16mma( + self, + wqi: torch.Tensor, + wqi_scales: torch.Tensor, + rmsnorm_gamma: torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Convert common weights to the packed TileRT FP16 layout (IQ only).""" + sms = 128 + k_per_page = 1024 if self.model_args.arch_name == "glm_5" else 512 + pages = self.q_lora_dim // k_per_page + iq_dim_per_sm = self.index_head_dim // sms + + wqi_scales_f32 = wqi_scales.to(torch.float32) + wqi_scales_f32 = ( + wqi_scales_f32.reshape(self.index_head_qdim, 1, self.q_lora_qdim) + .repeat(1, self.block_size, 1) + .reshape(self.index_head_dim, self.q_lora_qdim) + ) + wqi_scales_f32 = wqi_scales_f32.reshape( + sms, iq_dim_per_sm, pages, self.q_lora_qdim // pages + ).transpose(1, 2) + wqi_scales_f32 = wqi_scales_f32[:, :, 0, :] + wqi_full_scales = wqi_scales_f32.contiguous().view(torch.float8_e4m3fn) + + wqi_mat = wqi.reshape(sms, iq_dim_per_sm // 16, 16, self.q_lora_dim) + wqi_mat = RmsnormProjqWqiWeightsConverter._swizzle_mma_16x16_for_pages( + wqi_mat, self.q_lora_dim, pages + ) + wqi_mat = ( + wqi_mat.reshape(sms, iq_dim_per_sm // 16, pages, 16, -1) + .transpose(1, 2) + .reshape(sms, pages, iq_dim_per_sm, -1) + ) + wqi_mat = wqi_mat.reshape(sms, pages, -1) + + wqi_scales_padding = torch.zeros( + sms, + pages, + 128 - wqi_full_scales.shape[-1], + dtype=torch.float8_e4m3fn, + device=wqi.device, + ) + tilert_wqi = torch.cat([wqi_mat, wqi_full_scales, wqi_scales_padding], dim=-1).contiguous() + tilert_wqi_scales = torch.zeros(1, dtype=torch.bfloat16) + tilert_gamma = rmsnorm_gamma.float().detach().clone() + return tilert_wqi, tilert_wqi_scales, 
tilert_gamma + + def convert_to_fp16mma( + self, weights: list[torch.Tensor] + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Convert common-format weights to TileRT FP16 MMA layout. + + Args: + weights: [wqi, wqi_scale, q_norm_weight]. + """ + with torch.inference_mode(): + wqi, wqi_scale, q_norm_weight = weights + return self._common_to_tilert_fp16mma(wqi, wqi_scale, q_norm_weight) + + +@dataclass +class RmsnormProjqWqiRefWeightsAlias: + """Reference (HuggingFace) weights alias for RmsnormProjqWqi.""" + + rmsnorm_gamma = "self_attn.q_a_layernorm.weight" + wqi_weights = "self_attn.indexer.wq_b.weight" + wqi_scales = "self_attn.indexer.wq_b.weight_scale_inv" + + @property + def ref_tensor_alias(self) -> list[str]: + return [self.rmsnorm_gamma, self.wqi_weights, self.wqi_scales] + + def __call__(self) -> list[str]: + return self.ref_tensor_alias + + +@dataclass +class RmsnormProjqWqiTilertWeightsAlias: + """TileRT weights alias for RmsnormProjqWqi.""" + + rmsnorm_gamma = "q_rmsnorm_gamma_qi" + wqi_weights = "wqi_weights" + wqi_scales = "wqi_scales" + + @property + def tilert_tensor_alias(self) -> list[str]: + return [self.rmsnorm_gamma, self.wqi_weights, self.wqi_scales] + + def __call__(self) -> list[str]: + return self.tilert_tensor_alias + + +class RmsnormProjqWqi(TileRTModule): + """RmsnormProjqWqi module: RMSNorm + W_qi projection (IQ only, GLM5 v2).""" + + _SUPPORTED_ALGORITHMS = { + "deepseek_v3_2": [RmsnormProjqWqiAlgorithm.FP16MMA], + "glm_5": [RmsnormProjqWqiAlgorithm.FP16MMA], + } + + def __init__( + self, + model_args: ModelArgs, + device_id: int, + num_devices: int, + ): + super().__init__( + self.__class__.__name__, + model_args=model_args, + device_id=device_id, + num_devices=num_devices, + ) + + self.tilert_weights_alias = RmsnormProjqWqiTilertWeightsAlias() + self.ref_weights_alias = RmsnormProjqWqiRefWeightsAlias() + + self.q_lora_rank = model_args.q_lora_rank + self.index_n_heads = model_args.index_n_heads + self.head_dim = 
model_args.index_head_dim + self.index_head_dim = model_args.index_n_heads * model_args.index_head_dim + + self.block_size = model_args.block_size + self.q_lora_qdim = self.q_lora_rank // self.block_size + self.index_head_qdim = self.index_head_dim // self.block_size + self.eps = model_args.eps + + self.ref_q_norm: torch.Tensor | None = None + self.ref_wqi: torch.Tensor | None = None + + self.tilert_wqi: torch.Tensor | None = None + self.tilert_wqi_scales: torch.Tensor | None = None + self.tilert_q_norm_weight: torch.Tensor | None = None + + self.iq: torch.Tensor | None = None + self.profile_logs: torch.Tensor | None = None + + def get_weights_list(self) -> list[torch.Tensor]: + return [self.tilert_q_norm_weight, self.tilert_wqi, self.tilert_wqi_scales] + + def device_sharding(self, weights_map: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]: + """Replicate IQ weights across devices (no per-head redistribution needed).""" + gamma = ( + weights_map[self.ref_weights_alias.rmsnorm_gamma][None, ...] 
+ .float() + .repeat(self.num_devices, 1) + ) + wqi_weights = weights_map[self.ref_weights_alias.wqi_weights][None, ...].repeat( + self.num_devices, 1, 1 + ) + wqi_scales = weights_map[self.ref_weights_alias.wqi_scales][None, ...].repeat( + self.num_devices, 1, 1 + ) + return { + self.tilert_weights_alias.rmsnorm_gamma: gamma, + self.tilert_weights_alias.wqi_weights: wqi_weights, + self.tilert_weights_alias.wqi_scales: wqi_scales, + } + + def init_reference_weights(self, state_dict: dict[str, torch.Tensor]) -> None: + """Initialize reference weights from common-format state dict.""" + self.ref_q_norm = state_dict[self.tilert_weights_alias.rmsnorm_gamma] + wqi = weight_dequant( + state_dict[self.tilert_weights_alias.wqi_weights], + state_dict[self.tilert_weights_alias.wqi_scales], + ) + self.ref_wqi = wqi.contiguous() + + def init_tilert_weights(self, state_dict: dict[str, torch.Tensor]) -> None: + """Initialize TileRT weights from common-format state dict.""" + weights = [ + state_dict[self.tilert_weights_alias.wqi_weights], + state_dict[self.tilert_weights_alias.wqi_scales], + state_dict[self.tilert_weights_alias.rmsnorm_gamma], + ] + assert self.algorithm is not None, "Algorithm is not set" + self.tilert_wqi, self.tilert_wqi_scales, self.tilert_q_norm_weight = ( + RmsnormProjqWqiWeightsConverter(self.model_args, self.num_devices).dispatch( + self.algorithm, weights + ) + ) + + def init_random_weights(self) -> None: + """Initialize random reference and TileRT weights for testing.""" + q_norm = torch.randn(self.q_lora_rank, dtype=torch.float32) + wqi = torch.randn(self.index_head_dim, self.q_lora_rank, dtype=torch.bfloat16).to( + torch.float8_e4m3fn + ) + scale_dtype = torch.float32 if self.model_args.arch_name == "glm_5" else torch.bfloat16 + wqi_scale = torch.randn(self.index_head_qdim, self.q_lora_qdim, dtype=scale_dtype) + + ref_state = { + self.tilert_weights_alias.rmsnorm_gamma: q_norm, + self.tilert_weights_alias.wqi_weights: wqi, + 
self.tilert_weights_alias.wqi_scales: wqi_scale, + } + + self.init_reference_weights(ref_state) + self.init_tilert_weights(ref_state) + + def init_tilert_vars(self, batch_size: int, seq_len: int) -> None: + """Allocate TileRT output buffers.""" + self.iq = torch.zeros( + batch_size, seq_len, self.index_n_heads, self.head_dim, dtype=torch.bfloat16 + ) + self.profile_logs = get_profile_log_tensor() + self.is_var_init = True + + def golden_forward(self, q: torch.Tensor) -> torch.Tensor: + """Reference forward: RMSNorm + W_qi_b linear projection.""" + assert self.ref_q_norm is not None + assert self.ref_wqi is not None + + bsz, seqlen, _ = q.shape + if bsz != 1 or seqlen not in [1, 2, 4, 8]: + raise ValueError(f"Invalid batch size or sequence length: bsz={bsz}, seqlen={seqlen}") + + qr = torch.nn.functional.rms_norm(q.float(), [q.size(-1)], self.ref_q_norm, self.eps).to( + q.dtype + ) + + return rearrange(torch.matmul(qr, self.ref_wqi.T), "b s (h d) -> b s h d", d=self.head_dim) + + def tilert_forward(self, q: torch.Tensor) -> torch.Tensor: + assert self.tilert_wqi is not None + assert self.tilert_wqi_scales is not None + assert self.tilert_q_norm_weight is not None + assert self.iq is not None + assert self.profile_logs is not None + + bsz, seqlen, _ = q.shape + if bsz != 1 or seqlen not in [1, 2, 4, 8]: + raise ValueError(f"Invalid batch size or sequence length: bsz={bsz}, seqlen={seqlen}") + + assert self.algorithm is not None, "Algorithm is not set" + + rmsnorm_projq_wqi_op( + q, + self.tilert_wqi, + self.tilert_wqi_scales, + self.tilert_q_norm_weight, + self.iq, + self.profile_logs, + self.algorithm.value, + model_arch=self.model_args.arch_name, + ) + + return self.iq diff --git a/tilert/models/glm_5/_dsa_v32/ops/rmsnorm_projx_wqakis.py b/tilert/models/glm_5/_dsa_v32/ops/rmsnorm_projx_wqakis.py new file mode 100644 index 0000000..8813d6a --- /dev/null +++ b/tilert/models/glm_5/_dsa_v32/ops/rmsnorm_projx_wqakis.py @@ -0,0 +1,341 @@ +"""RMSNormProjxWqakis operation 
module.""" + +from enum import Enum + +import torch + +from tilert.models.base import TileRTModule, TilertWeightsConverter +from tilert.models.common import weight_dequant +from tilert.models.glm_5._dsa_v32.model_args import ModelArgs +from tilert.models.glm_5._dsa_v32.ops.projx_wis import projx_wis +from tilert.models.glm_5._dsa_v32.ops.projx_wqaki import ( + ProjxWqakiWeightsConverter, + projx_wqaki, +) +from tilert.models.glm_5._dsa_v32.ops.rmsnorm_quant import rmsnorm_quant +from tilert.utils import get_profile_log_tensor + +__all__ = [ + "RMSNormProjxWqakis", +] + + +class RMSNormProjxWqakisWeightsConverter(TilertWeightsConverter): + """Weight converter for RMSNormProjxWqakis.""" + + def __init__(self, model_args: ModelArgs, num_devices: int): + super().__init__(model_args, num_devices) + + def convert_to_decoupled( + self, weights: list[torch.Tensor] + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Convert weights to decoupled FP8 MMA format. + + Args: + weights: [gamma, wq_a, wq_a_scale, wki, wki_scale, wis, wis_scale] + + Returns: + (wqaki_packed, wis_bf16, gamma) + """ + arch_name = self.model_args.arch_name + x_rmsnorm_gamma, wq_a, wq_a_scale, wki, wki_scale, wis, _wis_scale = weights + + if arch_name == "deepseek_v3_2": + wqaki_packed = ProjxWqakiWeightsConverter.convert_dsv32( + wq_a, wq_a_scale, wki, wki_scale + ) + elif arch_name == "glm_5": + wqaki_packed = ProjxWqakiWeightsConverter.convert_glm5_68cta( + wq_a, wq_a_scale, wki, wki_scale + ) + else: + raise ValueError(f"Unsupported architecture: {arch_name}") + + wis_bf16 = wis.to(torch.bfloat16) + return wqaki_packed, wis_bf16, x_rmsnorm_gamma.float() + + +class RMSNormProjxWqakisRefWeightsAlias: + """Reference weight aliases for RMSNormProjxWqakis.""" + + x_rmsnorm_gamma = "input_layernorm.weight" + q_a_weights = "self_attn.q_a_proj.weight" + q_a_scales = "self_attn.q_a_proj.weight_scale_inv" + wk_weights = "self_attn.indexer.wk.weight" + wk_scales = 
"self_attn.indexer.wk.weight_scale_inv" + wis_weights = "self_attn.indexer.weights_proj.weight" + + @property + def ref_tensor_alias(self) -> list[str]: + return [ + self.x_rmsnorm_gamma, + self.q_a_weights, + self.q_a_scales, + self.wk_weights, + self.wk_scales, + self.wis_weights, + ] + + def __call__(self) -> list[str]: + return self.ref_tensor_alias + + +class RMSNormProjxWqakisTilertWeightsAlias: + """Tilert weight aliases for RMSNormProjxWqakis.""" + + x_rmsnorm_gamma = "x_rmsnorm_gamma" + q_a_weights = "q_a_weights" + q_a_scales = "q_a_scales" + wk_weights = "wk_weights" + wk_scales = "wk_scales" + wis_weights = "wis_weights" + wis_scales = "wis_scales" + + @property + def tilert_tensor_alias(self) -> list[str]: + return [ + self.x_rmsnorm_gamma, + self.q_a_weights, + self.q_a_scales, + self.wk_weights, + self.wk_scales, + self.wis_weights, + self.wis_scales, + ] + + def __call__(self) -> list[str]: + return self.tilert_tensor_alias + + +class RMSNormProjxWqakisAlgorithm(Enum): + """RMSNormProjxWqakis algorithm.""" + + FP8MMA = "fp8mma" + + +class RMSNormProjxWqakis(TileRTModule): + """Decoupled RMSNorm + GEMV(W_q_a, W_ki, W_is).""" + + _SUPPORTED_ALGORITHMS = { + "deepseek_v3_2": [RMSNormProjxWqakisAlgorithm.FP8MMA], + "glm_5": [RMSNormProjxWqakisAlgorithm.FP8MMA], + } + + def __init__( + self, + model_args: ModelArgs, + num_devices: int, + device_id: int, + ref_weights_alias: RMSNormProjxWqakisRefWeightsAlias | None = None, + ): + super().__init__( + self.__class__.__name__, + model_args=model_args, + num_devices=num_devices, + device_id=device_id, + ) + + self.tilert_weights_alias = RMSNormProjxWqakisTilertWeightsAlias() + self.ref_weights_alias = ( + ref_weights_alias + if ref_weights_alias is not None + else RMSNormProjxWqakisRefWeightsAlias() + ) + + self.arch_name = self.model_args.arch_name + self.dim = self.model_args.dim + self.q_lora_rank = self.model_args.q_lora_rank + self.idx_head_dim = self.model_args.index_head_dim + self.idx_score_dim = 
self.model_args.index_n_heads + self.block_size = self.model_args.block_size + self.eps = self.model_args.eps + + self.ref_norm_gamma: torch.Tensor | None = None + self.ref_wq_a: torch.Tensor | None = None + self.ref_wki: torch.Tensor | None = None + self.ref_wis: torch.Tensor | None = None + + self.tilert_norm_gamma: torch.Tensor | None = None + self.tilert_wqakis: torch.Tensor | None = None + self.tilert_wis: torch.Tensor | None = None + + self.q_out: torch.Tensor | None = None + self.ki_out: torch.Tensor | None = None + self.idx_scores_out: torch.Tensor | None = None + self.x_rmsnorm_out: torch.Tensor | None = None + self.x_rmsnorm_quant_out: torch.Tensor | None = None + self.x_rmsnorm_quant_scale_out: torch.Tensor | None = None + self.profile_logs: torch.Tensor | None = None + self.is_init = False + + if self.arch_name == "glm_5": + self.compute_kernel_type = "fp8mma_68cta" + else: + self.compute_kernel_type = "fp8mma" + + self.tilert_tensor_alias: list[str] = [ + "x_rmsnorm_gamma", + "qakis_weights", + "qakis_scales", + ] + + def get_weights_list(self) -> list[torch.Tensor]: + return [self.tilert_norm_gamma, self.tilert_wqakis, self.tilert_wis] + + def device_sharding(self, weights_map: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]: + """Repeat weights for device sharding.""" + input_layernorm_weight = ( + weights_map[self.ref_weights_alias.x_rmsnorm_gamma][None, ...] 
+ .float() + .repeat(self.num_devices, 1) + ) + q_a_proj_weight = weights_map[self.ref_weights_alias.q_a_weights][None, ...].repeat( + self.num_devices, 1, 1 + ) + q_a_proj_weight_scale = weights_map[self.ref_weights_alias.q_a_scales][None, ...].repeat( + self.num_devices, 1, 1 + ) + wk_weight = weights_map[self.ref_weights_alias.wk_weights][None, ...].repeat( + self.num_devices, 1, 1 + ) + wk_weight_scale = weights_map[self.ref_weights_alias.wk_scales][None, ...].repeat( + self.num_devices, 1, 1 + ) + wis_weight = weights_map[self.ref_weights_alias.wis_weights][None, ...].repeat( + self.num_devices, 1, 1 + ) + is_n_rows = weights_map[self.ref_weights_alias.wis_weights].shape[0] + is_scale_rows = (is_n_rows + self.block_size - 1) // self.block_size + is_scale_cols = self.dim // self.block_size + wis_weight_scale = torch.ones( + self.num_devices, is_scale_rows, is_scale_cols, dtype=torch.bfloat16 + ) + return { + self.tilert_weights_alias.x_rmsnorm_gamma: input_layernorm_weight, + self.tilert_weights_alias.q_a_weights: q_a_proj_weight, + self.tilert_weights_alias.q_a_scales: q_a_proj_weight_scale, + self.tilert_weights_alias.wk_weights: wk_weight, + self.tilert_weights_alias.wk_scales: wk_weight_scale, + self.tilert_weights_alias.wis_weights: wis_weight, + self.tilert_weights_alias.wis_scales: wis_weight_scale, + } + + def init_reference_weights(self, state_dict: dict[str, torch.Tensor]) -> None: + aliases = self.ref_weights_alias() + self.ref_norm_gamma = state_dict[aliases[0]] + self.ref_wq_a = weight_dequant(state_dict[aliases[1]], state_dict[aliases[2]]) + self.ref_wki = weight_dequant(state_dict[aliases[3]], state_dict[aliases[4]]) + self.ref_wis = state_dict[aliases[5]].to(torch.bfloat16) + + assert self.ref_norm_gamma.shape[-1] == self.dim + assert self.ref_wq_a.shape == (self.q_lora_rank, self.dim) + assert self.ref_wki.shape == (self.idx_head_dim, self.dim) + assert self.ref_wis.shape == (self.idx_score_dim, self.dim) + + def init_tilert_weights(self, 
state_dict: dict[str, torch.Tensor]) -> None: + tilert_aliases = self.tilert_weights_alias() + weights_list = [state_dict[alias] for alias in tilert_aliases] + converter = RMSNormProjxWqakisWeightsConverter(self.model_args, self.num_devices) + result = converter.convert_to_decoupled(weights_list) + self.tilert_wqakis, self.tilert_wis, self.tilert_norm_gamma = result + + def init_tilert_vars(self, batch_size: int, seq_len: int) -> None: + self.q_out = torch.zeros((batch_size, seq_len, self.q_lora_rank), dtype=torch.bfloat16) + self.ki_out = torch.zeros((batch_size, seq_len, self.idx_head_dim), dtype=torch.bfloat16) + self.idx_scores_out = torch.zeros( + (batch_size, seq_len, self.idx_score_dim), dtype=torch.bfloat16 + ) + self.x_rmsnorm_out = torch.zeros((batch_size, seq_len, self.dim), dtype=torch.bfloat16) + self.x_rmsnorm_quant_out = torch.zeros( + (batch_size, seq_len, self.dim), dtype=torch.float8_e4m3fn + ) + self.x_rmsnorm_quant_scale_out = torch.zeros( + (batch_size, seq_len, self.dim // self.block_size), dtype=torch.float32 + ) + self.profile_logs = get_profile_log_tensor() + self.is_init = True + + def init_random_weights(self) -> None: + bs = self.block_size + dim_scale_dim = self.dim // bs + q_scale_dim = (self.q_lora_rank + bs - 1) // bs + ki_scale_dim = (self.idx_head_dim + bs - 1) // bs + scale_dtype = torch.float32 if self.arch_name == "glm_5" else torch.bfloat16 + + tensor_list = [ + torch.randn(self.dim, dtype=torch.float32), + torch.randn(self.q_lora_rank, self.dim, dtype=torch.bfloat16).to(torch.float8_e4m3fn), + torch.randn(q_scale_dim, dim_scale_dim, dtype=scale_dtype), + torch.randn(self.idx_head_dim, self.dim, dtype=torch.bfloat16).to(torch.float8_e4m3fn), + torch.randn(ki_scale_dim, dim_scale_dim, dtype=scale_dtype), + torch.randn(self.idx_score_dim, self.dim, dtype=torch.bfloat16), + ] + ref_state_dict = dict(zip(self.ref_weights_alias(), tensor_list)) + self.init_reference_weights(ref_state_dict) + self.init_tilert_weights( + {_k: 
_v[self.device_id] for _k, _v in self.device_sharding(ref_state_dict).items()} + ) + + def golden_forward( + self, + x: torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Pure PyTorch reference: RMSNorm -> q, ki, idx_scores.""" + assert self.ref_norm_gamma is not None + assert self.ref_wq_a is not None + assert self.ref_wki is not None + assert self.ref_wis is not None + + x_rmsnorm = torch.nn.functional.rms_norm( + x.float(), [x.size(-1)], self.ref_norm_gamma, self.eps + ) + q_out = torch.matmul(x_rmsnorm.float(), self.ref_wq_a.transpose(0, 1).float()) + ki_out = torch.matmul(x_rmsnorm.float(), self.ref_wki.transpose(0, 1).float()) + idx_scores_out = torch.matmul(x_rmsnorm.float(), self.ref_wis.transpose(0, 1).float()) + return ( + q_out.to(torch.bfloat16), + ki_out.to(torch.bfloat16), + idx_scores_out.to(torch.bfloat16), + ) + + def tilert_forward( + self, + x: torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Run RMSNorm + ProjXWqaki + ProjXWis via TileRT CUDA kernels.""" + rmsnorm_quant( + x.to(torch.bfloat16), + self.tilert_norm_gamma, + self.x_rmsnorm_out, + self.x_rmsnorm_quant_out, + self.x_rmsnorm_quant_scale_out, + self.profile_logs, + model_arch=self.model_args.arch_name, + ) + projx_wqaki( + self.x_rmsnorm_quant_out, + self.x_rmsnorm_quant_scale_out, + self.tilert_wqakis, + self.q_out, + self.ki_out, + self.profile_logs, + self.compute_kernel_type, + model_arch=self.model_args.arch_name, + ) + wis_compute_kernel_type = "bf16" + projx_wis( + self.x_rmsnorm_out, + self.tilert_wis, + self.idx_scores_out, + wis_compute_kernel_type, + self.profile_logs, + model_arch=self.model_args.arch_name, + ) + + return self.q_out, self.ki_out, self.idx_scores_out + + def __call__( + self, + x: torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + return self.golden_forward(x) diff --git a/tilert/models/glm_5/_dsa_v32/ops/rmsnorm_projx_wqkva.py b/tilert/models/glm_5/_dsa_v32/ops/rmsnorm_projx_wqkva.py new 
file mode 100644 index 0000000..5343357 --- /dev/null +++ b/tilert/models/glm_5/_dsa_v32/ops/rmsnorm_projx_wqkva.py @@ -0,0 +1,516 @@ +"""RMSNormProjxWqkva operation module.""" + +from enum import Enum + +import torch + +from tilert.models.base import TileRTModule, TilertWeightsConverter +from tilert.models.common import weight_dequant +from tilert.models.glm_5._dsa_v32.model_args import ModelArgs +from tilert.utils import get_profile_log_tensor + +__all__ = [ + "RMSNormProjxWqkva", + "RMSNormProjxWqkvaAlgorithm", +] + + +class RMSNormProjQKVAFP8MMAWeightsConverter: + """Weight converter: pack FP8 weights into the kernel's packed layout.""" + + HIDDEN_DIM = 6144 + Q_LORA_RANK = 2048 + KV_LORA_RANK = 512 + QK_ROPE_HEAD_DIM = 64 + TOTAL_ROWS = Q_LORA_RANK + KV_LORA_RANK + QK_ROPE_HEAD_DIM + ROWS_PER_CTA = 32 + NUM_CTAS = TOTAL_ROWS // ROWS_PER_CTA + COLS_PER_PAGE = 1024 + NUM_PAGES = HIDDEN_DIM // COLS_PER_PAGE + SCALES_PER_PAGE = COLS_PER_PAGE // 128 + BLOCK_SIZE = 128 + + MAT_BYTES = ROWS_PER_CTA * COLS_PER_PAGE + SCALE_OFFSET = MAT_BYTES + PAGE_BYTES = ((MAT_BYTES + 128 + 127) // 128) * 128 + + @staticmethod + def _swizzle_mma_16x32(mat_in: torch.Tensor) -> torch.Tensor: + """Swizzle [*, 16, 32] tiles into the packed weight layout.""" + assert mat_in.shape[-2] == 16 and mat_in.shape[-1] == 32 + pre_shape = mat_in.shape[:-2] + mat_in = mat_in.reshape(*pre_shape, 2, 8, 2, 4, 4).transpose(-4, -3).transpose(-5, -4) + return mat_in.reshape(*pre_shape, 2 * 2, 8 * 4, 4).transpose(-3, -2) + + @staticmethod + def convert_to_fp8_mma_gemv( + wq_a: torch.Tensor, + wq_a_scale: torch.Tensor, + wkv_a: torch.Tensor, + wkv_a_scale: torch.Tensor, + w_pe: torch.Tensor, + w_pe_scale: torch.Tensor, + attn_norm_weight: torch.Tensor, + *, + hidden_dim: int = 6144, + q_lora_rank: int = 2048, + ) -> tuple[torch.Tensor, torch.Tensor]: + """Pack FP8 weights for the FP8 MMA kernel. + + Args: + hidden_dim: Model hidden dimension. + q_lora_rank: Q projection rank. 
+ """ + C = RMSNormProjQKVAFP8MMAWeightsConverter + block_size = C.BLOCK_SIZE + kv_lora_rank = C.KV_LORA_RANK + qk_rope_head_dim = C.QK_ROPE_HEAD_DIM + + expected = q_lora_rank * hidden_dim + assert wq_a.numel() == expected, f"wq_a numel {wq_a.numel()} != expected {expected}" + expected = kv_lora_rank * hidden_dim + assert wkv_a.numel() == expected, f"wkv_a numel {wkv_a.numel()} != expected {expected}" + expected = qk_rope_head_dim * hidden_dim + assert w_pe.numel() == expected, f"w_pe numel {w_pe.numel()} != expected {expected}" + + total_rows = q_lora_rank + kv_lora_rank + qk_rope_head_dim + num_ctas = total_rows // C.ROWS_PER_CTA + num_pages = hidden_dim // C.COLS_PER_PAGE + + wq_a_f = weight_dequant(wq_a.reshape(q_lora_rank, hidden_dim), wq_a_scale) + wkv_a_f = weight_dequant(wkv_a.reshape(kv_lora_rank, hidden_dim), wkv_a_scale) + w_pe_f = weight_dequant(w_pe.reshape(qk_rope_head_dim, hidden_dim), w_pe_scale) + w_float = torch.cat([wq_a_f, wkv_a_f, w_pe_f], dim=0) + + w_blocks = w_float.reshape(total_rows, hidden_dim // block_size, block_size) + col_max = w_blocks.abs().amax(dim=(0, 2)) + fp8_max = torch.finfo(torch.float8_e4m3fn).max + w_scales = (col_max / fp8_max).clamp(min=1e-12) + + scales_expanded = w_scales.repeat_interleave(block_size) + w_scaled = w_float / scales_expanded.unsqueeze(0) + w_fp8 = w_scaled.to(torch.float8_e4m3fn) + + assert C.MAT_BYTES == C.SCALE_OFFSET, "Layout mismatch: scales must follow mat" + assert block_size == C.COLS_PER_PAGE // C.SCALES_PER_PAGE, "Block size mismatch" + assert w_scales.numel() == num_pages * C.SCALES_PER_PAGE, "Scale count mismatch" + + w_bytes = w_fp8.view(torch.uint8) + num_tiles = C.COLS_PER_PAGE // 32 + + mat = w_bytes.reshape(num_ctas, C.ROWS_PER_CTA, num_pages, C.COLS_PER_PAGE) + mat = mat.transpose(1, 2) + + mat = mat.reshape(num_ctas, num_pages, 2, 16, num_tiles, 32) + mat = mat.transpose(3, 4) + mat = C._swizzle_mma_16x32(mat) + mat = mat.contiguous().reshape(num_ctas, num_pages, C.MAT_BYTES) + + 
scales_f32 = w_scales.reshape(num_pages, C.SCALES_PER_PAGE).to(torch.float32).contiguous() + scales_bytes = scales_f32.view(torch.uint8) + scales_bytes = scales_bytes.unsqueeze(0).expand(num_ctas, -1, -1) + + pad_size = C.PAGE_BYTES - C.MAT_BYTES - C.SCALES_PER_PAGE * 4 + padding = torch.zeros(num_ctas, num_pages, pad_size, dtype=torch.uint8, device=w_fp8.device) + + packed = torch.cat([mat, scales_bytes, padding], dim=-1) + packed = packed.contiguous().reshape(-1) + + return packed.view(torch.float8_e4m3fn), attn_norm_weight.clone() + + +class RMSNormProjQKVAFP16MMAWeightsConverter: + """Weight converter: pack FP16 weights for the kernel.""" + + KV_LORA_RANK = 512 + QK_ROPE_HEAD_DIM = 64 + ROWS_PER_CTA = 32 + COLS_PER_PAGE = 512 + BLOCK_SIZE = 128 + + @staticmethod + def _swizzle_mma_16x16(mat_in: torch.Tensor) -> torch.Tensor: + """Swizzle [*, 16, 16] tiles into the packed weight layout.""" + assert mat_in.shape[-2] == 16 and mat_in.shape[-1] == 16 + pre_shape = mat_in.shape[:-2] + mat_in = mat_in.reshape(*pre_shape, 2, 8, 2, 4, 2).transpose(-4, -3).transpose(-5, -4) + return mat_in.reshape(*pre_shape, 2 * 2, 8 * 4, 2).transpose(-3, -2) + + @staticmethod + def convert_to_fp16_mma_gemv( + wq_a: torch.Tensor, + wq_a_scale: torch.Tensor, + wkv_a: torch.Tensor, + wkv_a_scale: torch.Tensor, + w_pe: torch.Tensor, + w_pe_scale: torch.Tensor, + attn_norm_weight: torch.Tensor, + *, + hidden_dim: int = 6144, + q_lora_rank: int = 2048, + ) -> tuple[torch.Tensor, torch.Tensor]: + """Pack weights into the FP16 layout expected by the kernel.""" + C = RMSNormProjQKVAFP16MMAWeightsConverter + kv_lora_rank = C.KV_LORA_RANK + qk_rope_head_dim = C.QK_ROPE_HEAD_DIM + cols_per_page = C.COLS_PER_PAGE + rows_per_cta = C.ROWS_PER_CTA + + total_rows = q_lora_rank + kv_lora_rank + qk_rope_head_dim + num_ctas = total_rows // rows_per_cta + num_pages = hidden_dim // cols_per_page + num_k_tiles = cols_per_page // 16 + + wq_a_f = weight_dequant(wq_a.reshape(q_lora_rank, hidden_dim), 
wq_a_scale) + wkv_a_f = weight_dequant(wkv_a.reshape(kv_lora_rank, hidden_dim), wkv_a_scale) + w_pe_f = weight_dequant(w_pe.reshape(qk_rope_head_dim, hidden_dim), w_pe_scale) + w_float = torch.cat([wq_a_f, wkv_a_f, w_pe_f], dim=0) + + w_fp16 = w_float.to(torch.float16) + + mat = w_fp16.reshape(num_ctas, rows_per_cta, num_pages, cols_per_page) + mat = mat.transpose(1, 2) + + mat = mat.reshape(num_ctas, num_pages, 2, 16, num_k_tiles, 16) + mat = mat.transpose(3, 4) + mat = C._swizzle_mma_16x16(mat) + mat = mat.contiguous() + + mat_bytes = mat.view(torch.uint8).reshape(num_ctas, num_pages, -1) + packed = mat_bytes.contiguous().reshape(-1) + + return packed.view(torch.float16), attn_norm_weight.clone() + + +class RMSNormProjxWqkvaAlgorithm(Enum): + """RMSNormProjxWqkva algorithm.""" + + DECOUPLED = "decoupled" + + +class RMSNormProjxWqkvaWeightsConverter(TilertWeightsConverter): + """Dispatch weight converter for RMSNormProjxWqkva.""" + + def __init__(self, model_args: ModelArgs, num_devices: int): + super().__init__(model_args, num_devices) + + def convert_to_fp8_mma_gemv( + self, weights: list[torch.Tensor] + ) -> tuple[torch.Tensor, torch.Tensor]: + """Convert tilert weights list to the FP8 kernel-ready format. + + Args: + weights: [gamma, wq_a, wq_a_scale, wkv_a, wkv_a_scale, w_pe, w_pe_scale] + """ + gamma, wq_a, wq_a_scale, wkv_a, wkv_a_scale, w_pe, w_pe_scale = weights + return RMSNormProjQKVAFP8MMAWeightsConverter.convert_to_fp8_mma_gemv( + wq_a, + wq_a_scale, + wkv_a, + wkv_a_scale, + w_pe, + w_pe_scale, + gamma, + hidden_dim=self.model_args.dim, + q_lora_rank=self.model_args.q_lora_rank, + ) + + def convert_to_fp16_mma_gemv( + self, weights: list[torch.Tensor] + ) -> tuple[torch.Tensor, torch.Tensor]: + """Convert tilert weights list to the FP16 kernel-ready format. 
+ + Args: + weights: [gamma, wq_a, wq_a_scale, wkv_a, wkv_a_scale, w_pe, w_pe_scale] + """ + gamma, wq_a, wq_a_scale, wkv_a, wkv_a_scale, w_pe, w_pe_scale = weights + return RMSNormProjQKVAFP16MMAWeightsConverter.convert_to_fp16_mma_gemv( + wq_a, + wq_a_scale, + wkv_a, + wkv_a_scale, + w_pe, + w_pe_scale, + gamma, + hidden_dim=self.model_args.dim, + q_lora_rank=self.model_args.q_lora_rank, + ) + + +class RMSNormProjxWqkvaRefWeightsAlias: + """Reference weight aliases for RMSNormProjxWqkva.""" + + x_rmsnorm_gamma = "input_layernorm.weight" + q_a_weights = "self_attn.q_a_proj.weight" + q_a_scales = "self_attn.q_a_proj.weight_scale_inv" + kv_a_weights = "self_attn.kv_a_proj_with_mqa.weight" + kv_a_scales = "self_attn.kv_a_proj_with_mqa.weight_scale_inv" + + @property + def ref_tensor_alias(self) -> list[str]: + return [ + self.x_rmsnorm_gamma, + self.q_a_weights, + self.q_a_scales, + self.kv_a_weights, + self.kv_a_scales, + ] + + def __call__(self) -> list[str]: + return self.ref_tensor_alias + + +class RMSNormProjxWqkvaTilertWeightsAlias: + """Tilert weight aliases for RMSNormProjxWqkva.""" + + x_rmsnorm_gamma = "x_rmsnorm_gamma" + q_a_weights = "q_a_weights" + q_a_scales = "q_a_scales" + kv_a_weights = "kv_a_weights" + kv_a_scales = "kv_a_scales" + w_pe_weights = "w_pe_weights" + w_pe_scales = "w_pe_scales" + + @property + def tilert_tensor_alias(self) -> list[str]: + return [ + self.x_rmsnorm_gamma, + self.q_a_weights, + self.q_a_scales, + self.kv_a_weights, + self.kv_a_scales, + self.w_pe_weights, + self.w_pe_scales, + ] + + def __call__(self) -> list[str]: + return self.tilert_tensor_alias + + +class RMSNormProjxWqkva(TileRTModule): + """Fused RMSNorm + GEMV(W_q_a, W_kv_a, W_pe).""" + + _SUPPORTED_ALGORITHMS = { + "deepseek_v3_2": [RMSNormProjxWqkvaAlgorithm.DECOUPLED], + "glm_5": [RMSNormProjxWqkvaAlgorithm.DECOUPLED], + } + + def __init__( + self, + model_args: ModelArgs, + num_devices: int, + device_id: int, + ref_weights_alias: 
RMSNormProjxWqkvaRefWeightsAlias | None = None, + ): + super().__init__( + self.__class__.__name__, + model_args=model_args, + num_devices=num_devices, + device_id=device_id, + ) + + self.tilert_weights_alias = RMSNormProjxWqkvaTilertWeightsAlias() + self.ref_weights_alias = ( + ref_weights_alias + if ref_weights_alias is not None + else RMSNormProjxWqkvaRefWeightsAlias() + ) + + self.dim = self.model_args.dim + self.q_lora_rank = self.model_args.q_lora_rank + self.kv_lora_rank = self.model_args.kv_lora_rank + self.qk_rope_head_dim = self.model_args.qk_rope_head_dim + self.block_size = self.model_args.block_size + self.eps = self.model_args.eps + self.algorithm = RMSNormProjxWqkvaAlgorithm.DECOUPLED + + self.ref_norm_gamma: torch.Tensor | None = None + self.ref_wq_a: torch.Tensor | None = None + self.ref_wkv_a: torch.Tensor | None = None + self.ref_w_pe: torch.Tensor | None = None + + self.tilert_norm_gamma: torch.Tensor | None = None + self.tilert_wqkva: torch.Tensor | None = None + self.tilert_wqkva_scales = torch.zeros((1, 1), dtype=torch.bfloat16) + + self.x_rmsnorm_out: torch.Tensor | None = None + self.x_rmsnorm_quant_out: torch.Tensor | None = None + self.x_rmsnorm_quant_scale_out: torch.Tensor | None = None + + self.q_out: torch.Tensor | None = None + self.kv_out: torch.Tensor | None = None + self.pe_cache_out: torch.Tensor | None = None + self.cur_pos: torch.Tensor | None = None + self.profile_logs: torch.Tensor | None = None + self.is_init = False + + self.tilert_tensor_alias: list[str] = [ + "x_rmsnorm_gamma", + "qkva_weights", + "qkva_scales", + ] + + def get_weights_list(self) -> list[torch.Tensor]: + return [self.tilert_norm_gamma, self.tilert_wqkva, self.tilert_wqkva_scales] + + def device_sharding(self, weights_map: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]: + """Repeat weights for device sharding.""" + input_layernorm_weight = ( + weights_map[self.ref_weights_alias.x_rmsnorm_gamma][None, ...] 
+ .float() + .repeat(self.num_devices, 1) + ) + q_a_proj_weight = weights_map[self.ref_weights_alias.q_a_weights][None, ...].repeat( + self.num_devices, 1, 1 + ) + q_a_proj_weight_scale = weights_map[self.ref_weights_alias.q_a_scales][None, ...].repeat( + self.num_devices, 1, 1 + ) + kv_a_mqa = weights_map[self.ref_weights_alias.kv_a_weights] + kv_a_proj_weight = kv_a_mqa[: self.kv_lora_rank, :][None, ...].repeat( + self.num_devices, 1, 1 + ) + w_pe_weight = kv_a_mqa[self.kv_lora_rank :, :][None, ...].repeat(self.num_devices, 1, 1) + kv_a_mqa_scale = weights_map[self.ref_weights_alias.kv_a_scales] + kv_scale_rows = (self.kv_lora_rank + self.block_size - 1) // self.block_size + kv_a_proj_weight_scale = kv_a_mqa_scale[:kv_scale_rows, :][None, ...].repeat( + self.num_devices, 1, 1 + ) + w_pe_weight_scale = kv_a_mqa_scale[kv_scale_rows:, :][None, ...].repeat( + self.num_devices, 1, 1 + ) + return { + self.tilert_weights_alias.x_rmsnorm_gamma: input_layernorm_weight, + self.tilert_weights_alias.q_a_weights: q_a_proj_weight, + self.tilert_weights_alias.q_a_scales: q_a_proj_weight_scale, + self.tilert_weights_alias.kv_a_weights: kv_a_proj_weight, + self.tilert_weights_alias.kv_a_scales: kv_a_proj_weight_scale, + self.tilert_weights_alias.w_pe_weights: w_pe_weight, + self.tilert_weights_alias.w_pe_scales: w_pe_weight_scale, + } + + def init_reference_weights(self, state_dict: dict[str, torch.Tensor]) -> None: + aliases = self.ref_weights_alias() + self.ref_norm_gamma = state_dict[aliases[0]] + self.ref_wq_a = weight_dequant(state_dict[aliases[1]], state_dict[aliases[2]]) + kv_a_mqa = weight_dequant(state_dict[aliases[3]], state_dict[aliases[4]]) + self.ref_wkv_a = kv_a_mqa[: self.kv_lora_rank, :] + self.ref_w_pe = kv_a_mqa[self.kv_lora_rank :, :] + + assert self.ref_norm_gamma.shape[-1] == self.dim + assert self.ref_wq_a.shape == (self.q_lora_rank, self.dim) + assert self.ref_wkv_a.shape == (self.kv_lora_rank, self.dim) + assert self.ref_w_pe.shape == 
(self.qk_rope_head_dim, self.dim) + + def init_tilert_weights(self, state_dict: dict[str, torch.Tensor]) -> None: + tilert_aliases = self.tilert_weights_alias() + weights_list = [state_dict[alias] for alias in tilert_aliases] + converter = RMSNormProjxWqkvaWeightsConverter(self.model_args, self.num_devices) + self.tilert_wqkva, self.tilert_norm_gamma = converter.convert_to_fp8_mma_gemv(weights_list) + self.tilert_wqkva_scales = torch.zeros((1,), dtype=torch.float32) + + def init_tilert_vars(self, batch_size: int, seq_len: int, max_len: int = 128) -> None: + self.q_out = torch.zeros((batch_size, seq_len, self.q_lora_rank), dtype=torch.bfloat16) + self.kv_out = torch.zeros((batch_size, seq_len, self.kv_lora_rank), dtype=torch.bfloat16) + self.pe_cache_out = torch.zeros( + (batch_size, max_len, self.qk_rope_head_dim), dtype=torch.bfloat16 + ) + self.cur_pos = torch.zeros((1,), dtype=torch.int32) + self.x_rmsnorm_out = torch.zeros((batch_size, seq_len, self.dim), dtype=torch.bfloat16) + self.x_rmsnorm_quant_out = torch.zeros( + (batch_size, seq_len, self.dim), dtype=torch.float8_e4m3fn + ) + self.x_rmsnorm_quant_scale_out = torch.zeros( + (batch_size, seq_len, self.dim // self.block_size), dtype=torch.float32 + ) + self.profile_logs = get_profile_log_tensor() + self.is_init = True + + def init_random_weights(self) -> None: + bs = self.block_size + dim_scale_dim = self.dim // bs + q_scale_dim = (self.q_lora_rank + bs - 1) // bs + kv_mqa_rows = self.kv_lora_rank + self.qk_rope_head_dim + kv_mqa_scale_dim = (kv_mqa_rows + bs - 1) // bs + scale_dtype = torch.bfloat16 + + tensor_list = [ + torch.randn(self.dim, dtype=torch.float32), + torch.randn(self.q_lora_rank, self.dim, dtype=torch.bfloat16).to(torch.float8_e4m3fn), + torch.randn(q_scale_dim, dim_scale_dim, dtype=scale_dtype), + torch.randn(kv_mqa_rows, self.dim, dtype=torch.bfloat16).to(torch.float8_e4m3fn), + torch.randn(kv_mqa_scale_dim, dim_scale_dim, dtype=scale_dtype), + ] + ref_state_dict = 
dict(zip(self.ref_weights_alias(), tensor_list)) + self.init_reference_weights(ref_state_dict) + self.init_tilert_weights( + {_k: _v[self.device_id] for _k, _v in self.device_sharding(ref_state_dict).items()} + ) + + def golden_forward( + self, + x: torch.Tensor, + cur_pos: int = 0, # noqa: U100 + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Pure PyTorch reference: RMSNorm -> q, kv, pe.""" + assert self.ref_norm_gamma is not None + assert self.ref_wq_a is not None + assert self.ref_wkv_a is not None + assert self.ref_w_pe is not None + + x_rmsnorm = torch.nn.functional.rms_norm( + x.float(), [x.size(-1)], self.ref_norm_gamma, self.eps + ) + q_out = torch.matmul(x_rmsnorm.float(), self.ref_wq_a.transpose(0, 1).float()) + kv_out = torch.matmul(x_rmsnorm.float(), self.ref_wkv_a.transpose(0, 1).float()) + pe_out = torch.matmul(x_rmsnorm.float(), self.ref_w_pe.transpose(0, 1).float()) + return ( + q_out.to(torch.bfloat16), + kv_out.to(torch.bfloat16), + pe_out.to(torch.bfloat16), + ) + + def tilert_forward( + self, + x: torch.Tensor, + cur_pos: int = 0, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Run RMSNorm + 3-way GEMV via the TileRT CUDA kernels.""" + assert self.cur_pos is not None + assert self.pe_cache_out is not None + self.cur_pos.fill_(cur_pos) + + from tilert.models.glm_5._dsa_v32.ops.projx_wqkva import projx_wqkva as _projx_wqkva + from tilert.models.glm_5._dsa_v32.ops.rmsnorm_quant import rmsnorm_quant as _rmsnorm_quant + + _rmsnorm_quant( + x.to(torch.bfloat16), + self.tilert_norm_gamma, + self.x_rmsnorm_out, + self.x_rmsnorm_quant_out, + self.x_rmsnorm_quant_scale_out, + self.profile_logs, + model_arch=self.model_args.arch_name, + ) + _projx_wqkva( + self.x_rmsnorm_quant_out, + self.x_rmsnorm_quant_scale_out, + self.tilert_wqkva, + self.cur_pos, + self.q_out, + self.kv_out, + self.pe_cache_out, + self.profile_logs, + model_arch=self.model_args.arch_name, + ) + + seq_len = x.size(-2) + pe_at_pos = self.pe_cache_out[:, cur_pos 
: cur_pos + seq_len, :] + return self.q_out, self.kv_out, pe_at_pos + + def __call__( + self, + x: torch.Tensor, + cur_pos: int = 0, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + return self.golden_forward(x, cur_pos) diff --git a/tilert/models/glm_5/_dsa_v32/ops/rmsnorm_quant.py b/tilert/models/glm_5/_dsa_v32/ops/rmsnorm_quant.py new file mode 100644 index 0000000..1d399c5 --- /dev/null +++ b/tilert/models/glm_5/_dsa_v32/ops/rmsnorm_quant.py @@ -0,0 +1,64 @@ +"""RMSNormQuant operation module.""" + +from __future__ import annotations + +import torch + +__all__ = [ + "BLOCK_SIZE", + "DIM_DEEPSEEK_V3_2", + "DIM_GLM_5", + "rmsnorm_quant", +] + +BLOCK_SIZE = 128 +DIM_DEEPSEEK_V3_2 = 7168 +DIM_GLM_5 = 6144 + + +def rmsnorm_quant( + hidden_in: torch.Tensor, + gamma_in: torch.Tensor, + hidden_out: torch.Tensor, + quant_hidden_out: torch.Tensor | None = None, + quant_hidden_scale_out: torch.Tensor | None = None, + profile_logs: torch.Tensor | None = None, + compute_kernel_type: str = "general", + *, + model_arch: str, +) -> None: + """ + Rmsnorm with optional activation quantization. + + Args: + hidden_in: Input tensor (..., dim). + gamma_in: RMSNorm gamma (dim,). + hidden_out: RMSNorm output (..., dim). + quant_hidden_out: Optional quantized output (..., dim). If None, no quant. + quant_hidden_scale_out: Optional quant scale (..., dim // block_size). If None, no quant. + profile_logs: Optional profile logs tensor. 
+ """ + if profile_logs is None: + raise ValueError("profile_logs is required when calling rmsnorm_quant.") + + if quant_hidden_out is None or quant_hidden_scale_out is None: + torch.ops.tilert.rmsnorm_op( + hidden_in, + gamma_in, + hidden_out, + model_arch, + compute_kernel_type, + profile_logs, + ) + else: + torch.ops.tilert.rmsnorm_quant_op( + hidden_in, + gamma_in, + hidden_out, + quant_hidden_out, + quant_hidden_scale_out, + model_arch, + compute_kernel_type, + profile_logs, + torch.empty(0, dtype=torch.int64, device=hidden_in.device), + ) diff --git a/tilert/models/glm_5/_dsa_v32/ops/rmsnorm_up_gate_silu.py b/tilert/models/glm_5/_dsa_v32/ops/rmsnorm_up_gate_silu.py new file mode 100644 index 0000000..25adae9 --- /dev/null +++ b/tilert/models/glm_5/_dsa_v32/ops/rmsnorm_up_gate_silu.py @@ -0,0 +1,363 @@ +"""RMSNormUpGateSiLU operation module.""" + +from dataclasses import dataclass +from enum import Enum + +import torch +import torch.nn.functional as F + +from tilert.models.base import TileRTModule +from tilert.models.common import weight_dequant +from tilert.models.glm_5._dsa_v32.model_args import ModelArgs +from tilert.models.glm_5._dsa_v32.ops.expert_sel_up_gate_silu import ( + ExpertSelectUpGateSiLU, + ExpertSelectUpGateSiLUWeightsConverter, +) +from tilert.utils import get_profile_log_tensor + +__all__ = [ + "RMSNormUpGateSiLUAlgorithm", + "RMSNormUpGateSiLU", + "RMSNormUpGateSiLUTilertWeightsAlias", + "rmsnorm_up_gate_silu", +] + + +def rmsnorm_up_gate_silu( + hidden_in: torch.Tensor, + gamma_in: torch.Tensor, + weights_in: torch.Tensor, + hidden_out: torch.Tensor, + profile_logs: torch.Tensor, + model_arch: str, + compute_kernel_type: str = "fp8mma", +) -> None: + """rmsnorm_up_gate_silu operation.""" + torch.ops.tilert.rmsnorm_up_gate_silu_op( + hidden_in, + gamma_in, + weights_in, + hidden_out, + model_arch, + compute_kernel_type, + profile_logs, + ) + + +class RMSNormUpGateSiLUAlgorithm(Enum): + """RMSNormUpGateSiLU algorithm""" + + FP8MMA = "fp8mma" 
+ FP16MMA = "fp16mma" + + +RMSNormUpGateSiLUWeightsConverter = ExpertSelectUpGateSiLUWeightsConverter +ExpertSelectUpGateSiLUW = ExpertSelectUpGateSiLUWeightsConverter + + +@dataclass +class RMSNormUpGateSiLUTilertWeightsAlias: + """TileRT weights alias for RMSNormUpGateSiLU.""" + + unproj_o_gamma = "unproj_o_gamma" + gate_weights = "gate_weights" + gate_scales = "gate_scales" + up_weights = "up_weights" + up_scales = "up_scales" + + @property + def tilert_tensor_alias(self) -> list[str]: + return [ + self.unproj_o_gamma, + self.gate_weights, + self.gate_scales, + self.up_weights, + self.up_scales, + ] + + def __call__(self) -> list[str]: + return self.tilert_tensor_alias + + +class RMSNormUpGateSiLU(TileRTModule): + """RMSNormUpGateSiLU module""" + + _SUPPORTED_ALGORITHMS = { + "deepseek_v3_2": [RMSNormUpGateSiLUAlgorithm.FP8MMA, RMSNormUpGateSiLUAlgorithm.FP16MMA], + "glm_5": [RMSNormUpGateSiLUAlgorithm.FP8MMA, RMSNormUpGateSiLUAlgorithm.FP16MMA], + } + + def __init__( + self, + model_args: ModelArgs, + device_id: int, + num_devices: int, + algorithm: RMSNormUpGateSiLUAlgorithm = RMSNormUpGateSiLUAlgorithm.FP8MMA, + ): + super().__init__( + self.__class__.__name__, + model_args=model_args, + device_id=device_id, + num_devices=num_devices, + ) + + self.arch_name = self.model_args.arch_name + self.dim = self.model_args.dim + + self.inter_dim = self.model_args.inter_dim + self.moe_inter_dim = self.model_args.moe_inter_dim + self.moe_inter_dim_per_device = self.moe_inter_dim // self.num_devices + self.inter_dim_per_device = self.inter_dim // self.num_devices + self.n_experts = self.inter_dim_per_device // self.moe_inter_dim_per_device + self.eps = self.model_args.eps + + self.block_size = self.model_args.block_size + self.algorithm = algorithm + + self.ref_norm_gamma: torch.Tensor | None = None + self.ref_gate: torch.Tensor | None = None + self.ref_up: torch.Tensor | None = None + + self.tilert_norm_gamma: torch.Tensor | None = None + self.tilert_weights: 
torch.Tensor | None = None + self.tilert_scales = torch.zeros( + 9, 4, 64, dtype=torch.bfloat16, device=torch.device("cuda") + ) + + self.hidden_out: torch.Tensor | None = None + + self.profile_logs: torch.Tensor | None = None + self.is_init = False + + self.rmsnorm_up_gate_silu_func = rmsnorm_up_gate_silu + + self.tilert_weights_alias = RMSNormUpGateSiLUTilertWeightsAlias() + + self.ref_tensor_alias: list[str] = [ + "post_attention_layernorm.weight", + "mlp.gate_proj.weight", + "mlp.gate_proj.weight_scale_inv", + "mlp.up_proj.weight", + "mlp.up_proj.weight_scale_inv", + ] + + @property + def tilert_tensor_alias(self) -> list[str]: + return self.tilert_weights_alias() + + def get_weights_list(self) -> list[torch.Tensor]: + """ + Get the weights list. + + Returns: + List of weights. + """ + return [self.tilert_norm_gamma, self.tilert_weights, self.tilert_scales] + + def device_sharding( + self, + weights_dict: dict[str, torch.Tensor], + key_prefix: str, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Device sharding. + + Args: + weights_dict: Dictionary of weights. + + Returns: + Tuple of weights. 
+ """ + rmsnorm_gamma_key = f"{key_prefix}.post_attention_layernorm.weight" + if ".mlp" in key_prefix: + key_prefix_without_mlp = key_prefix.replace(".mlp", "") + rmsnorm_gamma_key = f"{key_prefix_without_mlp}.post_attention_layernorm.weight" + elif key_prefix == "mlp": + rmsnorm_gamma_key = "post_attention_layernorm.weight" + rmsnorm_gamma = weights_dict[rmsnorm_gamma_key] + rmsnorm_gamma = rmsnorm_gamma[None, :].repeat(self.num_devices, 1) + + gate_weights, gate_scales, up_weights, up_scales = ( + ExpertSelectUpGateSiLU.process_gate_up_weights( + key_prefix, + weights_dict, + self.num_devices, + ) + ) + gate_weights = gate_weights.reshape(self.n_experts, self.num_devices, -1, self.dim) + gate_weights = gate_weights.transpose(0, 1) + gate_scales = gate_scales.reshape( + self.n_experts, self.num_devices, -1, self.dim // self.block_size + ) + gate_scales = gate_scales.transpose(0, 1) + up_weights = up_weights.reshape(self.n_experts, self.num_devices, -1, self.dim) + up_weights = up_weights.transpose(0, 1) + up_scales = up_scales.reshape( + self.n_experts, self.num_devices, -1, self.dim // self.block_size + ) + up_scales = up_scales.transpose(0, 1) + return ( + rmsnorm_gamma.contiguous(), + gate_weights.contiguous(), + gate_scales.contiguous(), + up_weights.contiguous(), + up_scales.contiguous(), + ) + + def init_reference_weights( + self, + state_dict: dict[str, torch.Tensor], + key_prefix: str, + device_id: int = 0, + ) -> None: + """ + Initialize the reference weights. + + Args: + state_dict: State dictionary. + device_id: Device ID. 
+ """ + sharded_list = self.device_sharding(state_dict, key_prefix) + + gamma = sharded_list[0][device_id] + gate_weights = sharded_list[1][device_id] + gate_scales = sharded_list[2][device_id] + up_weights = sharded_list[3][device_id] + up_scales = sharded_list[4][device_id] + self.ref_norm_gamma = gamma + ref_gate_list = [ + weight_dequant(gate_weights, gate_scales) + for gate_weights, gate_scales in zip(gate_weights, gate_scales) + ] + ref_up_list = [ + weight_dequant(up_weights, up_scales) + for up_weights, up_scales in zip(up_weights, up_scales) + ] + self.ref_gate = torch.stack(ref_gate_list, dim=0) + self.ref_up = torch.stack(ref_up_list, dim=0) + + def init_tilert_weights(self, state_dict: dict[str, torch.Tensor]) -> None: + """ + Initialize the tilert weights. + + Args: + state_dict: State dictionary. + """ + assert self.algorithm is not None, "Algorithm is not set" + self.tilert_norm_gamma, self.tilert_weights = RMSNormUpGateSiLUWeightsConverter( + self.model_args, self.num_devices + ).dispatch(self.algorithm, [state_dict[alias] for alias in self.tilert_weights_alias()]) + + def init_tilert_vars(self, batch_size: int, seq_len: int, dev_id: int = 0) -> None: + """ + Initialize the tilert variables. + + Args: + batch_size: Batch size. + seq_len: Sequence length. + """ + self.hidden_out = torch.zeros( + ( + batch_size, + seq_len, + self.n_experts, + self.moe_inter_dim_per_device, + ), + dtype=torch.bfloat16, + device=f"cuda:{dev_id}", + ) + + self.profile_logs = get_profile_log_tensor(device=f"cuda:{dev_id}") + self.is_init = True + + def init_random_weights(self, dev_id: int = 0) -> None: + """ + Initialize the random weights. 
+ + Returns: + None + """ + gamma = torch.randn(self.dim, dtype=torch.float32, device=f"cuda:{dev_id}") + gate_weights = torch.randn( + self.inter_dim, self.dim, dtype=torch.bfloat16, device=f"cuda:{dev_id}" + ).to(torch.float8_e4m3fn) + up_weights = torch.randn( + self.inter_dim, self.dim, dtype=torch.bfloat16, device=f"cuda:{dev_id}" + ).to(torch.float8_e4m3fn) + inter_dim_scale_dim = self.inter_dim // self.block_size + dim_scale_dim = self.dim // self.block_size + scale_dtype = torch.float32 if self.arch_name == "glm_5" else torch.bfloat16 + gate_scales = torch.randn( + inter_dim_scale_dim, dim_scale_dim, dtype=scale_dtype, device=f"cuda:{dev_id}" + ) + up_scales = torch.randn( + inter_dim_scale_dim, dim_scale_dim, dtype=scale_dtype, device=f"cuda:{dev_id}" + ) + tensor_list = [ + gamma, + gate_weights, + gate_scales, + up_weights, + up_scales, + ] + state_dict = dict(zip(self.ref_tensor_alias, tensor_list)) + self.init_reference_weights(state_dict, "mlp", dev_id) + sharded_list = self.device_sharding(state_dict, "mlp") + sharded_state_dict = { + alias: sharded_list[i][dev_id] for i, alias in enumerate(self.tilert_weights_alias()) + } + self.init_tilert_weights(sharded_state_dict) + + def golden_forward( + self, + x_in: torch.Tensor, + ) -> torch.Tensor: + assert self.ref_gate is not None + assert self.ref_up is not None + bsz = x_in.shape[0] + seq_len = x_in.shape[1] + assert bsz == 1 + x_in_rmsnorm = torch.nn.functional.rms_norm( + x_in.float(), [x_in.size(-1)], self.ref_norm_gamma, self.eps + ) + hidden_out_list = [] + for s in range(seq_len): + hidden_out_w1_list = [] + hidden_out_w3_list = [] + + for i in range(self.n_experts): + hidden_out_w1_sel = x_in_rmsnorm[0, s].float() @ self.ref_gate[i].float().T + hidden_out_w3_sel = x_in_rmsnorm[0, s].float() @ self.ref_up[i].float().T + hidden_out_w1_list.append(hidden_out_w1_sel) + hidden_out_w3_list.append(hidden_out_w3_sel) + hidden_out_w1 = torch.stack(hidden_out_w1_list, dim=0) + hidden_out_w3 = 
torch.stack(hidden_out_w3_list, dim=0) + hidden_out = F.silu(hidden_out_w1.float()) * hidden_out_w3.float() + hidden_out = hidden_out.to(torch.bfloat16) + hidden_out_list.append(hidden_out) + hidden_out = torch.stack(hidden_out_list, dim=0) + hidden_out = hidden_out[None, ...] + return hidden_out + + def tilert_forward( + self, + x_in: torch.Tensor, + ) -> torch.Tensor: + assert self.rmsnorm_up_gate_silu_func is not None + assert self.algorithm is not None, "Algorithm is not set" + self.rmsnorm_up_gate_silu_func( + x_in, + self.tilert_norm_gamma, + self.tilert_weights, + self.hidden_out, + self.profile_logs, + model_arch=self.model_args.arch_name, + compute_kernel_type=self.algorithm.value, + ) + return self.hidden_out + + def __call__( + self, + x_in: torch.Tensor, + ) -> torch.Tensor: + return self.golden_forward(x_in) diff --git a/python/models/deepseek_v3_2/ops/rotate.py b/tilert/models/glm_5/_dsa_v32/ops/rotate.py similarity index 79% rename from python/models/deepseek_v3_2/ops/rotate.py rename to tilert/models/glm_5/_dsa_v32/ops/rotate.py index 539f334..10a46f1 100644 --- a/python/models/deepseek_v3_2/ops/rotate.py +++ b/tilert/models/glm_5/_dsa_v32/ops/rotate.py @@ -1,12 +1,14 @@ +"""Rotate(hadamard transform) operation module.""" + from dataclasses import dataclass +from enum import Enum import torch import torch.nn.functional as F from tilert.models.base import TileRTModule -from tilert.models.deepseek_v3_2.model_args import ModelArgs +from tilert.models.glm_5._dsa_v32.model_args import ModelArgs from tilert.models.utils import apply_rotary_emb -from tilert.profiler.utils import parse_profile_log_tensor from tilert.utils import get_profile_log_tensor try: @@ -61,45 +63,31 @@ def rotate( output_raw: torch.Tensor, freqs_cis_raw: torch.Tensor, profile_logs: torch.Tensor, + model_arch: str, + compute_kernel_type: str = "general", ) -> None: """ Rotate (hadamard transform) operation. - Unified for deepseek_v3_2 (64 heads) and glm_5 (32 heads). 
Dispatches by - input_raw.shape[-2]: 64 -> rotate_op, 32 -> rotate_glm5_op. - Args: input_raw (torch.Tensor): The input tensor [..., head, 128]. output_raw (torch.Tensor): The output tensor where the result will be stored. freqs_cis_raw (torch.Tensor): The frequency tensor. profile_logs (torch.Tensor): Tensor for storing profiling logs. + model_arch: Model architecture string. + compute_kernel_type: Compute kernel type string. Returns: None """ - if input_raw.dtype != torch.bfloat16: - raise ValueError("input must be a bfloat16 tensor.") - - if output_raw.dtype != torch.bfloat16: - raise ValueError("output must be a bfloat16 tensor.") - - if freqs_cis_raw.dtype != torch.float32: - raise ValueError("freqs_cis must be a float32 tensor.") - - head = input_raw.shape[-2] - dim = input_raw.shape[-1] - if dim != 128: - raise ValueError("dim must be 128, as we precompute scale inner kernel") - - if head == 64: - torch.ops.tilert.rotate_op(input_raw, output_raw, freqs_cis_raw, profile_logs) - elif head == 32: - torch.ops.tilert.rotate_glm5_op(input_raw, output_raw, freqs_cis_raw, profile_logs) - else: - raise ValueError( - f"Unsupported head size: {head}. Rotate op supports " - "index_n_heads=64 (deepseek_v3_2) or 32 (glm_5)." - ) + torch.ops.tilert.rotate_op( + input_raw, + output_raw, + freqs_cis_raw, + model_arch, + compute_kernel_type, + profile_logs, + ) @dataclass @@ -126,6 +114,12 @@ def __call__(self) -> list[str]: return self.tilert_tensor_alias +class RotateAlgorithm(Enum): + """Rotate algorithm.""" + + GENERAL = "general" + + class Rotate(TileRTModule): """Rotate module: RoPE on first qk_rope_head_dim dims + hadamard transform. @@ -133,6 +127,11 @@ class Rotate(TileRTModule): No weights; uses model_args for dimensions. 
""" + _SUPPORTED_ALGORITHMS = { + "deepseek_v3_2": [RotateAlgorithm.GENERAL], + "glm_5": [RotateAlgorithm.GENERAL], + } + def __init__( self, model_args: ModelArgs, @@ -202,9 +201,11 @@ def tilert_forward(self, idx_q: torch.Tensor, freqs_cis: torch.Tensor) -> torch. assert self.output is not None assert self.profile_logs is not None freqs_cis_real = torch.view_as_real(freqs_cis).reshape(*freqs_cis.shape[:-1], -1) - rotate(idx_q, self.output, freqs_cis_real, self.profile_logs) - if self.flag_enable_profiling_log: - parse_profile_log_tensor( - self.profile_logs, self.get_profile_log_path(), [(self.op_name, 0.0)] - ) + rotate( + idx_q, + self.output, + freqs_cis_real, + self.profile_logs, + model_arch=self.model_args.arch_name, + ) return self.output diff --git a/tilert/models/glm_5/_dsa_v32/ops/sparse_index.py b/tilert/models/glm_5/_dsa_v32/ops/sparse_index.py new file mode 100644 index 0000000..ca69c49 --- /dev/null +++ b/tilert/models/glm_5/_dsa_v32/ops/sparse_index.py @@ -0,0 +1,135 @@ +"""Sparse index operation module.""" + +import torch + +__all__ = [ + "sparse_index", + "sparse_index_topk", +] + + +def sparse_index( + q: torch.Tensor, # noqa: VNE001 + kv: torch.Tensor, + weights: torch.Tensor, + logits: torch.Tensor, + cur_pos: int, + profile_logs: torch.Tensor, + compute_kernel_type: str = "bf16", + *, + model_arch: str, +) -> None: + """ + Sparse index operation. + + Calculate sparse index using q * kv * weights. + + Args: + q (torch.Tensor): The query tensor. + kv (torch.Tensor): The key-value tensor. + weights (torch.Tensor): The weights tensor. + logits (torch.Tensor): The logits tensor. + cur_pos (int): The position of the first token. + profile_logs (torch.Tensor): Tensor for storing profiling logs. + compute_kernel_type (str): Kernel type ("bf16"). + model_arch (str): Model architecture ("deepseek_v3_2"). 
+ + Returns: + None + """ + if q.dtype != torch.bfloat16: + raise ValueError("input must be a bfloat16 tensor.") + if kv.dtype != torch.bfloat16: + raise ValueError("kv must be a bfloat16 tensor.") + if weights.dtype != torch.bfloat16: + raise ValueError("weights must be a bfloat16 tensor.") + if logits.dtype != torch.float32: + raise ValueError("logits must be a float32 tensor.") + + head = q.shape[-2] + dim = q.shape[-1] + + if head != 64 and head != 32: + raise ValueError( + f"Unsupported head size: {head}. Sparse index op currently only \ + supports a head number of 64 or 32." + ) + if dim != 128: + raise ValueError("dim must be 128, as we precompute scale inner kernel") + + device = q.device + if any(t.device != device for t in (kv, weights, logits, profile_logs)): + raise ValueError( + "sparse_index inputs must be on the same device: " + f"q={device}, kv={kv.device}, weights={weights.device}, " + f"logits={logits.device}, profile_logs={profile_logs.device}" + ) + if model_arch == "deepseek_v3_2" and head == 32: + model_arch = "glm_5" + torch.ops.tilert.sparse_index_op( + q, kv, weights, logits, cur_pos, model_arch, compute_kernel_type, profile_logs + ) + + +def sparse_index_topk( + q: torch.Tensor, # noqa: VNE001 + kv: torch.Tensor, + weights: torch.Tensor, + logits: torch.Tensor, + indices: torch.Tensor, + cur_pos: int, + profile_logs: torch.Tensor, +) -> None: + """ + Sparse index operation. + + Calculate sparse index using q * kv * weights. + + Args: + q (torch.Tensor): The query tensor. + kv (torch.Tensor): The key-value tensor. + weights (torch.Tensor): The weights tensor. + logits (torch.Tensor): The logits tensor. + cur_pos (int): The position of the first token. + profile_logs (torch.Tensor): Tensor for storing profiling logs. 
+ + Returns: + None + """ + if q.dtype != torch.bfloat16: + raise ValueError("input must be a bfloat16 tensor.") + if kv.dtype != torch.bfloat16: + raise ValueError("kv must be a bfloat16 tensor.") + if weights.dtype != torch.bfloat16: + raise ValueError("weights must be a bfloat16 tensor.") + if logits.dtype != torch.float32: + raise ValueError("logits must be a float32 tensor.") + + seqlen = q.shape[-3] + head = q.shape[-2] + dim = q.shape[-1] + + if head not in (32, 64): + raise ValueError( + f"Unsupported head size: {head}. Sparse index topk fused op " + "supports head number of 32 (GLM5) or 64 (DSV3.2)." + ) + if dim != 128: + raise ValueError("dim must be 128, as we precompute scale inner kernel") + + device = q.device + if any(t.device != device for t in (kv, weights, logits, indices, profile_logs)): + raise ValueError( + "sparse_index inputs must be on the same device: " + f"q={device}, kv={kv.device}, weights={weights.device}, " + f"logits={logits.device}, profile_logs={profile_logs.device}" + ) + workspace = torch.zeros(seqlen, (200 * 1024 + 260), dtype=torch.int32, device=device) + if head == 64: + torch.ops.tilert.sparse_index_topk_dsv32_op( + q, kv, weights, logits, cur_pos, indices, workspace, profile_logs + ) + else: + torch.ops.tilert.sparse_index_topk_glm5_op( + q, kv, weights, logits, cur_pos, indices, workspace, profile_logs + ) diff --git a/python/models/deepseek_v3_2/ops/topk.py b/tilert/models/glm_5/_dsa_v32/ops/topk.py similarity index 75% rename from python/models/deepseek_v3_2/ops/topk.py rename to tilert/models/glm_5/_dsa_v32/ops/topk.py index bb41575..bb9dfbb 100644 --- a/python/models/deepseek_v3_2/ops/topk.py +++ b/tilert/models/glm_5/_dsa_v32/ops/topk.py @@ -1,10 +1,18 @@ """topk operations module.""" +from __future__ import annotations + +from typing import TYPE_CHECKING + import torch import torch.nn as nn from tilert.utils import get_profile_log_tensor +if TYPE_CHECKING: + from tilert.models.glm_5._dsa_v32.model_args import 
ModelArgs + + __all__ = [ "TopK", "topk_approximate", @@ -17,6 +25,8 @@ def topk_approximate( seq_len: int, topk: int, profile_logs: torch.Tensor, + model_arch: str, + compute_kernel_type: str = "general", ) -> torch.Tensor: """ Topk approximate operation. @@ -42,7 +52,9 @@ def topk_approximate( raise ValueError("batch must be 1 in this version") indices = torch.zeros(batch, topk, dtype=torch.int32, device=logits.device) - torch.ops.tilert.topk_approximate_op(logits, indices, seq_len, profile_logs) + torch.ops.tilert.topk_approximate_op( + logits, indices, seq_len, model_arch, compute_kernel_type, profile_logs + ) return indices @@ -52,6 +64,8 @@ def topk_accurate( seq_len: int, topk: int, profile_logs: torch.Tensor, + model_arch: str, + compute_kernel_type: str = "general", ) -> torch.Tensor: """ Topk approximate operation. @@ -71,8 +85,8 @@ def topk_accurate( if logits.dtype != torch.float32: raise ValueError("logits must be a float32 tensor.") - if topk != 2048: - raise ValueError("topk must be 2048.") + if topk not in (512, 2048): + raise ValueError("topk must be 512 or 2048.") assert logits.shape[0] == 1, "batch must be 1 in this version" num_samples = logits.shape[1] @@ -80,7 +94,13 @@ def topk_accurate( indices = torch.zeros(num_samples, topk, dtype=torch.int32, device=logits.device) indices_ws = torch.zeros(1, num_samples, 4, topk * 2, dtype=torch.int32, device=logits.device) torch.ops.tilert.topk_accurate_op( - logits, indices, seq_len - num_samples, indices_ws, profile_logs + logits, + indices, + seq_len - num_samples, + indices_ws, + model_arch, + compute_kernel_type, + profile_logs, ) return indices @@ -93,9 +113,14 @@ class TopK(nn.Module): (reference implementation) and tilert_forward (TileRT kernel). 
""" - def __init__(self, use_approximate: bool = False) -> None: + def __init__(self, use_approximate: bool = False, model_args: ModelArgs | None = None) -> None: super().__init__() self.use_approximate = use_approximate + if model_args is None: + from tilert.models.glm_5._dsa_v32.model_args import ModelArgs + + model_args = ModelArgs() + self.model_args = model_args def golden_forward( self, @@ -131,9 +156,13 @@ def tilert_forward( profile_logs = get_profile_log_tensor(device=logits.device) cache_len = logits.shape[-1] if self.use_approximate: - indices = topk_approximate(logits, cache_len, topk, profile_logs) + indices = topk_approximate( + logits, cache_len, topk, profile_logs, model_arch=self.model_args.arch_name + ) else: - indices = topk_accurate(logits, cache_len, topk, profile_logs) + indices = topk_accurate( + logits, cache_len, topk, profile_logs, model_arch=self.model_args.arch_name + ) if indices.dim() == 2: return indices.unsqueeze(0) return indices diff --git a/python/models/deepseek_v3_2/ops/unproj_o_allreduce.py b/tilert/models/glm_5/_dsa_v32/ops/unproj_o_allreduce.py similarity index 56% rename from python/models/deepseek_v3_2/ops/unproj_o_allreduce.py rename to tilert/models/glm_5/_dsa_v32/ops/unproj_o_allreduce.py index 50b413f..257acf5 100644 --- a/python/models/deepseek_v3_2/ops/unproj_o_allreduce.py +++ b/tilert/models/glm_5/_dsa_v32/ops/unproj_o_allreduce.py @@ -1,5 +1,6 @@ """UnprojOAllreduce operation module.""" +import math from dataclasses import dataclass from enum import Enum @@ -7,13 +8,13 @@ from tilert.models.base import TileRTModule, TilertWeightsConverter from tilert.models.common import weight_dequant -from tilert.models.deepseek_v3_2.model_args import ModelArgs -from tilert.profiler.utils import parse_profile_log_tensor +from tilert.models.glm_5._dsa_v32.model_args import ModelArgs from tilert.utils import get_profile_log_tensor __all__ = [ "unproj_o_allreduce", "UnProjOAllReduce", + "UnProjOAllReduceAlgorithm", 
"UnProjOAllReduceRefWeightsAlias", "UnProjOAllReduceTilertWeightsAlias", ] @@ -27,7 +28,8 @@ def unproj_o_allreduce( flag: int, vec_out: torch.Tensor, profile_logs: torch.Tensor, - algorithm: str = "fp8mma", + model_arch: str, + compute_kernel_type: str = "bf16", ) -> None: """ Fused operation of unprojection and allreduce. @@ -39,29 +41,26 @@ def unproj_o_allreduce( x_in: Input tensor. flag: Input flag. vec_out: Output tensor. - profile_logs: Profile logs tensor. This is a 1D tensor of shape - (num_sms,) to store the profile logs of the unproj_o_allreduce - operation, where num_sms is the number of SMs on the - device. + profile_logs: Profile logs tensor. + model_arch: Model architecture ("deepseek_v3_2" or "glm_5"). + compute_kernel_type: Compute kernel type ("bf16", "fp16mma"). """ - if vec_out.shape[-1] == 7168: - assert algorithm == "fp8mma", "Only fp8mma is supported for deepseek_v3_2" - torch.ops.tilert.unproj_o_allreduce_op( - vec_in, mat_in, mat_scale, x_in, flag, vec_out, profile_logs - ) - - elif vec_out.shape[-1] == 6144: - torch.ops.tilert.unproj_o_allreduce_glm5_op( - vec_in, mat_in, mat_scale, x_in, flag, vec_out, profile_logs, algorithm - ) - else: - raise ValueError(f"Unsupported vector dimension: {vec_out.shape[-1]}") + torch.ops.tilert.unproj_o_allreduce_op( + vec_in, + mat_in, + mat_scale, + x_in, + flag, + vec_out, + profile_logs, + model_arch, + compute_kernel_type, + ) class UnProjOAllReduceAlgorithm(Enum): """UnprojOAllReduce algorithm""" - FP8MMA = "fp8mma" FP16MMA = "fp16mma" @@ -99,83 +98,102 @@ class UnProjOAllReduceWeightsConverter(TilertWeightsConverter): """UnProjOAllReduce weights converter""" @staticmethod - def _swizzle_qmma_16x32(mat_in: torch.Tensor) -> torch.Tensor: - assert mat_in.shape[-2] == 16 and mat_in.shape[-1] == 32 - assert mat_in.dtype == torch.float8_e4m3fn - # PTX isa fig.88 + def _swizzle_mma_16x16(mat_in: torch.Tensor) -> torch.Tensor: + assert mat_in.shape[-2] == 16 and mat_in.shape[-1] == 16 pre_shape = 
mat_in.shape[:-2] - mat_in = mat_in.reshape(*pre_shape, 2, 8, 2, 4, 4).transpose(-4, -3).transpose(-5, -4) - return mat_in.reshape(*pre_shape, 2 * 2, 8 * 4, 4).transpose(-3, -2) + mat_in = mat_in.reshape(*pre_shape, 2, 8, 2, 4, 2).transpose(-4, -3).transpose(-5, -4) + return mat_in.reshape(*pre_shape, 2 * 2, 8 * 4, 2).transpose(-3, -2) - def convert_to_fp8mma( - self, weights_list: list[torch.Tensor] + def convert_to_fp16mma_128cta( + self, + weights_list: list[torch.Tensor], ) -> tuple[torch.Tensor, torch.Tensor]: - """ - Convert the weights to fp8mma format. + """Convert weights to the packed kernel layout (GLM5 or DSV3.2).""" + with torch.inference_mode(): + mat, scales = weights_list + if scales.dtype != torch.float32: + scales = scales.to(torch.float32) - Args: - weights_list: List of weights. + dim = self.model_args.dim + block_size = self.model_args.block_size + sms = 128 + vec_dim = mat.shape[-1] + dim_per_sm = dim // sms + full_tiles = dim_per_sm // 16 + remainder_rows = dim_per_sm % 16 + stages = vec_dim // 512 + vec_scale_dim = vec_dim // block_size + scale_per_stage = vec_scale_dim // stages + + dim_scale_dim = dim // block_size + scales_per_full_tile = 2 if remainder_rows > 0 else 1 + rem_scales = 1 if remainder_rows > 0 else 0 + total_scale_slots = (full_tiles * scales_per_full_tile + rem_scales) * scale_per_stage + repeat_factor = 8 if remainder_rows == 0 else 16 + + sc = scales.reshape(dim_scale_dim, 1, vec_scale_dim) + sc = sc.repeat(1, repeat_factor, 1) + scales_per_cta = full_tiles * scales_per_full_tile + rem_scales + sc = ( + sc.reshape(sms, scales_per_cta, stages, scale_per_stage) + .transpose(1, 2) + .reshape(sms, stages, total_scale_slots) + .view(torch.float8_e4m3fn) + ) + sc_packed = sc - Returns: - Tuple of weights. 
- """ - args = self.model_args - assert args.arch_name == "deepseek_v3_2" or args.arch_name == "glm_5" - arch_name = args.arch_name - dim = args.dim - num_sms = 128 - if arch_name == "deepseek_v3_2": - num_sms = 112 - dim_per_sm = dim // num_sms - dim_scale_dim = dim // args.block_size + mat_per_sm = mat.reshape(sms, dim_per_sm, vec_dim) - with torch.inference_mode(): - mat_in, scales_trt = weights_list - vec_dim = mat_in.shape[-1] # 2048 for both deepseek_v3_2 and glm_5 - assert scales_trt.shape == (dim // args.block_size, vec_dim // args.block_size) + full_rows = full_tiles * 16 + mat_full = ( + mat_per_sm[:, :full_rows, :] + .reshape(sms, full_tiles, 16, stages, 512) + .transpose(2, 3) + .reshape(sms, full_tiles, stages, 16, 32, 16) + .transpose(3, 4) + .reshape(sms, full_tiles, stages, 32, 16, 16) + ) + mat_full = UnProjOAllReduceWeightsConverter._swizzle_mma_16x16(mat_full) + mat_full = mat_full.transpose(1, 2).reshape(sms, stages, -1) - weights_trt = mat_in.reshape(num_sms, dim_per_sm, vec_dim) - # dim_per_stage is 512 - stages = vec_dim // 512 - weights_trt = weights_trt.reshape(num_sms, dim_per_sm, stages, 512).transpose(1, 2) - - weights_trt = weights_trt.reshape( - num_sms, stages, dim_per_sm // 16, 16, 16, 32 - ).transpose(-2, -3) - weights_trt = self._swizzle_qmma_16x32(weights_trt) - weights_trt = weights_trt.reshape(num_sms, stages, -1) - - if arch_name == "glm_5": - if scales_trt.dtype != torch.float32: - print( - "Warning: UnProjOAllReduceWeightsConverter: " - + f"scales_trt.dtype: {scales_trt.dtype} " - + "is not float32, convert to float32." 
- ) - scales_trt = scales_trt.to(torch.float32) - # repeat 8 times - scales_trt = ( - scales_trt.reshape((dim_scale_dim, 1, -1)).repeat(1, 8, 1).reshape(num_sms, -1) + if remainder_rows > 0: + mat_rem_raw = mat_per_sm[:, full_rows:, :] + mat_rem_padded = torch.zeros( + sms, 16, vec_dim, dtype=mat_rem_raw.dtype, device=mat_rem_raw.device ) - else: # DS v3.2, use bfloat16 for scales - scales_trt = scales_trt.to(torch.bfloat16) - - return weights_trt.contiguous(), scales_trt.contiguous() + mat_rem_padded[:, :remainder_rows, :] = mat_rem_raw + mat_rem = ( + mat_rem_padded.reshape(sms, 1, 16, stages, 512) + .transpose(2, 3) + .reshape(sms, 1, stages, 16, 32, 16) + .transpose(3, 4) + .reshape(sms, 1, stages, 32, 16, 16) + ) + mat_rem = UnProjOAllReduceWeightsConverter._swizzle_mma_16x16(mat_rem) + mat_rem = mat_rem.transpose(1, 2).reshape(sms, stages, -1) + mat_combined = torch.cat([mat_full, mat_rem], dim=-1) + else: + mat_combined = mat_full - @staticmethod - def _swizzle_mma_16x16(mat_in: torch.Tensor) -> torch.Tensor: - assert mat_in.shape[-2] == 16 and mat_in.shape[-1] == 16 - # PTX isa fig.88 - pre_shape = mat_in.shape[:-2] - mat_in = mat_in.reshape(*pre_shape, 2, 8, 2, 4, 2).transpose(-4, -3).transpose(-5, -4) - return mat_in.reshape(*pre_shape, 2 * 2, 8 * 4, 2).transpose(-3, -2) + scales_padding = torch.zeros( + sms, + stages, + 128 - sc_packed.shape[-1], + dtype=torch.float8_e4m3fn, + device=mat.device, + ) + mat_all = torch.cat([mat_combined, sc_packed, scales_padding], dim=-1).contiguous() + dummy_scales = torch.zeros(1, dtype=torch.float32, device=mat.device) + return mat_all, dummy_scales def convert_to_fp16mma( self, weights_list: list[torch.Tensor], ) -> tuple[torch.Tensor, torch.Tensor]: """Convert common weights to TileRT FP16 MMA layout.""" - assert self.model_args.arch_name == "glm_5", "Only GLM-5 supports FP16 MMA" + if self.model_args.arch_name == "deepseek_v3_2": + return self.convert_to_fp16mma_128cta(weights_list) + assert 
self.model_args.arch_name == "glm_5", "Only GLM-5 and DSV3.2 support FP16 MMA" with torch.inference_mode(): mat, scales = weights_list @@ -187,32 +205,44 @@ def convert_to_fp16mma( ) scales = scales.to(torch.float32) - sms = 128 # use 128 sms for glm_5 - pages = 4 - scales = scales.reshape(6144 // 128, 1, 2048 // 128) + dim = self.model_args.dim + block_size = self.model_args.block_size + sms = 128 + vec_dim = mat.shape[-1] + dim_per_sm = dim // sms + tiles_per_stage = dim_per_sm // 16 + stages = vec_dim // 512 + dim_scale_dim = dim // block_size + vec_scale_dim = vec_dim // block_size + scale_per_stage = vec_scale_dim // stages + + scales = scales.reshape(dim_scale_dim, 1, vec_scale_dim) scales = scales.repeat(1, 8, 1) - scales = scales.reshape(128, 3, 4, 4).transpose(1, 2) - # to 128, 4, 12x4 - scales = scales.reshape(128, 4, 12).view(torch.float8_e4m3fn) + scales = ( + scales.reshape(sms, tiles_per_stage, stages, scale_per_stage) + .transpose(1, 2) + .reshape(sms, stages, tiles_per_stage * scale_per_stage) + .view(torch.float8_e4m3fn) + ) mat = ( - mat.reshape(128, 48, 2048) - .reshape(128, 3, 16, 4, 512) + mat.reshape(sms, dim_per_sm, vec_dim) + .reshape(sms, tiles_per_stage, 16, stages, 512) .transpose(2, 3) - .reshape(128, 3, 4, 16, 32, 16) + .reshape(sms, tiles_per_stage, stages, 16, 32, 16) .transpose(3, 4) - .reshape(128, 3, 4, 32, 16, 16) + .reshape(sms, tiles_per_stage, stages, 32, 16, 16) ) mat = UnProjOAllReduceWeightsConverter._swizzle_mma_16x16(mat) - mat = mat.transpose(1, 2).reshape(128, 4, -1) + mat = mat.transpose(1, 2).reshape(sms, stages, -1) scales_padding = torch.zeros( sms, - pages, + stages, 128 - scales.shape[-1], dtype=torch.float8_e4m3fn, device=mat.device, - ) # append 128-byte aligned scale: (128, 4, 24704) for glm_5 + ) mat_full = torch.cat([mat, scales, scales_padding], dim=-1).contiguous() dummy_scales = torch.zeros(1, dtype=torch.float32, device=mat.device) return mat_full, dummy_scales @@ -221,6 +251,15 @@ def convert_to_fp16mma( 
class UnProjOAllReduce(TileRTModule): """UnProjOAllReduce module""" + _SUPPORTED_ALGORITHMS = { + "deepseek_v3_2": [ + UnProjOAllReduceAlgorithm.FP16MMA, + ], + "glm_5": [ + UnProjOAllReduceAlgorithm.FP16MMA, + ], + } + def __init__( self, model_args: ModelArgs, @@ -228,7 +267,7 @@ def __init__( device_id: int = 0, ref_weights_alias: UnProjOAllReduceRefWeightsAlias | None = None, tilert_weights_alias: UnProjOAllReduceTilertWeightsAlias | None = None, - algorithm: UnProjOAllReduceAlgorithm = UnProjOAllReduceAlgorithm.FP8MMA, + algorithm: UnProjOAllReduceAlgorithm = UnProjOAllReduceAlgorithm.FP16MMA, ): super().__init__( self.__class__.__name__, @@ -253,17 +292,22 @@ def __init__( self.n_heads = self.model_args.n_heads self.head_dim = self.model_args.v_head_dim + if self.n_heads % self.num_devices == 0: + self.num_local_heads = self.n_heads // self.num_devices + else: + n_local = math.ceil(self.n_heads / self.num_devices) + if n_local % 2 != 0: + n_local += 1 + self.num_local_heads = n_local + self.block_size = self.model_args.block_size self.algorithm: UnProjOAllReduceAlgorithm = algorithm - # reference weights self.ref_unproj_o: torch.Tensor | None = None - # tilert weights self.tilert_weights: torch.Tensor | None = None self.tilert_scales: torch.Tensor | None = None - # tilert vars self.hidden_out: torch.Tensor | None = None self.profile_logs: torch.Tensor | None = None @@ -290,10 +334,55 @@ def device_sharding(self, weights_map: dict[str, torch.Tensor]) -> dict[str, tor """ unproj_o_weight = weights_map[self.ref_weights_alias.o_proj_weight] unproj_o_scale = weights_map[self.ref_weights_alias.o_proj_scale_inv] - unproj_o_weight = unproj_o_weight.reshape(self.dim, self.num_devices, -1) - unproj_o_weight = unproj_o_weight.transpose(0, 1) - unproj_o_scale = unproj_o_scale.reshape(self.dim // self.block_size, self.num_devices, -1) - unproj_o_scale = unproj_o_scale.transpose(0, 1) + + if self.n_heads % self.num_devices == 0: + unproj_o_weight = 
unproj_o_weight.reshape(self.dim, self.num_devices, -1) + unproj_o_weight = unproj_o_weight.transpose(0, 1) + unproj_o_scale = unproj_o_scale.reshape( + self.dim // self.block_size, self.num_devices, -1 + ) + unproj_o_scale = unproj_o_scale.transpose(0, 1) + else: + cols_per_head = self.head_dim + cols_per_dev = self.num_local_heads * cols_per_head + W = unproj_o_weight.view(self.dim, self.n_heads, cols_per_head) + + scale_cols_per_head = cols_per_head // self.block_size + scale_cols_per_dev = self.num_local_heads * scale_cols_per_head + S = unproj_o_scale.view(self.dim // self.block_size, self.n_heads, scale_cols_per_head) + + W_devs = [] + S_devs = [] + for dev in range(self.num_devices): + start = dev * self.num_local_heads + end = min(self.n_heads, start + self.num_local_heads) + real = max(0, end - start) + + dev_W = torch.zeros( + self.dim, + self.num_local_heads, + cols_per_head, + dtype=W.dtype, + device=W.device, + ) + if real > 0: + dev_W[:, :real] = W[:, start:end] + W_devs.append(dev_W.reshape(self.dim, cols_per_dev)) + + dev_S = torch.zeros( + self.dim // self.block_size, + self.num_local_heads, + scale_cols_per_head, + dtype=S.dtype, + device=S.device, + ) + if real > 0: + dev_S[:, :real] = S[:, start:end] + S_devs.append(dev_S.reshape(self.dim // self.block_size, scale_cols_per_dev)) + + unproj_o_weight = torch.stack(W_devs, dim=0) + unproj_o_scale = torch.stack(S_devs, dim=0) + return { self.tilert_weights_alias.unproj_weights: unproj_o_weight.contiguous(), self.tilert_weights_alias.unproj_scales: unproj_o_scale.contiguous(), @@ -413,12 +502,9 @@ def tilert_forward( flag, self.hidden_out, self.profile_logs, - self.algorithm.value, + model_arch=self.model_args.arch_name, + compute_kernel_type=self.algorithm.value, ) - if self.flag_enable_profiling_log: - parse_profile_log_tensor( - self.profile_logs, self.get_profile_log_path(), [(self.op_name, 0.0)] - ) return self.hidden_out def __call__( diff --git 
a/tilert/models/glm_5/_dsa_v32/temp_var_indices.py b/tilert/models/glm_5/_dsa_v32/temp_var_indices.py new file mode 100644 index 0000000..3a7af62 --- /dev/null +++ b/tilert/models/glm_5/_dsa_v32/temp_var_indices.py @@ -0,0 +1,118 @@ +"""Named indices for DSA temporary variables. + +Lets Python code reference temp_vars by name instead of magic numbers. + +Usage:: + + from tilert.models.glm_5._dsa_v32.temp_var_indices import Idx + + token_out = intermediates[Idx.TOKEN_OUT] # equivalent to intermediates[25] +""" + +from enum import IntEnum + + +class DsaTempVarIdx(IntEnum): + """Index constants for DSA temp_vars.""" + + Q = 0 + KV = 1 + KI = 2 + Q_NOPE_DOWN = 3 + Q_PE = 4 + IQ = 5 + IQ_RT = 6 + IDX_SCORES = 7 + IDX_LOGITS = 8 + IDX_SELECTS = 9 + Q_NOPE = 10 + O = 11 # noqa: E741 + O_ACC = 12 + O_LSE = 13 + O_LSE_ACC = 14 + PROJ_O = 15 + UNPROJ_O = 16 + SCORES = 17 + X_MLP_IN = 18 + UP_GATE = 19 + SEL_PROBS = 20 + SEL_INDICES = 21 + EXP_OUT = 22 + X_RMSNORM = 23 + LOGITS_OUT = 24 + TOKEN_OUT = 25 + EMBEDDING_RMSNORM = 26 + HIDDEN_RMSNORM = 27 + EH_PROJ = 28 + X_TENSOR = 29 + ROPE_FREQS = 30 + CUR_POS = 31 + TOKEN_ID = 32 + LAST_HIDDEN_STATES = 33 + DRAFT_TOKENS = 34 + PREDICTED_TOKENS = 35 + PREDICTED_HIDDEN = 36 + ACCEPTED_TOKENS = 37 + NEXT_DRAFT_TOKENS = 38 + X_QUANT = 39 + X_SCALE = 40 + MOE_UP_GATE = 41 + IDX_SEL_WS = 42 + MTP0_TOKEN_OUT = 43 + MTP1_TOKEN_OUT = 44 + MTP0_EXP_OUT = 45 + SAMPLING_SEED = 46 + SAMPLING_POSITIONS = 47 + SAMPLING_CONFIG = 48 + TOP_P_SCORES = 49 + TOP_P_DEBUG = 50 + LORA_SLOT_ID = 51 + LORA_RANK = 52 + TOP_N_LOG_PROBS = 53 + TOP_N_INDICES = 54 + LOGPROBS_FLAG = 55 + + +TEMP_VARS_SIZE = 56 + +Idx = DsaTempVarIdx + + +def validate_temp_vars_layout() -> None: + """Validate the temporary-variable index enum. + + Checks: + 1. Enum member count equals TEMP_VARS_SIZE. + 2. Indices are contiguous 0..TEMP_VARS_SIZE-1 with no gaps or duplicates. + 3. (If the backend is loaded) the backend temp_vars_size matches TEMP_VARS_SIZE. 
+ + Raises: + RuntimeError: If any validation check fails. + """ + members = list(DsaTempVarIdx) + + if len(members) != TEMP_VARS_SIZE: + raise RuntimeError( + f"DsaTempVarIdx has {len(members)} members but TEMP_VARS_SIZE={TEMP_VARS_SIZE}" + ) + + indices = sorted(m.value for m in members) + expected = list(range(TEMP_VARS_SIZE)) + if indices != expected: + missing = set(expected) - set(indices) + dupes = [i for i in indices if indices.count(i) > 1] + raise RuntimeError( + f"DsaTempVarIdx indices are not contiguous 0..{TEMP_VARS_SIZE - 1}. " + f"Missing: {missing}, Duplicates: {set(dupes)}" + ) + + try: + import torch + + cpp_size = torch.ops.tilert.dsa_temp_vars_size() + if cpp_size != TEMP_VARS_SIZE: + raise RuntimeError( + f"TEMP_VARS_SIZE={TEMP_VARS_SIZE} != " f"backend temp_vars_size={cpp_size}" + ) + except (AttributeError, RuntimeError): + pass diff --git a/python/models/glm_5/generator.py b/tilert/models/glm_5/generator.py similarity index 79% rename from python/models/glm_5/generator.py rename to tilert/models/glm_5/generator.py index 8bc9757..b3e8ddd 100644 --- a/python/models/glm_5/generator.py +++ b/tilert/models/glm_5/generator.py @@ -7,10 +7,10 @@ from transformers import AutoTokenizer from tilert import logger -from tilert.models.deepseek_v3_2.generator import stats_time -from tilert.models.deepseek_v3_2.model_args import ModelArgs -from tilert.models.deepseek_v3_2.modules.end2end import ShowHandsDSALayer -from tilert.models.deepseek_v3_2.temp_var_indices import Idx +from tilert.models.glm_5._dsa_v32.generator import stats_time +from tilert.models.glm_5._dsa_v32.model_args import ModelArgs +from tilert.models.glm_5._dsa_v32.modules.end2end import ShowHandsDSALayer +from tilert.models.glm_5._dsa_v32.temp_var_indices import Idx from tilert.tilert_init import tilert_init __all__ = [ @@ -64,10 +64,9 @@ def __init__( chat_template = f.read() self.tokenizer.chat_template = chat_template self.eos_id = self.tokenizer.eos_token_id - self.batch_size = 1 # 
fixed batch size to 1 for now + self.batch_size = 1 self.mtp_seq_len = 4 - # GLM5 uses multiple stop tokens self.stop_tokens = ["<|user|>", "<|endoftext|>", "<|observation|>", "<|assistant|>"] self.stop_token_ids: set[int] = set() for token in self.stop_tokens: @@ -75,13 +74,11 @@ def __init__( if len(token_ids) == 1: self.stop_token_ids.add(token_ids[0]) else: - # Try to get from added_tokens_encoder if ( hasattr(self.tokenizer, "added_tokens_encoder") and token in self.tokenizer.added_tokens_encoder ): self.stop_token_ids.add(self.tokenizer.added_tokens_encoder[token]) - # Always include eos_id if self.eos_id is not None: self.stop_token_ids.add(self.eos_id) logger.info(f"Stop token IDs: {self.stop_token_ids}") @@ -113,6 +110,37 @@ def from_pretrained(self) -> None: """Load the model weights from the given path.""" self.decode_layer.from_pretrained(self.model_weights_dir) + def extract_ffn_cache(self) -> tuple[dict[int, list], dict[int, set[str]]]: + """Extract MOE/MLP op objects and skip keys from current loaded weights. + + Returns: + Tuple of (cached_ffn_ops_per_device, skip_keys_per_device). 
+ """ + from tilert.models.glm_5._dsa_v32.modules.end2end import ( + _extract_ffn_ops, + _get_moe_weight_keys, + ) + + cached_ffn_ops: dict[int, list] = {} + skip_keys: dict[int, set[str]] = {} + for device_id in range(self.decode_layer.num_devices): + dsa = self.decode_layer._dsa_objects[device_id] + if dsa is None: + raise RuntimeError(f"Device {device_id} Dsa not available for cache extraction") + cached_ffn_ops[device_id] = _extract_ffn_ops(dsa) + skip_keys[device_id] = _get_moe_weight_keys(dsa) + return cached_ffn_ops, skip_keys + + def from_pretrained_with_cache( + self, + cached_ffn_ops_per_device: dict[int, list], + skip_keys_per_device: dict[int, set[str]], + ) -> None: + """Load weights reusing cached MOE/MLP ops.""" + self.decode_layer.from_pretrained_with_cache( + self.model_weights_dir, cached_ffn_ops_per_device, skip_keys_per_device + ) + def update_sampling_params( self, temperature: float = 1.0, @@ -120,11 +148,7 @@ def update_sampling_params( top_k: int = 256, use_topp: bool = True, ) -> None: - """Update sampling parameters for the next generation. - - Updates both the Python attributes and the CUDA sampling_config tensor - that the TileRT kernel reads during forward pass. - """ + """Update sampling parameters for the next generation.""" self.temperature = temperature self.decode_layer.update_sampling_config( temperature=temperature, top_p=top_p, top_k=top_k, use_topp=use_topp @@ -137,7 +161,7 @@ def generate( print_log: bool = True, with_mtp: bool | None = None, prompt_tokens: list[int] | None = None, - ) -> tuple[str, list[float], list[int]]: + ) -> tuple[str, list[float], list[int], int]: """Main function to load the model and perform single sequence generation. Args: @@ -149,7 +173,7 @@ def generate( and use these tokens directly (useful for exact-length benchmarking). Returns: - Tuple of (result_text, time_list, accepted_counts). + Tuple of (result_text, time_list, accepted_counts, prompt_len). accepted_counts is empty for non-MTP mode. 
""" active_mtp = with_mtp if with_mtp is not None else self.with_mtp @@ -158,10 +182,10 @@ def generate( self.decode_layer.set_sampling_seed(self.sampling_seed, with_mtp=active_mtp) if active_mtp: return self._generate_with_mtp(prompt, print_log, prompt_tokens=prompt_tokens) - result, time_list = self._generate_without_mtp( + result, time_list, prompt_len = self._generate_without_mtp( prompt, print_log, with_mtp=active_mtp, prompt_tokens=prompt_tokens ) - return result, time_list, [] # Empty accepted_counts for non-MTP + return result, time_list, [], prompt_len def _generate_without_mtp( self, @@ -169,7 +193,7 @@ def _generate_without_mtp( print_log: bool = True, with_mtp: bool = False, prompt_tokens: list[int] | None = None, - ) -> tuple[str, list[float]]: + ) -> tuple[str, list[float], int]: """Standard generation without MTP.""" if prompt_tokens is None: messages = [{"role": "user", "content": prompt}] @@ -179,11 +203,7 @@ def _generate_without_mtp( add_generation_prompt=True, enable_thinking=self.enable_thinking, ) - # adapt to transformers 5.2.0 - if not isinstance(prompt_tokens, list) and prompt_tokens.get("input_ids") is not None: - prompt_tokens = prompt_tokens["input_ids"] - assert prompt_tokens is not None max_seq_len = self.config.max_seq_len prompt_len = len(prompt_tokens) total_len = min(max_seq_len, self.max_new_tokens + prompt_len) @@ -211,14 +231,12 @@ def _generate_without_mtp( time_list.append(end_time - start_time) intermediates, *_ = multi_devices_results[0] - next_token = intermediates[Idx.TOKEN_OUT][0][0] # only the first token + next_token = intermediates[Idx.TOKEN_OUT][0][0] - # replace the next token with the prompt token if the prompt mask is True next_token = torch.where( prompt_mask[0, cur_pos_val], tokens[0, cur_pos_val], next_token ) tokens[0, cur_pos_val] = next_token - # Check if next_token is any of the stop tokens is_stop_token = next_token.item() in self.stop_token_ids finished |= torch.logical_and( ~prompt_mask[0, cur_pos_val], 
@@ -242,13 +260,11 @@ def _generate_without_mtp( stats_time(time_list, "==== Performance ====") print("\n") - # Reset sequence after generation, i.e. reset the cur_pos to 0 internally self.decode_layer.reset_sequence() completion_tokens = [] for _, toks in enumerate(tokens.tolist()): toks = toks[prompt_len : prompt_len + self.max_new_tokens] - # Find first stop token and truncate stop_idx = len(toks) for i, tok in enumerate(toks): if tok in self.stop_token_ids: @@ -259,14 +275,14 @@ def _generate_without_mtp( decoded_tokens = self.tokenizer.batch_decode(completion_tokens, skip_special_tokens=True) - return f"{decoded_tokens[0]}\n" if decoded_tokens else "", time_list + return f"{decoded_tokens[0]}\n" if decoded_tokens else "", time_list, prompt_len def _generate_with_mtp( self, prompt: str, print_log: bool = True, prompt_tokens: list[int] | None = None, - ) -> tuple[str, list[float], list[int]]: + ) -> tuple[str, list[float], list[int], int]: """Generation with MTP (Multi-Token Prediction) speculative decoding.""" if prompt_tokens is None: prompt_tokens = self.tokenizer.apply_chat_template( @@ -274,16 +290,11 @@ def _generate_with_mtp( add_generation_prompt=True, enable_thinking=self.enable_thinking, ) - # adapt to transformers 5.2.0 - if not isinstance(prompt_tokens, list) and prompt_tokens.get("input_ids") is not None: - prompt_tokens = prompt_tokens["input_ids"] - assert prompt_tokens is not None max_seq_len = self.config.max_seq_len prompt_len = len(prompt_tokens) total_len = min(max_seq_len, self.max_new_tokens + prompt_len) - # Output tokens buffer tokens = torch.full( (self.batch_size, total_len), -1, dtype=torch.long, device=self.default_device ) @@ -293,17 +304,14 @@ def _generate_with_mtp( prefill_time_list = [] decode_time_list = [] - decode_accepted_counts = [] # Only track decode phase for statistics - cur_pos = 0 # Current position in the output sequence + decode_accepted_counts = [] + cur_pos = 0 - # Prefill phase: process prompt tokens in 
non-overlapping chunks. - # Each chunk fills unique KV cache positions for both main model and MTP[0]. while cur_pos < prompt_len - 1: draft_end = min(cur_pos + self.mtp_seq_len, prompt_len) draft_tokens = tokens[0, cur_pos:draft_end].clone() actual_token_count = draft_tokens.shape[0] - # Pad if needed (use last token for padding) if actual_token_count < self.mtp_seq_len: pad_token = draft_tokens[-1].item() padding = torch.full( @@ -316,18 +324,13 @@ def _generate_with_mtp( draft_tokens = draft_tokens.reshape(1, self.mtp_seq_len).to(torch.int32) - # Provide the extra token for MTP[0]'s shifted input last position. - # MTP[0] needs tokens[cur_pos+1 : cur_pos+mtp_seq_len+1], so the - # extra token is at cur_pos + mtp_seq_len. mtp_extra_pos = cur_pos + self.mtp_seq_len if mtp_extra_pos < prompt_len: mtp_extra_token = int(tokens[0, mtp_extra_pos].item()) else: - # Beyond prompt — use last valid draft token as padding mtp_extra_token = int(tokens[0, draft_end - 1].item()) self.decode_layer.set_prefill_mtp_extra_token(mtp_extra_token) - # Tell GPU how many tokens are valid (for cur_pos advancement) self.decode_layer.set_prefill_valid_tokens(actual_token_count) start_time = time.time() @@ -335,27 +338,16 @@ def _generate_with_mtp( end_time = time.time() prefill_time_list.append(end_time - start_time) - # No overlap: advance by the full actual_token_count cur_pos += actual_token_count - # After no-overlap prefill, cur_pos may have overshot to prompt_len. - # Reset to prompt_len - 1 for correct decode start (first decode - # reprocesses the last prompt token position). 
cur_pos = prompt_len - 1 self.set_cur_pos(prompt_len - 1) - # Decode phase: speculative decoding - # Set prefill_valid_tokens to 0 to switch to decode mode self.decode_layer.set_prefill_valid_tokens(0) finished = False while cur_pos < total_len - 1 and not finished: - # Get next_draft_tokens from previous iteration - # (or use last prompt tokens for first decode) if cur_pos == prompt_len - 1: - # First decode iteration: use last prompt token repeated as placeholder drafts - # We can't use [t6, t7, t8, t9] because that would apply wrong RoPE positions - # (cur_pos=9 means positions 9,10,11,12, but t6 should be at position 6) last_token = tokens[0, prompt_len - 1].item() draft_tokens = torch.full( (self.mtp_seq_len,), @@ -365,7 +357,6 @@ def _generate_with_mtp( ) draft_tokens = draft_tokens.reshape(1, self.mtp_seq_len).to(torch.int32) else: - # Use next_draft_tokens from previous iteration draft_tokens = self.decode_layer.get_next_draft_tokens(0).reshape( 1, self.mtp_seq_len ) @@ -376,11 +367,9 @@ def _generate_with_mtp( decode_time_list.append(end_time - start_time) num_accepted = self.decode_layer.get_num_accepted(0) - # Use predicted_tokens for output (not next_draft_tokens which is for next iteration) predicted_tokens = self.decode_layer.get_predicted_tokens(0).flatten() decode_accepted_counts.append(num_accepted) - # Add accepted tokens to output num_output_tokens = num_accepted for i in range(num_output_tokens): if cur_pos + 1 + i >= total_len: @@ -388,12 +377,10 @@ def _generate_with_mtp( new_token = int(predicted_tokens[i].item()) tokens[0, cur_pos + 1 + i] = new_token - # Print generated token if cur_pos + 1 + i >= prompt_len and print_log: decoded_text = self.tokenizer.decode([new_token], skip_special_tokens=True) print(decoded_text, end="", flush=True) - # Check for any stop token if new_token in self.stop_token_ids: finished = True break @@ -414,7 +401,6 @@ def _generate_with_mtp( f"min={min_accepted}, max={max_accepted}" ) - # Calculate correct TPS 
accounting for MTP's multiple tokens per call if decode_time_list: total_decode_time = sum(decode_time_list) effective_tps = total_tokens / total_decode_time if total_decode_time > 0 else 0 @@ -427,16 +413,12 @@ def _generate_with_mtp( print("\n") - # Reset sequence after generation self.decode_layer.reset_sequence() - # Extract completion tokens completion_tokens = [] for _, toks in enumerate(tokens.tolist()): toks = toks[prompt_len : prompt_len + self.max_new_tokens] - # Remove -1 padding and tokens after any stop token toks = [t for t in toks if t != -1] - # Find first stop token and truncate stop_idx = len(toks) for i, tok in enumerate(toks): if tok in self.stop_token_ids: @@ -451,6 +433,7 @@ def _generate_with_mtp( f"{decoded_tokens[0]}\n" if decoded_tokens else "", decode_time_list, decode_accepted_counts, + prompt_len, ) def inject_cache( @@ -490,7 +473,6 @@ def inject_cache( logger.warning("inject_cache called with empty layer_caches") return - # Infer seqlen from first tensor if end_pos not specified first_ki, _, _ = layer_caches[0] seqlen = first_ki.size(0) if end_pos is None: @@ -509,13 +491,8 @@ def inject_cache( logger.warning(f"Layer index {layer_id} is out of bounds, skipping.") break - # GLM-5 cache layout: 3 tensors per layer (ki, kv, pe) - # Based on CacheVarsGlm5: k_cache, kv_cache, pe_cache base_idx = layer_id * 3 - # Copy to device and inject into cache - # Cache layout: [batch=1, max_seq_len, dim] - # External data: [seqlen, dim] ki_src = ki[:cache_len].to(f"cuda:{device_id}") kv_src = kv[:cache_len].to(f"cuda:{device_id}") pe_src = pe[:cache_len].to(f"cuda:{device_id}") @@ -527,14 +504,11 @@ def inject_cache( logger.info(f"Cache injection completed for {num_devices} devices") def set_cur_pos(self, cur_pos: int) -> None: - """Set the current position for RoPE in C++ backend. - - This should be called after inject_cache() to ensure the C++ global - g_cur_pos matches the injected cache length. 
This is critical for - correct RoPE position encoding during continued generation. + """Set the current position for RoPE. - For MTP mode, sets the GPU tensor at intermediates[31] directly. - For non-MTP mode, calls the C++ dsa_show_hands_set_cur_pos_glm5 API. + This should be called after inject_cache() to ensure the runtime position + matches the injected cache length, for correct RoPE position encoding + during continued generation. Args: cur_pos: The current sequence position (typically the length of prefilled tokens). @@ -545,14 +519,12 @@ def set_cur_pos(self, cur_pos: int) -> None: >>> # Now generate continues from the correct position """ if self.with_mtp: - # MTP E2E uses cur_pos tensor in TempVars num_devices = self.decode_layer.num_devices for device_id in range(num_devices): intermediates, _, _, _ = self.decode_layer._get_device_result(device_id) cur_pos_tensor = intermediates[Idx.CUR_POS] cur_pos_tensor.fill_(cur_pos) else: - # Non-MTP uses the C++ global g_cur_pos torch.ops.tilert.dsa_show_hands_set_cur_pos_glm5(cur_pos) logger.info(f"Set cur_pos to {cur_pos}") @@ -560,8 +532,7 @@ def inject_last_hidden_state(self, last_hidden_state: torch.Tensor) -> None: """Inject the last hidden state for MTP mode. For MTP (Multi-Token Prediction), the MTP preprocess layer needs the - last hidden state from the main model's last token. This method injects - the hidden state into intermediates[33] (last_hidden_states slot). + last hidden state from the main model's last token. Args: last_hidden_state: [hidden_size] or [1, hidden_size] BF16 tensor. 
@@ -577,14 +548,12 @@ def inject_last_hidden_state(self, last_hidden_state: torch.Tensor) -> None: logger.warning("inject_last_hidden_state called but with_mtp is False, skipping") return - # Normalize shape to [1, hidden_size] if last_hidden_state.dim() == 1: last_hidden_state = last_hidden_state.unsqueeze(0) num_devices = self.decode_layer.num_devices for device_id in range(num_devices): intermediates, _, _, _ = self.decode_layer._get_device_result(device_id) - # Shape: [batch=1, seq=4, hidden_size], we set seq[0] since it's the last token lhs_tensor = intermediates[Idx.LAST_HIDDEN_STATES] lhs_src = last_hidden_state.to(f"cuda:{device_id}") lhs_tensor[0, 0, :].copy_(lhs_src.squeeze(0)) diff --git a/python/models/glm_5/model_args.py b/tilert/models/glm_5/model_args.py similarity index 92% rename from python/models/glm_5/model_args.py rename to tilert/models/glm_5/model_args.py index a64ed6f..74e830c 100644 --- a/python/models/glm_5/model_args.py +++ b/tilert/models/glm_5/model_args.py @@ -3,7 +3,7 @@ from dataclasses import dataclass from typing import Literal -from tilert.models.deepseek_v3_2.model_args import ModelArgs +from tilert.models.glm_5._dsa_v32.model_args import ModelArgs __all__ = [ "ModelArgsGLM5", @@ -52,7 +52,7 @@ class ModelArgsGLM5(ModelArgs): arch_name = "glm_5" - max_batch_size: int = 1 # NOTE: the current implementation only supports a batch size being 1 + max_batch_size: int = 1 max_seq_len: int = 202752 dtype: Literal["bf16", "fp8"] = "fp8" scale_fmt: str | None = None @@ -65,23 +65,18 @@ class ModelArgsGLM5(ModelArgs): n_dense_layers: int = 3 n_heads: int = 64 - # moe n_routed_experts: int = 256 n_shared_experts: int = 1 n_activated_experts: int = 8 - # n_expert_groups: int = 8 - # n_limited_groups: int = 4 score_func: Literal["softmax", "sigmoid"] = "softmax" route_scale: float = 2.5 - # mla q_lora_rank: int = 2048 kv_lora_rank: int = 512 qk_nope_head_dim: int = 192 qk_rope_head_dim: int = 64 v_head_dim: int = 256 - # yarn original_seq_len: 
int | None = None rope_theta: float = 1000000.0 rope_factor: float | None = None @@ -89,12 +84,10 @@ class ModelArgsGLM5(ModelArgs): beta_slow: int | None = None mscale: float = 1.0 - # index index_n_heads: int = 32 index_head_dim: int = 128 index_topk: int = 2048 - # quant block_size: int = 128 eps: float = 1e-5 diff --git a/python/models/preprocess/weight_converter.py b/tilert/models/preprocess/weight_converter.py similarity index 90% rename from python/models/preprocess/weight_converter.py rename to tilert/models/preprocess/weight_converter.py index c0926aa..4973f69 100644 --- a/python/models/preprocess/weight_converter.py +++ b/tilert/models/preprocess/weight_converter.py @@ -2,7 +2,7 @@ import os import pprint from collections import OrderedDict -from typing import Any, TypedDict +from typing import Any, TypedDict, cast import torch from safetensors.torch import load_file, save_file @@ -10,7 +10,7 @@ from tilert import logger from tilert.models.deepseek_v3_2.model_args import ModelArgs from tilert.models.deepseek_v3_2.model_args import ModelArgs as ModelArgsDsav32 -from tilert.models.deepseek_v3_2.modules.mla import Mla +from tilert.models.deepseek_v3_2.modules.mla_v2 import PureMlaV2, SparseSelectMlaV2 from tilert.models.deepseek_v3_2.ops.down_allreduce import DownAllReduce from tilert.models.deepseek_v3_2.ops.eh_proj_allreduce import EHProjAllReduce from tilert.models.deepseek_v3_2.ops.expert_down_allreduce import ExpertDownAllReduce @@ -36,13 +36,13 @@ class WeightConverter: def __init__( self, - model_args: ModelArgs, + model_args: ModelArgs | ModelArgsGLM5, num_devices: int, model_dir: str, save_dir: str, test_mode: bool = False, ) -> None: - self.model_args = model_args + self.model_args = cast(ModelArgs, model_args) self.num_devices = num_devices self.model_dir = model_dir self.save_dir = save_dir @@ -62,8 +62,6 @@ def __init__( self.index_file = "model.safetensors.index.json" self.__check_dir() - # specially treated the embedding, norm, and head 
weights - # at the beginning and end of the model self.emb_name = "model.embed_tokens.weight" self.norm_name = "model.norm.weight" self.head_name = "lm_head.weight" @@ -156,17 +154,16 @@ def save_file_sharded( max_size_bytes = self.parse_size(max_shard_size) - tensor_nums = len(weights_dict) # placeholder for number sharded files + tensor_nums = len(weights_dict) shards: list[ShardInfo] = [] current_shard: dict[str, torch.Tensor] = {} current_size = 0 - shard_index = 1 # first shard is for embedding + shard_index = 1 def get_shard_filename(shard_index: int) -> str: return f"{base_filename}-{shard_index:05d}-of-{tensor_nums:05d}.safetensors" - # Save embedding tensor to separate file save_file(self.emb_weights_dict, get_shard_filename(shard_index)) shards.append( { @@ -184,10 +181,8 @@ def get_shard_filename(shard_index: int) -> str: {name: self.get_tensor_size_bytes(tensor) for name, tensor in dev_tensors.items()} ) - # If adding this tensor would exceed max size, start a new shard for tensor_name, tensor_size in tensor_sizes.items(): if current_size + tensor_size > max_size_bytes and current_shard: - # Save current shard shard_filename = get_shard_filename(shard_index) logger.info(f"Saving shard {shard_index} to {shard_filename}") save_file(current_shard, shard_filename) @@ -199,11 +194,9 @@ def get_shard_filename(shard_index: int) -> str: current_size = 0 shard_index += 1 - # Add tensor to current shard current_shard[tensor_name] = dev_tensors[tensor_name] current_size += tensor_size - # Save the last shard for the current device if current_shard: shard_filename = get_shard_filename(shard_index) logger.info(f"Saving shard {shard_index} to {shard_filename}") @@ -213,7 +206,6 @@ def get_shard_filename(shard_index: int) -> str: current_size = 0 shard_index += 1 - # Update shard filenames with correct total count total_shards = len(shards) for i, shard in enumerate(shards, 1): old_filename = shard["filename"] @@ -253,24 +245,31 @@ def transform_mla( weights_hf: 
dict[str, torch.Tensor], layer_id: int, ) -> dict[str, dict[str, torch.Tensor]]: - mla_weights_map: dict[str, dict[str, torch.Tensor]] = {} - for dev_id in range(self.num_devices): - mla_weights_map.setdefault(f"dev_{dev_id}", {}) - mla = Mla(self.model_args, device_id=0, num_devices=self.num_devices) - mla_raw_dict = { - _k: weights_hf[f"model.layers.{layer_id}.{_k}"] for _k in mla.get_ref_weights_alias() + """Shard MLA weights across devices.""" + mla_weights: dict[str, dict[str, torch.Tensor]] = { + f"dev_{dev_id}": {} for dev_id in range(self.num_devices) } - mla_sharded_dict = mla.device_sharding(mla_raw_dict) - for dev_id in range(self.num_devices): - for key, value in mla_sharded_dict.items(): - mla_weights_map[f"dev_{dev_id}"].update({key: value[dev_id].contiguous()}) - mla_weights = {} - for dev_id in range(self.num_devices): - mla_weights_dev = {} - for key in mla_weights_map[f"dev_{dev_id}"].keys(): - mla_weights_dev.update({key: mla_weights_map[f"dev_{dev_id}"][key]}) - mla_weights.update({f"dev_{dev_id}": mla_weights_dev}) + sparse_mla = SparseSelectMlaV2(self.model_args, device_id=0, num_devices=1) + sparse_raw_dict = { + _k: weights_hf[f"model.layers.{layer_id}.{_k}"] + for _k in sparse_mla.get_ref_weights_alias() + } + sparse_sharded = sparse_mla.device_sharding(sparse_raw_dict) + for key, value in sparse_sharded.items(): + mla_weights["dev_0"][key] = value[0].contiguous() + + num_pure_mla_devices = self.num_devices - 1 + pure_mla = PureMlaV2(self.model_args, device_id=0, num_devices=num_pure_mla_devices) + pure_raw_dict = { + _k: weights_hf[f"model.layers.{layer_id}.{_k}"] + for _k in pure_mla.get_ref_weights_alias() + } + pure_sharded = pure_mla.device_sharding(pure_raw_dict) + for shard_idx in range(num_pure_mla_devices): + gpu_id = shard_idx + 1 + for key, value in pure_sharded.items(): + mla_weights[f"dev_{gpu_id}"][key] = value[shard_idx].contiguous() return mla_weights @@ -328,7 +327,6 @@ def transform_mlp( layer_id: int, ) -> dict[str, 
dict[str, torch.Tensor]]: """Transform MLP weights.""" - print(RMSNormUpGateSiLU) rmsnorm_up_gate_silu = RMSNormUpGateSiLU( self.model_args, device_id=0, num_devices=self.num_devices ) @@ -384,13 +382,7 @@ def transform_mtp( weights_hf: dict[str, torch.Tensor], layer_id: int, ) -> dict[str, dict[str, torch.Tensor]]: - """Transform MTP weights. - - Transformations applied: - - enorm.weight: Direct use (fp32) - - hnorm.weight: Direct use (fp32) - - eh_proj.weight: Split along dim 1, reshape [7168, 1792] -> [128, 7, 56, 256] - """ + """Transform MTP weights.""" enorm_weight_key = f"model.layers.{layer_id}.enorm.weight" hnorm_weight_key = f"model.layers.{layer_id}.hnorm.weight" enorm_weight = weights_hf[enorm_weight_key] @@ -530,7 +522,7 @@ def _sort_key(filename: str) -> tuple[int, int]: try: return _get_layer_num(filename) except ValueError: - return (999999, 999999) # If layer number not found, put at the end + return (999999, 999999) tilert_weights = sorted( self.converted_weights_dict, key=lambda x: _sort_key(x), reverse=False @@ -566,7 +558,6 @@ def append_mtp_weights_to_safetensors( """ torch.set_default_device(self.default_device) - # Load existing index.json existing_index_file = os.path.join(existing_save_dir, "model.safetensors.index.json") if not os.path.exists(existing_index_file): raise ValueError(f"Existing index file not found: {existing_index_file}") @@ -577,11 +568,9 @@ def append_mtp_weights_to_safetensors( existing_weight_map: dict[str, str] = existing_index["weight_map"] existing_total_size: int = existing_index["metadata"]["total_size"] - # Find the next shard number existing_shards = set(existing_weight_map.values()) max_shard_num = 0 for shard_name in existing_shards: - # Parse shard number from filename like "model.safetensors-00001-of-00010.safetensors" parts = shard_name.replace(".safetensors", "").split("-") if len(parts) >= 2: try: @@ -594,14 +583,11 @@ def append_mtp_weights_to_safetensors( f"Found {len(existing_shards)} existing shards, 
max shard number: {max_shard_num}" ) - # Convert MTP layer (layer 61) weights - mtp_layer_idx = self.num_dense_layers + self.num_moe_layers # 61 + mtp_layer_idx = self.num_dense_layers + self.num_moe_layers logger.info(f"Converting MTP layer {mtp_layer_idx} weights...") mla_weights, mlp_weights, mtp_weights = self.convert_a_layer(mtp_layer_idx) - # Collect MTP layer weights for all devices - # Clone tensors to avoid shared memory issues when saving to safetensors mtp_layer_weights: dict[str, torch.Tensor] = {} for weights_group in [mla_weights, mlp_weights, mtp_weights]: for dev, params in weights_group.items(): @@ -611,21 +597,17 @@ def append_mtp_weights_to_safetensors( logger.info(f"Collected {len(mtp_layer_weights)} MTP layer weight tensors") - # Calculate size of new weights new_weights_size = sum(self.get_tensor_size_bytes(t) for t in mtp_layer_weights.values()) - # Save MTP weights to new shard file(s) - # Use a separate naming scheme to avoid modifying existing shards max_size_bytes = self.parse_size(max_shard_size) new_shards: list[ShardInfo] = [] current_shard: dict[str, torch.Tensor] = {} current_size = 0 - mtp_shard_index = 1 # Start from 1 for MTP shards + mtp_shard_index = 1 for tensor_name, tensor in mtp_layer_weights.items(): tensor_size = self.get_tensor_size_bytes(tensor) if current_size + tensor_size > max_size_bytes and current_shard: - # Save current shard with MTP-specific naming shard_filename = f"model_mtp_layer61-{mtp_shard_index:05d}.safetensors" shard_path = os.path.join(existing_save_dir, shard_filename) logger.info(f"Saving MTP shard to {shard_filename}") @@ -640,7 +622,6 @@ def append_mtp_weights_to_safetensors( current_shard[tensor_name] = tensor current_size += tensor_size - # Save the last shard if current_shard: shard_filename = f"model_mtp_layer61-{mtp_shard_index:05d}.safetensors" shard_path = os.path.join(existing_save_dir, shard_filename) @@ -648,12 +629,10 @@ def append_mtp_weights_to_safetensors( save_file(current_shard, 
shard_path) new_shards.append({"filename": shard_filename, "tensors": list(current_shard.keys())}) - # Update weight_map with new MTP weights (existing shards remain unchanged) for shard in new_shards: for tensor_name in shard["tensors"]: existing_weight_map[tensor_name] = shard["filename"] - # Update index.json updated_index = { "metadata": {"total_size": existing_total_size + new_weights_size}, "weight_map": existing_weight_map, @@ -683,6 +662,7 @@ def append_mtp_weights_to_safetensors( args = parser.parse_args() model_type = args.model_type + model_args: ModelArgsDsav32 | ModelArgsGLM5 if model_type == "deepseek-v32": model_args = ModelArgsDsav32() elif model_type == "glm-5": diff --git a/python/models/utils.py b/tilert/models/utils.py similarity index 76% rename from python/models/utils.py rename to tilert/models/utils.py index b5e81e5..8caaaee 100644 --- a/python/models/utils.py +++ b/tilert/models/utils.py @@ -10,13 +10,25 @@ import torch +_FACTOR_OVERRIDE_UNSET = object() +_THETA_OVERRIDE_UNSET = object() -def precompute_freqs_cis(args) -> torch.Tensor: # type: ignore + +def precompute_freqs_cis( # type: ignore[no-untyped-def] + args, + *, + factor_override=_FACTOR_OVERRIDE_UNSET, + theta_override=_THETA_OVERRIDE_UNSET, +) -> torch.Tensor: """ Pre-computes frequency-based complex exponential values for rotary positional embeddings. Args: args (ModelArgs): Model arguments containing positional embedding parameters. + factor_override: If unset, ``args.rope_factor`` is used. Pass a + numeric value to override the factor inline. + theta_override: If unset, ``args.rope_theta`` is used. Pass a numeric + value to override the rope base. ``None`` is rejected. Returns: torch.Tensor: Precomputed complex exponential values for positional embeddings. 
@@ -25,8 +37,8 @@ def precompute_freqs_cis(args) -> torch.Tensor: # type: ignore seqlen = args.max_seq_len beta_fast = args.beta_fast beta_slow = args.beta_slow - base = args.rope_theta - factor = args.rope_factor + base = args.rope_theta if theta_override is _THETA_OVERRIDE_UNSET else theta_override + factor = args.rope_factor if factor_override is _FACTOR_OVERRIDE_UNSET else factor_override def find_correction_dim(num_rotations: float, dim: int, base: float, max_seq_len: int) -> float: """ @@ -106,27 +118,33 @@ def linear_ramp_factor(min_value: float, max_value: float, dim: int) -> torch.Te return torch.polar(torch.ones_like(freqs), freqs) -def apply_rotary_emb(x_in: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor: - """ - - Applies rotary positional embeddings to the input tensor. +def apply_rotary_emb( + x_in: torch.Tensor, freqs_cis: torch.Tensor, interleaved: bool = True +) -> torch.Tensor: + """Applies rotary positional embeddings to the input tensor. Args: - x (torch.Tensor): Input tensor with positional embeddings to be applied. - freqs_cis (torch.Tensor): Precomputed complex exponential values for - positional embeddings. + x_in: Input tensor with positional embeddings to be applied. + freqs_cis: Precomputed complex exponential values for positional embeddings. + interleaved: If True (default), adjacent pairs (x0,x1),(x2,x3)... form + complex numbers. If False, half-half layout: (x0,x_{d/2}),(x1,x_{d/2+1})... + The DeepSeek-V3.2-Exp indexer uses interleaved=False. Returns: torch.Tensor: Tensor with rotary embeddings applied. 
""" dtype = x_in.dtype - x_in = torch.view_as_complex(x_in.float().view(*x_in.shape[:-1], -1, 2)) + shape = x_in.shape + if not interleaved: + x_in = x_in.view(*shape[:-1], 2, -1).transpose(-1, -2).contiguous() + x_in = torch.view_as_complex(x_in.float().view(*shape[:-1], -1, 2)) freqs_cis = freqs_cis.view(1, x_in.size(1), 1, x_in.size(-1)) y_out = torch.view_as_real(x_in * freqs_cis).flatten(3) + if not interleaved: + y_out = torch.cat([y_out[..., 0::2], y_out[..., 1::2]], dim=-1) return y_out.to(dtype) -# enumerate swizzle mode class SwizzleMode(IntEnum): """Swizzle mode.""" @@ -136,7 +154,6 @@ class SwizzleMode(IntEnum): SWIZZLE_128B = 128 // 16 -# See CUDA C++ programming Guide 10.29.3.2 for more details. def gen_tensor_swizzle_map_1d( rows: int, cols_in_16bytes: int, swizzle_mode: SwizzleMode = SwizzleMode.SWIZZLE_128B ) -> torch.Tensor: diff --git a/python/tilert_init.py b/tilert/tilert_init.py similarity index 100% rename from python/tilert_init.py rename to tilert/tilert_init.py diff --git a/python/utils.py b/tilert/utils.py similarity index 62% rename from python/utils.py rename to tilert/utils.py index 47335d7..4cc3b47 100644 --- a/python/utils.py +++ b/tilert/utils.py @@ -5,28 +5,35 @@ import torch __all__ = [ + "alloc_misc_ws", "cosine_similarity", "relative_l2_error", "get_profile_log_tensor", "SLICES_FOR_TILERT_OP", ] - SLICES_FOR_TILERT_OP = 1 def get_profile_log_tensor( - device_index: int = 0, device: torch.device | None = None, num_max_insts: int = 64 -) -> torch.Tensor: + device_index: int = 0, + device: torch.device | None = None, + num_max_insts: int = 64, +) -> torch.Tensor | None: """Get a profile log tensor for the given device index. + Returns ``None`` when no CUDA GPUs are visible so the offline + weight-conversion path can run with ``CUDA_VISIBLE_DEVICES=""``. + Args: device_index: The index of the device. device: The device to use. Returns: - A profile log tensor. + A profile log tensor, or ``None`` if CUDA is unavailable. 
""" + if not torch.cuda.is_available(): + return None if device is None: device = torch.device("cuda", device_index) @@ -38,6 +45,23 @@ def get_profile_log_tensor( ) +def alloc_misc_ws( + num_max_insts: int = 64, + device_id: int = 0, +) -> torch.Tensor: + """Allocate a misc workspace tensor. + + Args: + num_max_insts: Maximum number of profiled instructions. + device_id: CUDA device index to allocate on. + + Returns: + A zeroed int64 tensor of shape (total_rows, num_sm, 16) on the + requested CUDA device. + """ + return torch.ops.tilert.alloc_misc_ws(num_max_insts, device_id) + + def cosine_similarity(gt: torch.Tensor, out: torch.Tensor) -> torch.Tensor: """Calculate the cosine similarity.