From 5735b5bb8547d0419a9175382622ce29ff051f32 Mon Sep 17 00:00:00 2001 From: dglr Date: Wed, 17 Apr 2024 00:41:13 +0800 Subject: [PATCH 01/27] complete the float type cholesky operator --- kernels/cholesky/cholesky.cpp | 169 ++++ kernels/cholesky/cholesky.h | 50 + kernels/cholesky/cholesky_union1.mlu | 940 ++++++++++++++++++ mlu_op.h | 54 + .../pb_gtest/src/zoo/cholesky/cholesky.cpp | 344 +++++++ .../pb_gtest/src/zoo/cholesky/cholesky.h | 48 + .../src/zoo/cholesky/testcase/case_0.prototxt | 37 + 7 files changed, 1642 insertions(+) create mode 100644 kernels/cholesky/cholesky.cpp create mode 100644 kernels/cholesky/cholesky.h create mode 100644 kernels/cholesky/cholesky_union1.mlu create mode 100644 test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/cholesky.cpp create mode 100644 test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/cholesky.h create mode 100644 test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/testcase/case_0.prototxt diff --git a/kernels/cholesky/cholesky.cpp b/kernels/cholesky/cholesky.cpp new file mode 100644 index 000000000..09b60e4d3 --- /dev/null +++ b/kernels/cholesky/cholesky.cpp @@ -0,0 +1,169 @@ +#include "cholesky.h" + + + + + +//dA:输入被分解方阵 +//dC:cholesky分解结果方阵 +//trans -> false: col major; true: row major +//uplo -> false: lower; true: upper +//ldda:leading dimension +//batch=1 +mluOpStatus_t MLUOP_WIN_API +mluOpCholesky(mluOpHandle_t handle,const mluOpTensorDescriptor_t input_desc,float* d_input, const mluOpTensorDescriptor_t output_desc, float* d_output,bool upper) +{ + PARAM_CHECK("mluOpCholesky", handle != NULL); + PARAM_CHECK("mluOpCholesky", input_desc != NULL); + PARAM_CHECK("mluOpCholesky", d_input != NULL); + PARAM_CHECK("mluOpCholesky", output_desc != NULL); + PARAM_CHECK("mluOpCholesky", d_output != NULL); + + PARAM_CHECK("mluOpCholesky", input_desc->dim == 2||input_desc->dim == 3); + PARAM_CHECK("mluOpCholesky", output_desc->dim == input_desc->dim); + PARAM_CHECK("mluOpCholesky", input_desc->dims[0] > 0); + PARAM_CHECK("mluOpCholesky", input_desc->dims[1] > 0); + PARAM_CHECK("mluOpCholesky", output_desc->dims[0] > 0); + PARAM_CHECK("mluOpCholesky", output_desc->dims[1] > 0); + if(input_desc->dim == 3) + { + PARAM_CHECK("mluOpCholesky", input_desc->dims[2] > 0); + PARAM_CHECK("mluOpCholesky", output_desc->dims[2] > 0); + } + + + int recnb = REC_NB; + int gbstep = 0; + int dim = input_desc->dim; + bool is_row_major = (input_desc->strides)[dim-1]==1; + + int size_a = 0, lda = 0, size_c = 0, ldc = 0; + int batch_size = 1; + if(dim == 2) + { + size_a = input_desc->dims[0]; + lda = input_desc->dims[1]; + size_c = output_desc->dims[0]; + ldc = output_desc->dims[1]; + } + else if(dim == 3) + { + batch_size = input_desc->dims[0]; + size_a = input_desc->dims[1]; + lda = input_desc->dims[2]; + size_c = output_desc->dims[1]; + ldc = output_desc->dims[2]; + } + + + float* work_space; + float* work_space_h; + CNRT_CHECK(cnrtMalloc((void **)&work_space, NB*NB*sizeof(float))); + CNRT_CHECK(cnrtMemset(work_space, 0, NB*NB*sizeof(float))); + work_space_h = (float*)malloc(NB*NB*sizeof(float)); + PARAM_CHECK("mluOpCholesky", lda >= size_a); + PARAM_CHECK("mluOpCholesky", ldc >= size_c); + + cnrtQueue_t queue; + mluOpGetQueue(handle,&queue); + // CNRT_CHECK(cnrtSetDevice(0)); + // CNRT_CHECK(cnrtQueueCreate(&queue)); + + // cnrtNotifier_t start, end; + // CNRT_CHECK(cnrtNotifierCreate(&start)); + // CNRT_CHECK(cnrtNotifierCreate(&end)); + + int jb; + const float s_one = 1.0; + const float s_neg_one = -1.0; + + if(upper == true) + { + printf("start transpose 1\n"); + 
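+        // upper == true: the factorization loop below is written for the
+        // lower-triangular case, so transpose the input first and transpose
+        // the result back after the loop.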
CHECK_RETURN("mluOpCholesky", + transpose(size_a,d_input,d_output,handle)); + } + else + { + CNRT_CHECK(cnrtMemcpy(d_output, d_input, sizeof(float)*size_a*lda, CNRT_MEM_TRANS_DIR_DEV2DEV)); + } + cnrtQueueSync(queue); + + //TODO:检查拷贝开销 + + // if(upper == true) + // { + // CHECK_RETURN("mluOpCholesky", + // transpose(size_a,d_input,d_output,handle)); + // //print d_output + // cnrtMemcpy(work_space_h, d_output, sizeof(float)*size_a*size_a, CNRT_MEM_TRANS_DIR_DEV2HOST); + // //print work_space_h + // // printf("matrix after transpose:\n"); + // // for(int i = 0; i < size_a; i++) + // // { + // // for(int j = 0; j < size_a; j++) + // // { + // // printf("%.2f ",work_space_h[i*size_a+j]); + // // } + // // printf("\n"); + // // } + + // } + // else + // { + int row = is_row_major ? lda : size_a; + // int nb = row > 512 ? NB : (NB/2); + int nb = NB; + for(int j = 0; j < row; j+=nb) + { + jb = std::min(nb, row-j); + CHECK_RETURN("mluOpCholesky", + ssyrk(false,is_row_major,jb,j,OFFSET_ROW(d_output,j,0),lda,OFFSET_ROW(d_output,j,j),lda,handle)); + cnrtQueueSync(queue); + CHECK_RETURN("mluOpCholesky", + mlu_spotrf_rectile(is_row_major,false,jb,recnb,OFFSET_ROW(d_output,j,j),lda,j, handle)); + // cnrtQueueSync(queue); + if(j+jb < row) + { + CHECK_RETURN("mluOpCholesky", + sgemm(!is_row_major,is_row_major,row-j-jb,jb,j,-1.0f,1.0f, + OFFSET_ROW(d_output,j+jb,0),lda, + OFFSET_ROW(d_output,j,0),lda, + OFFSET_ROW(d_output,j+jb,j),lda, handle)); + cnrtQueueSync(queue); + } + if(j+jb < row) + { + CHECK_RETURN("mluOpCholesky", + strsm(false,is_row_major,jb,row-j-jb,OFFSET_ROW(d_output,j,j),lda,OFFSET_ROW(d_output,j+jb,j),lda, handle)); + cnrtQueueSync(queue); + } + } + // } + + if(upper) + { + cnrtQueueSync(queue); + CHECK_RETURN("mluOpCholesky", + transpose(size_a,d_output,d_output,handle)); + } + + + + + cnrtQueueSync(queue); + + // cnrtMemcpy(work_space_h, work_space, sizeof(float)*NB*NB, CNRT_MEM_TRANS_DIR_DEV2HOST); + //print work_space_h + // printf("work_space:\n"); + // for(int i = 0; i < NB; i++) + // { + // for(int j = 0; j < NB; j++) + // { + // printf("%.2f ",work_space_h[i*NB+j]); + // } + // printf("\n"); + // } + + return MLUOP_STATUS_SUCCESS; +} \ No newline at end of file diff --git a/kernels/cholesky/cholesky.h b/kernels/cholesky/cholesky.h new file mode 100644 index 000000000..e6ada4f8b --- /dev/null +++ b/kernels/cholesky/cholesky.h @@ -0,0 +1,50 @@ +#ifndef __CHOLESKY_H +#define __CHOLESKY_H + +#define DEBUG + +#include +#include +#include +#include +#include +#include +#include +// #include +#include "mlu_op.h" +#include "core/gen_case.h" +#include "core/logging.h" +#include "core/runtime/device.h" +#include "core/tensor.h" +#include "core/type.h" +#include "kernels/kernel.h" +#include "kernels/utils/cnnl_helper.h" + + +#define REC_NB (8) +#define POTF_NB ((REC_NB)/4) +#define __CNRT_FUNC_TYPE__ CNRT_FUNC_TYPE_UNION1 +#define TASK_NUM (4) +#define NB (16) +#define CLUSTER_NUM 1 +#define M (TASK_NUM * POTF_NB) //POTF边长 +#define ZERO 0.0 +#define SHARED_MEM_SIZE (((M*POTF_NB/TASK_NUM * 4)+(POTF_NB * POTF_NB))) +#define OFFSET_ROW(A, i, j) A + ((i) * (lda) + (j)) +#define OFFSET_B_ROW(B, i, j) B + ((i) * (ldb) + (j)) + + +mluOpStatus_t mlu_spotrf_rectile(bool trans, bool uplo, int n, int recnb, float* dA, int ldda, int gbstep, mluOpHandle_t handle); +// void mluOpCholesky(bool trans, bool uplo, int n, float* dA, float* dC, int ldda); + +mluOpStatus_t ssyrk(bool upper, bool trans,int n, int k, float* d_a, int ldda, float* d_c, int lddc, mluOpHandle_t handle); + +mluOpStatus_t sgemm(bool 
trans_a, bool trans_b, int m, int n, int k, float alpha, float beta, float* d_a,int lda, float* d_b, int ldb, float* d_c, int ldc, mluOpHandle_t handle); + +//side:true->right +// false->left +mluOpStatus_t strsm(bool upper, bool trans, int m, int n, float* d_a, int ldda, float* d_b, int lddb, mluOpHandle_t handle); + +mluOpStatus_t transpose(int m, float* d_input,float* d_output, mluOpHandle_t handle); + +#endif \ No newline at end of file diff --git a/kernels/cholesky/cholesky_union1.mlu b/kernels/cholesky/cholesky_union1.mlu new file mode 100644 index 000000000..39bc2dd71 --- /dev/null +++ b/kernels/cholesky/cholesky_union1.mlu @@ -0,0 +1,940 @@ +#include "cholesky.h" +__nram__ uint8_t nram_buffer[MAX_NRAM_SIZE]; + +__mlu_func__ +void sgemm_fixwidth_device(int m, int k, + float* A0, const int lda, + float *sC, float *sB) +{ + int id = taskId; + + int span = POTF_NB; + + + __nram__ float rC[M * POTF_NB/TASK_NUM ]; + __nram__ float rA[M * POTF_NB/TASK_NUM ]; + __nram__ float rp[M * POTF_NB/TASK_NUM ]; + + __nram__ float rB[POTF_NB * POTF_NB]; + + + + if(id*span 0; + int span = (remain > POTF_NB||remain <= 0) ? POTF_NB : remain; + + float *rA = (float*)nram_buffer + id * NB * NB * 4; + + float *rB = rA + NB * NB; + + float *rC = rB + NB * NB; + + float* rp = rC + NB * NB; + + int span_b = POTF_NB > m ? m : POTF_NB; + + + + __memset_nram(rC,span_b*span,(float)ZERO); + + if(if_execute) + { + if(k>0) + { + __memcpy(rA,A0+id*POTF_NB*lda,k*sizeof(float),SRAM2NRAM,NB*sizeof(float),lda*sizeof(float),span-1); + } + __memcpy(rp,sC+id*POTF_NB*lda,span_b*sizeof(float),SRAM2NRAM,span_b*sizeof(float),lda*sizeof(float),span-1); + + } + + if(k>0) + { + __memcpy(rB,A0,k*sizeof(float),SRAM2NRAM,NB*sizeof(float),lda*sizeof(float),span_b-1); + + } + + + __sync_cluster(); + + for(int i = 0; i < span; i++) + { + for(int j = 0; j < span_b; j++) + { + for(int h = 0; h < k; h++) + { + rC[i*span_b+j] += rA[i*NB+h] * rB[j*NB+h]; + } + } + } + + __bang_sub(rp,rp,rC,span_b * span); + + __sync_cluster(); + + if(id==0) + { + for(int i = 0; i < span; i++) + { + __memcpy(sC+(i*lda),rp+i*span_b,(i+1)*sizeof(float),NRAM2SRAM); + } + + } + else if(if_execute) + { + __memcpy(sC+(id*POTF_NB*lda),rp,span_b*sizeof(float),NRAM2SRAM,lda*sizeof(float),span_b*sizeof(float),span-1); + } + + + + + + +} + +static __mlu_func__ void spotf2_sminout_anysize_device(int m, float *A, int lda) +{ + float factor; + int id = coreId; + int finish = id * POTF_NB; + int remain = m - finish; + bool if_execute = remain > 0; + int span = remain > POTF_NB ? POTF_NB : remain; + int iter_num = m > POTF_NB ? 
POTF_NB : m; + for(int iter = 0; iter < iter_num; iter++) + { + factor=sqrt(A[iter*lda+iter]); + factor = 1.0/factor; + __sync_cluster(); + for(int i = 0; i < span; i++) + { + if(if_execute) + A[i*lda+iter+id*POTF_NB*lda] *= factor; + + } + __sync_cluster(); + + if(if_execute) + { + for(int i = iter + 1; i < iter_num; i++) + { + for(int j = finish; j < finish + span; j++) + { + if(j < i) + continue; + A[j * lda + i ] -= A[i*lda+iter] * A[j * lda + iter]; + } + } + } + + __sync_cluster(); + + } + +} + +__mlu_func__ void spotf2_smlpout_fixwidth_device(const int m, float *A0, float *A, int lda, const int localstep, const int gbstep) +{ + int id = taskId; + __mlu_shared__ float shared_data[SHARED_MEM_SIZE]; + float* sdata_A = shared_data; + float* sdata_B = shared_data + m *POTF_NB/TASK_NUM * 4; + sgemm_fixwidth_device(m, localstep, A0, lda, sdata_A, sdata_B); + + __sync_cluster(); + + + spotf2_sminout_fixsize_device(m, sdata_A, m); + + __sync_cluster(); + + int span = POTF_NB; + + + if(id==0) + { + for(int i = 0; i < span; i++) + { + __memcpy(A+(i*lda),sdata_A+i*POTF_NB,(i+1)*sizeof(float),SRAM2LDRAM); + } + + } + else if(id*span < m) + { + __memcpy(A+(id*POTF_NB*lda),sdata_A+coreId*POTF_NB*POTF_NB,POTF_NB*sizeof(float),SRAM2LDRAM,lda*sizeof(float),POTF_NB*sizeof(float),span-1); + } + + __sync_cluster(); + + +} + +__mlu_func__ void spotf2_smlpout_anywidth_device(const int m, float *A0, float *A, int lda, const int localstep, const int gbstep) +{ + + sgemm_anywidth_device(m, localstep, A0, lda, A, nullptr); + + spotf2_sminout_anysize_device(m, A, lda); + + __sync_cluster(); + + +} +__mlu_global__ void spotf2_smlpin_fixwidth_kernel(bool trans, int m, float *dA, int lda, int localstep, int gbstep) +{ + + for(int i = 0; i < m; i += POTF_NB) + { + spotf2_smlpout_fixwidth_device(m-i,OFFSET_ROW(dA, localstep + i,0), OFFSET_ROW(dA, localstep + i, localstep + i), lda, localstep+i, gbstep); + } + +} + +__mlu_global__ void spotf2_smlpin_anywidth_kernel(bool trans, int m, float *dA, int lda, int localstep, int gbstep) +{ + int id = taskId; + + __mlu_shared__ float shared_data[NB * NB]; + + if(m%4==0) + { + for(int i = 0; i < m; i += POTF_NB) + { + spotf2_smlpout_fixwidth_device(m-i,OFFSET_ROW(dA, localstep + i,0), OFFSET_ROW(dA, localstep + i, localstep + i), lda, localstep+i, gbstep); + } + } + else + { + + if(id == 0) + { + __memcpy(shared_data,dA,m*sizeof(float),GDRAM2SRAM,NB*sizeof(float),lda*sizeof(float),m-1); + } + __sync_cluster(); + + for(int i = 0; i < m; i += POTF_NB) + { + spotf2_smlpout_anywidth_device(m-i,shared_data+i*NB, shared_data+i*NB+i, NB, localstep+i, gbstep); + } + + __sync_cluster(); + + if(id == 0) + { + __memcpy(dA,shared_data,m*sizeof(float),SRAM2GDRAM,lda*sizeof(float),NB*sizeof(float),m-1); + } + __sync_cluster(); + } + + + + +} + +mluOpStatus_t mlu_spotf2_lpin(bool trans,bool uplo, int n, int ldda, float* dA, int gbstep, cnrtQueue_t queue) +{ + cnrtDim3_t dim; + cnrtFunctionType_t func_type = __CNRT_FUNC_TYPE__; + dim.x = TASK_NUM; + dim.y = 1; + dim.z = 1; + + KERNEL_CHECK( + spotf2_smlpin_anywidth_kernel<<>>(trans, n, dA, ldda, 0,gbstep)); + return MLUOP_STATUS_SUCCESS; +} + + +__mlu_entry__ void mlu_strsm_rectile_kernel( + int m,int n, bool trans, + float *dA, int32_t lda, + float *dB, int32_t ldb) +{ + int id = taskId; + + + int span = n / 4; + int start = id * span; + if(id == 3) + { + span = n - 3 * span; + } + bool if_execute = span > 0; + __mlu_shared__ float sA[8*POTF_NB]; + __nram__ float rB[4*POTF_NB * 8*POTF_NB]; + __nram__ float rC[4*POTF_NB * 8*POTF_NB]; + 
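+    // Per-core NRAM staging: rB accumulates this core's solved rows of X, rC is
+    // scratch for the element-wise products of each dot product, rBp prefetches
+    // the next column of B from LDRAM, and rA mirrors the current row of A that
+    // core 0 broadcasts through the shared buffer sA.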
__nram__ float rBp[4*POTF_NB]; + __nram__ float rA[8*POTF_NB]; + int calc_length = (8 * POTF_NB) > m ? m : (8 * POTF_NB); + __memset_nram(rB,POTF_NB*calc_length,(float)ZERO); + __sramset(sA,calc_length*calc_length,0); + + + float temp_b = 0, factor = 0; + + + if(id == 0) + { + __memcpy_async(sA,dA,sizeof(float),LDRAM2SRAM); + } + if(if_execute) + __memcpy(rBp,OFFSET_B_ROW(dB,start,0),sizeof(float),LDRAM2NRAM,sizeof(float), ldb * sizeof(float), span - 1); + __sync_cluster(); + + + if(trans) + { + __memcpy_async(rA,sA,(1)*sizeof(float),SRAM2NRAM); + if(if_execute) + __memcpy_async(rB,rBp,sizeof(float),NRAM2NRAM,calc_length * sizeof(float), sizeof(float), span - 1); + __sync_cluster(); + if(id == 0) + { + __memcpy_async(sA,OFFSET_ROW(dA,1,0),2*sizeof(float),LDRAM2SRAM); + } + if(if_execute) + __memcpy_async(rBp,OFFSET_B_ROW(dB,start,1),sizeof(float),LDRAM2NRAM,sizeof(float), ldb * sizeof(float), span - 1); + factor = 1.0 / rA[0]; + for(int i = 0; i < span; i++) + { + rB[i*calc_length] *= factor; + } + + __sync_cluster(); + + for(int iter = 1; iter < m - 1; iter++) + { + __memcpy_async(rA,sA,(iter+1)*sizeof(float),SRAM2NRAM); + if(if_execute) + __memcpy_async(rB+iter,rBp,sizeof(float),NRAM2NRAM,calc_length * sizeof(float), sizeof(float), span - 1); + __sync_cluster(); + if(id == 0) + { + __memcpy_async(sA,OFFSET_ROW(dA,iter+1,0),(iter+2)*sizeof(float),LDRAM2SRAM); + } + if(if_execute) + __memcpy_async(rBp,OFFSET_B_ROW(dB,start,iter+1),sizeof(float),LDRAM2NRAM,sizeof(float), ldb * sizeof(float), span - 1); + factor = 1.0 / rA[iter]; + for(int i = 0; i < span; i++) + { + __bang_mul(rC+i*calc_length,rA,rB+i*calc_length,iter); + temp_b = 0; + for(int j = 0; j < iter; j++) + { + temp_b += rC[i*calc_length+j]; + } + temp_b = rB[i*calc_length+iter] - temp_b; + rB[i*calc_length+iter] = temp_b * factor; + } + + __sync_cluster(); + } + + __memcpy_async(rA,sA,(m)*sizeof(float),SRAM2NRAM); + if(if_execute) + __memcpy_async(rB+m-1,rBp,sizeof(float),NRAM2NRAM,calc_length * sizeof(float), sizeof(float), span - 1); + __sync_cluster(); + factor = 1.0 / rA[m-1]; + for(int i = 0; i < span; i++) + { + __bang_mul(rC+i*calc_length,rA,rB+i*calc_length,m-1); + + temp_b = 0; + for(int j = 0; j < m-1; j++) + { + temp_b += rC[i*calc_length+j]; + } + temp_b = rB[i*calc_length+m-1] - temp_b; + + rB[i*calc_length+m-1] = temp_b * factor; + } + __sync_cluster(); + + + if(if_execute) + { + __memcpy(OFFSET_B_ROW(dB,start,0),rB,calc_length*sizeof(float),NRAM2LDRAM,ldb * sizeof(float), calc_length * sizeof(float), span - 1); + } + __sync_cluster(); + + } + +} + + + +mluOpStatus_t strsm_rectile(bool upper, bool trans, int m, int n, float *d_a, int lda, float *d_b, int lddb, cnrtQueue_t queue) +{ + cnrtDim3_t dim; + dim.x = TASK_NUM; + dim.y = 1; + dim.z = 1; + cnrtFunctionType_t func_type = __CNRT_FUNC_TYPE__; + if(!upper && trans) + { + KERNEL_CHECK( + mlu_strsm_rectile_kernel<<>>(m,n,trans,d_a,lda,d_b,lddb)); + } + return MLUOP_STATUS_SUCCESS; +} + +__mlu_global__ +void add_c(float beta, float *d_c, float* src,int ldc, int ldsrc, int m, int n) +{ + + + __mlu_shared__ uint8_t sram_buffer[MAX_SRAM_SIZE]; + if (beta == 0.0f) + { + if(taskId == 0) + { + __memcpy(sram_buffer,src,n*sizeof(float),GDRAM2SRAM,n*sizeof(float),ldsrc*sizeof(float),m-1); + + } + __sync_cluster(); + if(taskId == 0) + { + __memcpy(d_c,sram_buffer,n*sizeof(float),SRAM2LDRAM,ldc*sizeof(float),n*sizeof(float),m-1); + } + __sync_cluster(); + return; + } + + float* a_sram = (float*)sram_buffer + 3* m * n; + + if (taskId == 0) { + 
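+    // Core 0 stages the original C (beta operand) and the matmul result (alpha
+    // operand) into SRAM; all cores then accumulate the two tiles in NRAM below.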
__memcpy(sram_buffer,d_c,n*sizeof(float),GDRAM2SRAM,n*sizeof(float),ldc*sizeof(float),m-1); + __memcpy(a_sram,src,n*m*sizeof(float),GDRAM2SRAM); + } + + __sync_cluster(); + + + int32_t data_num = m*n; + int32_t data_per_core = data_num / taskDim; + int32_t data_last_core = data_per_core + data_num % taskDim; + const float *a_offset = a_sram + taskId * data_per_core; + const float *b_offset = (float*)sram_buffer + taskId * data_per_core; + float *output_offset = (float*)sram_buffer + taskId * data_per_core; + + if (taskId == taskDim - 1) { + data_per_core = data_last_core; + } + + int32_t align_num = NFU_ALIGN_SIZE / sizeof(float); + + int32_t data_nram_num = + MAX_NRAM_SIZE / sizeof(float) / 2 / align_num * align_num; + float *a_nram = (float *)nram_buffer; + float *b_nram = (float *)a_nram + data_nram_num; + int32_t loop_num = data_per_core / data_nram_num; + int32_t rem_nram_num = data_per_core % data_nram_num; + + for (int32_t i = 0; i < loop_num; i++) { + __memcpy(a_nram, a_offset + i * data_nram_num, + data_nram_num * sizeof(float), SRAM2NRAM); + __memcpy(b_nram, b_offset + i * data_nram_num, + data_nram_num * sizeof(float), SRAM2NRAM); + __bang_add(a_nram, a_nram, b_nram, data_nram_num); + __memcpy(output_offset + i * data_nram_num, a_nram, + data_nram_num * sizeof(float), NRAM2SRAM); + } + if (rem_nram_num != 0) { + int32_t rem_align_num = + (rem_nram_num + align_num - 1) / align_num * align_num; + __memcpy(a_nram, a_offset + loop_num * data_nram_num, + rem_nram_num * sizeof(float), SRAM2NRAM); + __memcpy(b_nram, b_offset + loop_num * data_nram_num, + rem_nram_num * sizeof(float), SRAM2NRAM); + __bang_add(a_nram, a_nram, b_nram, rem_align_num); + __memcpy(output_offset + loop_num * data_nram_num, a_nram, + rem_nram_num * sizeof(float), NRAM2SRAM); + } + __sync_cluster(); + + if (taskId == 0) { + __memcpy(d_c,sram_buffer,n*sizeof(float),SRAM2GDRAM,ldc*sizeof(float),n*sizeof(float),m-1); + + } + + __sync_cluster(); + +} + + +mluOpStatus_t sgemm(bool trans_a, bool trans_b, int m, int n, int k, float alpha, float beta, float* d_a,int lda, float* d_b, int ldb, float* d_c, int ldc, mluOpHandle_t handle) +{ + if(k==0) + return MLUOP_STATUS_SUCCESS; + int matmul_is_transA = trans_a; + int matmul_is_transB = trans_b; + + int matmul_requested_algo = 1; + int matmul_recieved_algo = 0; + size_t tempSize_matmulExtra = 0; + int matmul_computetype = MLUOP_DTYPE_FLOAT; + float *workspace; + int matmul_use_beta = beta == 0.0f ? 
0 : 1; + + cnrtQueue_t queue; + mluOpGetQueue(handle,&queue); + + + + + + + mluOpTensorDescriptor_t matmul_a_desc, matmul_b_desc, matmul_c_desc; + + cnnlMatMulDescriptor_t matmul_desc; + cnnlMatMulHeuristicResult_t heuristic_result; + cnnlMatMulAlgo_t matmul_algo; + + std::string api_name = "Cholesky"; + + CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&matmul_a_desc)); + CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&matmul_b_desc));; + CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&matmul_c_desc)); + + CALL_CNNL(cnnlMatMulDescCreate(&matmul_desc)); + CALL_CNNL(cnnlMatMulAlgoCreate(&matmul_algo)); + CALL_CNNL(cnnlCreateMatMulHeuristicResult(&heuristic_result)); + + CALL_CNNL(cnnlSetMatMulDescAttr(matmul_desc, CNNL_MATMUL_DESC_TRANSA, + &matmul_is_transA, sizeof(int32_t))); + CALL_CNNL(cnnlSetMatMulDescAttr(matmul_desc, CNNL_MATMUL_DESC_TRANSB, + &matmul_is_transB, sizeof(int32_t))); + CALL_CNNL(cnnlSetMatMulDescAttr(matmul_desc, CNNL_MATMUL_DESC_COMPUTE_TYPE, + &matmul_computetype, sizeof(int32_t))); + + CALL_CNNL(cnnlSetMatMulDescAttr(matmul_desc, CNNL_MATMUL_USE_BETA, + &matmul_use_beta, sizeof(int32_t))); + + CALL_CNNL(cnnlSetMatMulDescAttr(matmul_desc, CNNL_MATMUL_USE_STRIDE, + &lda, sizeof(int32_t))); + + int32_t matmul_a_shape[2] = {m, lda}; + int32_t matmul_b_shape[2] = {n, ldb}; + int32_t matmul_c_shape[2] = {m, n}; + + CHECK_RETURN(api_name, mluOpSetTensorDescriptor( + matmul_a_desc, MLUOP_LAYOUT_ARRAY, + MLUOP_DTYPE_FLOAT, 2, matmul_a_shape)); + CHECK_RETURN(api_name, mluOpSetTensorDescriptor( + matmul_b_desc, MLUOP_LAYOUT_ARRAY, + MLUOP_DTYPE_FLOAT, 2, matmul_b_shape)); + CHECK_RETURN(api_name, mluOpSetTensorDescriptor( + matmul_c_desc, MLUOP_LAYOUT_ARRAY, + MLUOP_DTYPE_FLOAT, 2, matmul_c_shape)); + + DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); + DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_a_desc, cnnl_a_desc); + DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_b_desc, cnnl_b_desc); + DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_c_desc, cnnl_c_desc); + DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_c_desc, cnnl_d_desc); + + + + CALL_CNNL(cnnlGetMatMulAlgoHeuristic( + cnnl_handle, matmul_desc, cnnl_a_desc, cnnl_b_desc, cnnl_c_desc, + cnnl_d_desc, nullptr, matmul_requested_algo, &heuristic_result, + &matmul_recieved_algo)); + + CALL_CNNL(cnnlGetMatMulHeuristicResult(heuristic_result, matmul_algo, + &tempSize_matmulExtra)); + + // CNRT_CHECK(cnrtMalloc((void **)&workspace, tempSize_matmulExtra)); + CNRT_CHECK(cnrtMalloc((void **)&workspace, m*n*sizeof(float))); + + cnnlStrideBatchMatMul(cnnl_handle, trans_a, trans_b, m, n, k, 1, alpha, cnnl_a_desc, d_a, lda, m*lda, cnnl_b_desc, d_b, ldb, ldb*n, 0.0f, cnnl_c_desc, workspace, n, m*n); + + if ( beta == 1.0f || beta == 0.0f) + { + cnrtDim3_t dim; + cnrtFunctionType_t func_type = CNRT_FUNC_TYPE_UNION1; + dim.x = 4; + dim.y = 1; + dim.z = 1; + KERNEL_CHECK(add_c<<>>(beta,d_c,workspace,ldc,n,m,n)); + + } + + return MLUOP_STATUS_SUCCESS; +} + +__mlu_global__ +void inverse_kernel(float *d_input, int ld_input, float* d_output, int ld_output, int m) +{ + + __mlu_shared__ uint8_t sram_buffer[MAX_SRAM_SIZE]; + + + if (taskId == 0) { + __memcpy(sram_buffer,d_input,m*sizeof(float),GDRAM2SRAM,m*sizeof(float),ld_input*sizeof(float),m-1); + } + __sync_cluster(); + + int id = taskId; + int span = m/taskDim; + int start = id * span; + if (id == 3) + { + span = m - 3 * span; + } + float* nram_offset = (float*)nram_buffer + id * 3 * m * m; + float* nram_src1 = nram_offset; + float* nram_src2 = nram_src1 
+ m * m; + float* mul_result = nram_src2 + m; + float* nram_dst = nram_src2 + m * m; + float* diag_start = ((float*)sram_buffer) + m * start + start; + int height = m - start; + + __memset_nram(nram_offset, 3 * m * m, (float)ZERO); + + float result = 0.0; + for(int i = 0; i < span; i++) + { + int off = i * m + i; + result = diag_start[off]; + result = 1.0 / result; + nram_src1[i*height+i] = result; + nram_dst[i*span + i] = result; + diag_start[off] = result; + + } + __sync_cluster(); + + + for(int i = 1; i < height; i++) + { + __memcpy(nram_src2,diag_start+i*m,i*sizeof(float),SRAM2NRAM); + int num = std::min(i, span); + float diag_element = diag_start[i*m+i]; + for(int j = 0; j < num; j++) + { + float temp = 0.0; + __bang_mul(mul_result,nram_src2,nram_src1+j*height,i); + for(int k = 0; k< i; k++) + { + temp += mul_result[k]; + } + temp = temp * -1.0 * diag_element; + nram_dst[i*span+j] = temp; + nram_src1[j*height+i] = temp; + } + __sync(); + + } + + __sync_cluster(); + + if (span > 0) + __memcpy(diag_start,nram_dst,span*sizeof(float),NRAM2SRAM,m*sizeof(float),span*sizeof(float),height-1); + + __sync_cluster(); + + if (taskId == 0) { + // __memcpy(d_input,sram_buffer,m*m*sizeof(float),SRAM2GDRAM); + __memcpy(d_output,sram_buffer,m*sizeof(float),SRAM2GDRAM,ld_output*sizeof(float), m*sizeof(float),m-1); + } + + +} + +__mlu_global__ void set_zero(bool upper, int m, float* d_c, int lddc) +{ + int id = taskId; + int span = m/taskDim; + int pre = id * span; + float* start_c = d_c + pre * lddc + pre; + float* temp_c = start_c; + if (id == 3) + { + span = m - 3 * span; + + } + for(int i = 0; i < span - 1; i++) + { + temp_c = start_c + i * lddc + i; + int num = m - pre - i; + __ldramset(temp_c+1, num - 1, 0); + } + if (id != 3) + { + temp_c = start_c + (span - 1) * lddc + span - 1; + int num = m - pre - span + 1; + __ldramset(temp_c+1, num - 1, 0); + + } +} + + + +mluOpStatus_t strsm(bool upper, bool trans, int m, int n, float* d_a, int lda, float* d_b, int ldb, mluOpHandle_t handle) +{ + if(n==0) + return MLUOP_STATUS_SUCCESS; + mluOpTensorDescriptor_t matmul_a_desc, matmul_b_desc, info_desc; + std::string api_name = "Cholesky"; + + cnrtQueue_t queue; + mluOpGetQueue(handle,&queue); + + int32_t *info; + CNRT_CHECK(cnrtMalloc((void **)&info, sizeof(int32_t))); + + CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&matmul_a_desc)); + CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&matmul_b_desc)); + CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&info_desc)); + int32_t matmul_a_shape[2] = {m, m}; + int32_t matmul_b_shape[2] = {n, ldb}; + int32_t info_shape[1] = {1}; + + CHECK_RETURN(api_name, mluOpSetTensorDescriptor( + matmul_a_desc, MLUOP_LAYOUT_ARRAY, + MLUOP_DTYPE_FLOAT, 2, matmul_a_shape)); + CHECK_RETURN(api_name, mluOpSetTensorDescriptor( + matmul_b_desc, MLUOP_LAYOUT_ARRAY, + MLUOP_DTYPE_FLOAT, 2, matmul_b_shape)); + CHECK_RETURN(api_name, mluOpSetTensorDescriptor( + info_desc, MLUOP_LAYOUT_ARRAY, + MLUOP_DTYPE_INT32, 1, info_shape)); + + DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); + DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_a_desc, cnnl_a_desc); + DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_b_desc, cnnl_b_desc); + DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(info_desc, cnnl_info_desc); + + float* workspace; + CNRT_CHECK(cnrtMalloc((void **)&workspace, m*m*sizeof(float))); + CNRT_CHECK(cnrtMemset(workspace, 0.0, m*m*sizeof(float))); + + + + int m1 = m/2; + int m2 = m - m1; + + float* workspace1 = workspace; + float* workspace2 = workspace1 + m1*m+m1; + 
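+  // Block inversion of the triangular A: invert the two diagonal blocks on the
+  // device, form the off-diagonal block -inv(A22)*A21*inv(A11) with two GEMMs,
+  // then apply X = B * inv(A)^T through a single batched matmul below.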
+ cnrtDim3_t dim; + cnrtFunctionType_t func_type = CNRT_FUNC_TYPE_UNION1; + dim.x = 4; + dim.y = 1; + dim.z = 1; + KERNEL_CHECK(inverse_kernel<<>>(d_a,lda,workspace1,m,m1)); + KERNEL_CHECK(inverse_kernel<<>>(d_a+m1*lda+m1,lda,workspace2,m,m2)); + + sgemm(false,false,m2,m1,m1,1.0f,0.0f,d_a+m1*lda,lda,workspace1,m,workspace1+m1*m,m,handle); + sgemm(false,false,m2,m2,m1,-1.0f,0.0f,workspace2,m,workspace1+m1*m,m,workspace1+m1*m,m,handle); + + + + cnnlStrideBatchMatMul(cnnl_handle, false, true, n,m, m, 1, 1.0, cnnl_b_desc, d_b, ldb, n*ldb, cnnl_a_desc, workspace, m, m*m, 0.0f, cnnl_b_desc, d_b, ldb, n*ldb); + + + + return MLUOP_STATUS_SUCCESS; +} + + + +mluOpStatus_t ssyrk(bool upper, bool trans,int n, int k, float* d_a, int ldda, float* d_c, int lddc, mluOpHandle_t handle) +{ + if(k==0) + return MLUOP_STATUS_SUCCESS; + + sgemm(false,true,n,n,k,-1.0f,1.0f,d_a,ldda,d_a,ldda,d_c,lddc,handle); + cnrtQueue_t queue; + mluOpGetQueue(handle,&queue); + cnrtDim3_t dim; + cnrtFunctionType_t func_type = CNRT_FUNC_TYPE_UNION1; + dim.x = 4; + dim.y = 1; + dim.z = 1; + KERNEL_CHECK(set_zero<<>>(upper, n, d_c,lddc)); + + + return MLUOP_STATUS_SUCCESS; +} + +mluOpStatus_t mlu_spotrf_rectile(bool trans, bool uplo, int n, int recnb, float* d_A, int lda, int gbstep, mluOpHandle_t handle) +{ + cnrtQueue_t queue; + mluOpGetQueue(handle,&queue); + if(n==0) + return MLUOP_STATUS_SUCCESS; + + if(n <=recnb) + { + // printf("n:%d, recnb:%d, mlu_spotf2_lpin\n",n,recnb); + mlu_spotf2_lpin(trans, uplo,n,lda,d_A,gbstep,queue); + } + else + { + int n1 = n/2; + int n2 = n-n1; + mlu_spotrf_rectile(trans,uplo,n1,recnb,OFFSET_ROW(d_A,0,0),lda,gbstep, handle); + strsm_rectile(uplo,trans,n1,n2,OFFSET_ROW(d_A,0,0),lda,OFFSET_ROW(d_A,n1,0),lda,queue); + ssyrk(uplo,trans,n2,n1,d_A+n1*lda,lda,OFFSET_ROW(d_A,n1,n1),lda,handle); + mlu_spotrf_rectile(trans,uplo,n2,recnb,OFFSET_ROW(d_A,n1,n1),lda,gbstep+n1,handle); + + + + } + return MLUOP_STATUS_SUCCESS; +} + +mluOpStatus_t transpose(int m, float* d_input,float* d_output, mluOpHandle_t handle) +{ + if(m==0) + return MLUOP_STATUS_SUCCESS; + cnrtQueue_t queue; + mluOpGetQueue(handle,&queue); + + mluOpTensorDescriptor_t trans_input_desc, trans_output_desc; + std::string api_name = "Cholesky"; + const int input_dim = 2; + + CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&trans_input_desc)); + CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&trans_output_desc)); + + int32_t transpose_input_shape[2] = {m, m}; + int32_t transpose_output_shape[2] = {m, m}; + + CHECK_RETURN(api_name, mluOpSetTensorDescriptor( + trans_input_desc, MLUOP_LAYOUT_ARRAY, + MLUOP_DTYPE_FLOAT, 2, transpose_input_shape)); + + CHECK_RETURN(api_name, mluOpSetTensorDescriptor( + trans_output_desc, MLUOP_LAYOUT_ARRAY, + MLUOP_DTYPE_FLOAT, 2, transpose_output_shape)); + + int permute[2] = {1, 0}; + + DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); + DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(trans_input_desc, cnnl_in_desc); + DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(trans_output_desc, cnnl_out_desc); + + cnnlTransposeDescriptor_t cnnl_trans_desc = NULL; + + CALL_CNNL(cnnlCreateTransposeDescriptor(&cnnl_trans_desc)); + + CALL_CNNL(cnnlSetTransposeDescriptor(cnnl_trans_desc, input_dim, permute)); + + size_t *size = NULL; + size = (size_t*)malloc(sizeof(size_t)); + + + CALL_CNNL(cnnlGetTransposeWorkspaceSize(cnnl_handle, cnnl_in_desc, cnnl_trans_desc, size)); + + float *workspace = NULL; + + CALL_CNNL(cnnlTranspose_v2(cnnl_handle, cnnl_trans_desc, cnnl_in_desc, + d_input, cnnl_out_desc, d_output, + workspace, 
*size)); + + return MLUOP_STATUS_SUCCESS; + +} + diff --git a/mlu_op.h b/mlu_op.h index 91c4d9887..e75c6d715 100644 --- a/mlu_op.h +++ b/mlu_op.h @@ -14361,6 +14361,60 @@ mluOpExecFFT(mluOpHandle_t handle, */ mluOpStatus_t MLUOP_WIN_API mluOpDestroyFFTPlan(mluOpFFTPlan_t fft_plan); + +/*! + * @brief Computes the Cholesky factorization of a Hermitian positive-definite matrix. + * + * @param[in] handle + * Handle to a Cambricon MLUOP context that is used to manage MLU devices and queues in the + * deformable convolution operation. For detailed information, see ::mluOpHandle_t. + * + * @param[in] input_desc + * The descriptor of the input matrix to factorise, it is an n×n Hermitian matrix, + * only the lower or upper part is meaningful. + * + * @param[in] d_input + * Pointer to the MLU memory that stores the input matrix. + * + * @param[in] output_desc + * The descriptor of the result matrix, it is an n×n lower triangular matrix or an upper triangular matrix. + * + * @param[out] d_output + * Pointer to the MLU memory that stores the result matrix. + * + * @param[in] upper + * upper indicates which part of the matrix is used. + * + * @par Return + * - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_EXECUTION_FAILED + * + * @par Data Type + * - float32 + * + * @par Data Layout + * - None. + * + * @par Scale Limitation + * - None. + * + * @par API Dependency + * - None. + * + * @par Note + * - None. + * + * @par Example. + * - None. + * + * @par Reference. + * - None. + */ +mluOpStatus_t MLUOP_WIN_API +mluOpCholesky(mluOpHandle_t handle, + const mluOpTensorDescriptor_t input_desc, + float* d_input, + const mluOpTensorDescriptor_t output_desc, + float* d_output,bool upper); #if defined(__cplusplus) } #endif diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/cholesky.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/cholesky.cpp new file mode 100644 index 000000000..d99c2922d --- /dev/null +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/cholesky.cpp @@ -0,0 +1,344 @@ +/************************************************************************* + * Copyright (C) [2022] by Cambricon, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *************************************************************************/ +#include +#include "cholesky.h" + +namespace mluoptest { + + +void CholeskyExecutor::paramCheck() { + if (parser_->getInputNum() != 1) { + LOG(ERROR) << "cholesky input number is wrong. 
"; + } + if (parser_->getOutputNum() != 1) { + LOG(ERROR) << "cholesky output number is wrong. "; + } + flag_quant_mode_ = NO_QUANT; + + + +} + +void set_matrix_zero(float*A, bool upper, bool trans_, int n_, int ldda_) +{ + if(trans_) + { + for (int i = 0; i < n_; i++) + { + for (int j = 0; j < ldda_; j++) + { + + if(upper) + { + if(i > j) + A[j + i * ldda_] = 0.0; + } + else + { + if(i < j) + A[j + i * ldda_] = 0.0; + } + } + } + } + else + { + for (int i = 0; i < n_; i++) + { + for (int j = 0; j < ldda_; j++) + { + if((i > j && ~upper)||(i < j && upper)) + A[j + i * ldda_] = 0.0; + } + } + } + +} + +void trans_mul(float*A, float*C, int lda,bool upper_, bool trans_, int n_, int ldda_) +{ + if(trans_) + { + for(int i = 0; i i) + continue; + else + { + A[i+j*lda] += (C[k*lda+i]*C[k*lda+j]); + } + } + } + } + } + } + else + { + for(int i = 0; i getProtoNode()->input(0); + auto input_shape = input_tensor.shape(); + upper_ = parser_->getProtoNode()->cholesky_param().upper(); + int dim_size = input_shape.dims_size(); + if(dim_size ==2) + { + n_ = input_shape.dims(0); + int dim = input_desc_->dim; + stride_ = (input_desc_->strides)[dim-1]; + ldda_ = input_desc_->dims[1]; + + printf("n:%d,lda:%d,stride:%d,upper:%d,trans:%d\n",n_,ldda_,stride_,upper_,trans_); + int size = input_desc_->dims[0]; + + printf("size:%d, dim:%d, \n",size,dim); + printf("strides:\n"); + for(int i = 0; i < dim; i++) + { + printf("%ld ",(input_desc_->strides)[i]); + } + printf("\n"); + printf("data vector length : %ld\n",data_vector_.size()); + } + else if(dim_size == 3) + { + batch_size_ = input_shape.dims(0); + n_ = input_shape.dims(1); + int dim = input_desc_->dim; + stride_ = (input_desc_->strides)[dim-1]; + ldda_ = input_desc_->dims[2]; + printf("batch_size:%d,n:%d,lda:%d,stride:%d,upper:%d,trans:%d\n",batch_size_,n_,ldda_,stride_,upper_,trans_); + + int size = input_desc_->dims[1]; + + printf("size:%d, dim:%d, \n",size,dim); + printf("strides:\n"); + for(int i = 0; i < dim; i++) + { + printf("%ld ",(input_desc_->strides)[i]); + } + printf("\n"); + printf("data vector length : %ld\n",data_vector_.size()); + } + + std::memcpy(dev_c,dev_a,sizeof(float)*n_*ldda_); + set_matrix_zero((float*)dev_c,upper_,trans_,n_,ldda_); + trans_mul(dev_a,dev_c,ldda_,upper_,trans_,n_,ldda_); + printf("matrix A:\n"); + if(dim_size == 3) + { + for(int i = 1; i < batch_size_;i++) + { + std::memcpy(dev_a+i*n_*ldda_,dev_a,sizeof(float)*n_*ldda_); + std::memcpy(dev_c+i*n_*ldda_,dev_c,sizeof(float)*n_*ldda_); + } + } + + GTEST_CHECK(CNRT_RET_SUCCESS == + cnrtMemcpy(dev_d, dev_a, sizeof(float)*n_*ldda_*batch_size_, CNRT_MEM_TRANS_DIR_HOST2DEV)); + float* cpu_a = cpu_fp32_input_[0]; + std::memcpy(cpu_a,dev_a,sizeof(float)*n_*ldda_); + printf("end prepare compute.\n"); + +} + +void CholeskyExecutor::compute() { + +// prepareComputeParam(); + + VLOG(4) <<" CholeskyExecutor compute "; + auto input_desc_ = tensor_desc_[0].tensor; + auto output_desc_ = tensor_desc_[1].tensor; + auto h_input = (float*)(data_vector_[0].host_ptr); + auto h_output = (float*)(data_vector_[1].host_ptr); + auto d_intput = (float*)(data_vector_[0].device_ptr); + auto d_output = (float*)(data_vector_[1].device_ptr); + GTEST_CHECK(CNRT_RET_SUCCESS == + cnrtMemcpy(h_output, d_intput, sizeof(float)*n_*ldda_*batch_size_, CNRT_MEM_TRANS_DIR_DEV2HOST)); + + interface_timer_.start(); + MLUOP_CHECK(mluOpCholesky(handle_,input_desc_,d_intput, output_desc_, d_output, upper_)); + interface_timer_.stop(); + GTEST_CHECK(CNRT_RET_SUCCESS == + cnrtMemcpy(h_output, d_output, 
sizeof(float)*n_*ldda_, CNRT_MEM_TRANS_DIR_DEV2HOST)); + + printf("mlu after cholesky result:\n"); + + return; +} + +void CholeskyExecutor::cpuCompute() { + + float* cpu_a = cpu_fp32_input_[0]; + float* cpu_c = cpu_fp32_output_[0]; + + if(n_ > 2000) + { + auto dev_c = (float*)(data_vector_[1].host_ptr); + std::memcpy(cpu_c,dev_c,sizeof(float)*n_*ldda_*batch_size_); + return; + } + std::memcpy(cpu_c,cpu_a,sizeof(float)*n_*ldda_); + if(trans_) + { + for(int i = 0; i < n_; i++) + { + float dia = cpu_c[i+i*ldda_]; + float dia_root = sqrt(dia); + cpu_c[i+i*ldda_] = sqrt(dia); + if(upper_==false) + { + for(int j = i+1;j1) + { + for(int i = 1; i < batch_size_;i++) + { + std::memcpy(cpu_c+i*n_*ldda_,cpu_c,sizeof(float)*n_*ldda_); + } + } + + return; +} + + +int64_t CholeskyExecutor::getTheoryOps() { + int64_t theory_ops = batch_size_*n_*n_*n_/2; + VLOG(4) << "getTheoryOps: " << theory_ops << " ops"; + return theory_ops; +} +} // namespace mluoptest diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/cholesky.h b/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/cholesky.h new file mode 100644 index 000000000..730740b1f --- /dev/null +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/cholesky.h @@ -0,0 +1,48 @@ +/************************************************************************* + * Copyright (C) [2022] by Cambricon, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *************************************************************************/ +#ifndef TEST_MLU_OP_GTEST_SRC_ZOO_CHOLESKY__ +#define TEST_MLU_OP_GTEST_SRC_ZOO_CHOLESKY_ +#include "executor.h" +namespace mluoptest { +class CholeskyExecutor : public Executor { + private: + size_t size_workspace_ = 0; + int stride_ = 0; + bool trans_ = true; + bool upper_ = false; + int ldda_ = 0; + int n_ = 0; + int batch_size_ = 1; + + public: + CholeskyExecutor() {} + ~CholeskyExecutor() {} void paramCheck(); + void compute(); + void cpuCompute(); + void prepareComputeParam(); +//void workspaceMalloc(); +// void workspaceFree(); + int64_t getTheoryOps() override; +}; +} // namespace mluoptest +#endif // TEST_MLU_OP_GTEST_SRC_ZOO_CHOLESKY_ diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/testcase/case_0.prototxt b/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/testcase/case_0.prototxt new file mode 100644 index 000000000..565179d15 --- /dev/null +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/testcase/case_0.prototxt @@ -0,0 +1,37 @@ +op_name: "cholesky" +input { + id: "input" + shape: { + dims: 2 + dims: 8 + dims: 8 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT + random_data: { + seed: 25 + upper_bound: 5.0 + lower_bound: 5.0 + distribution: UNIFORM + } +} +output { + id: "output" + shape: { + dims: 2 + dims: 8 + dims: 8 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT +} +cholesky_param{ + upper: false +} +test_param: { + error_func: DIFF1 + error_func: DIFF2 + error_threshold: 0.03 + error_threshold: 0.03 + baseline_device: CPU +} From 123d886b6ec7a7de26c55a7a823a45c9cf1d3e20 Mon Sep 17 00:00:00 2001 From: dglr Date: Wed, 1 May 2024 03:12:32 +0800 Subject: [PATCH 02/27] [WIP]add mluop cholesky --- kernels/cholesky/cholesky_union1.mlu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernels/cholesky/cholesky_union1.mlu b/kernels/cholesky/cholesky_union1.mlu index 39bc2dd71..3fe714804 100644 --- a/kernels/cholesky/cholesky_union1.mlu +++ b/kernels/cholesky/cholesky_union1.mlu @@ -1,6 +1,6 @@ #include "cholesky.h" __nram__ uint8_t nram_buffer[MAX_NRAM_SIZE]; - + __mlu_func__ void sgemm_fixwidth_device(int m, int k, float* A0, const int lda, From 09fa51cc35ece48f74b36eeb3d530715201a22b7 Mon Sep 17 00:00:00 2001 From: dglr Date: Fri, 24 May 2024 03:55:55 +0800 Subject: [PATCH 03/27] add cholesky doc --- docs/design_docs/cholesky/cholesky.md | 208 +++++++++++++++++++++++++ docs/design_docs/cholesky/divide.png | Bin 0 -> 16033 bytes docs/design_docs/cholesky/gemm.png | Bin 0 -> 12780 bytes docs/design_docs/cholesky/potrf.png | Bin 0 -> 1983 bytes docs/design_docs/cholesky/recur_p1.png | Bin 0 -> 4805 bytes docs/design_docs/cholesky/recur_p2.png | Bin 0 -> 5666 bytes docs/design_docs/cholesky/syrk.png | Bin 0 -> 5088 bytes docs/design_docs/cholesky/timeline.png | Bin 0 -> 43155 bytes docs/design_docs/cholesky/trsm.png | Bin 0 -> 9343 bytes 9 files changed, 208 insertions(+) create mode 100644 docs/design_docs/cholesky/cholesky.md create mode 100644 docs/design_docs/cholesky/divide.png create mode 100644 docs/design_docs/cholesky/gemm.png create mode 100644 docs/design_docs/cholesky/potrf.png create mode 100644 docs/design_docs/cholesky/recur_p1.png create mode 100644 docs/design_docs/cholesky/recur_p2.png create mode 100644 docs/design_docs/cholesky/syrk.png create mode 100644 docs/design_docs/cholesky/timeline.png create mode 100644 docs/design_docs/cholesky/trsm.png diff --git a/docs/design_docs/cholesky/cholesky.md b/docs/design_docs/cholesky/cholesky.md new file mode 100644 index 
000000000..de436f129
--- /dev/null
+++ b/docs/design_docs/cholesky/cholesky.md
@@ -0,0 +1,208 @@
+# MLU Cholesky Factorization Design
+
+## 1 Introduction to the Cholesky Factorization
+
+The Cholesky factorization is one of the most important algorithms in scientific and numerical computing. It factors a Hermitian matrix into the product of a lower triangular matrix and its conjugate transpose, which makes subsequent algebraic operations considerably cheaper.
+
+### 1.1 Hermitian matrices
+
+A Hermitian (self-adjoint) matrix is a square matrix that is conjugate-symmetric: its diagonal entries are real, and the entry in row i, column j is the complex conjugate of the entry in row j, column i. For example:
+$$
+\begin{bmatrix}
+3 & 2+i \\
+2-i & 1 \\
+\end{bmatrix}
+$$
+A matrix $A$ can be Cholesky-factorized if it is Hermitian. If it is also positive definite (i.e. $x^TAx>0$ for every non-zero real vector $x$), the Cholesky factorization is unique; otherwise it is not.
+
+### 1.2 Cholesky factorization
+
+The Cholesky factorization of a positive-definite Hermitian matrix $A$ finds a matrix $L$ such that
+$$
+A=LL^*
+$$
+where $L$ is a lower triangular matrix with positive real diagonal entries and $L^*$ is its conjugate transpose, an upper triangular matrix. When $A$ is a real matrix, the factorization can be rewritten as
+$$
+A=LL^T
+$$
+For brevity, all matrices $A$ below are assumed to be real.
+
+For an $n\times n$ real matrix $A$, the Cholesky factorization can be written as the following expansion:
+$$
+\begin{align*}
+\begin{bmatrix}
+a_{11} & a_{12} & a_{13} & a_{14} \\
+a_{21} & a_{22} & a_{23} & a_{24} \\
+a_{31} & a_{32} & a_{33} & a_{34} \\
+a_{41} & a_{42} & a_{43} & a_{44} \\
+\end{bmatrix}
+&=
+\begin{bmatrix}
+l_{11} & 0 & 0 & 0 \\
+l_{21} & l_{22} & 0 & 0 \\
+l_{31} & l_{32} & l_{33} & 0 \\
+l_{41} & l_{42} & l_{43} & l_{44} \\
+\end{bmatrix}
+\begin{bmatrix}
+l_{11} & l_{21} & l_{31} & l_{41} \\
+0 & l_{22} & l_{32} & l_{42} \\
+0 & 0 & l_{33} & l_{43} \\
+0 & 0 & 0 & l_{44} \\
+\end{bmatrix} \\
+&=
+\begin{bmatrix}
+l_{11}^2 & l_{11}l_{21} & l_{11}l_{31} & l_{11}l_{41} \\
+l_{11}l_{21} & l_{21}^2 + l_{22}^2 & l_{21}l_{31} + l_{22}l_{32} & l_{21}l_{41} + l_{22}l_{42} \\
+l_{11}l_{31} & l_{21}l_{31} + l_{22}l_{32} & l_{31}^2 + l_{32}^2 + l_{33}^2 & l_{31}l_{41} + l_{32}l_{42} + l_{33}l_{43} \\
+l_{11}l_{41} & l_{21}l_{41} + l_{22}l_{42} & l_{31}l_{41} + l_{32}l_{42} + l_{33}l_{43} & l_{41}^2 + l_{42}^2 + l_{43}^2 + l_{44}^2 \\
+\end{bmatrix}
+\end{align*}
+$$
+
+From this expansion, every $a_{i,j}$ equals a polynomial in $l_{i,j}$ and other entries of $L$; for example $a_{32}=l_{21}l_{31}+l_{32}l_{22}$. Exactly one term of that polynomial contains $l_{i,j}$ (for $a_{32}$ it is the term $l_{22}l_{32}$), and the other factor of that term is always a diagonal entry. Therefore $l_{i,j}$ can be computed by subtracting from $a_{i,j}$ all terms that do not contain $l_{i,j}$ and then dividing by the diagonal entry, which yields every $l_{i,j}$ in turn.
+
+## 2 Implementing the Cholesky Factorization
+
+The input matrix is partitioned into blocks, and the factorization is computed with the following flow:
+
+![image](timeline.png)
+Figure 1: Cholesky factorization timeline
+
+In the figure, the two left column blocks of $L$ are assumed to be finished already (the yellow off-diagonal parts and the red diagonal part); the flow shows how the middle column block (the blue and orange parts) is computed. The full Cholesky factorization simply repeats this flow over all column blocks of the partitioned matrix.
+
+SYRK (HERK), GEMM and TRSM are standard BLAS operations; POTRF is the kernel that resolves the internal dependencies of a diagonal block (the block containing diagonal entries of the full matrix). They are introduced below in computation order.
+
+### 2.1 SYRK (HERK)
+
+SYRK is a standard BLAS operation (HERK for complex data), defined as:
+$$
+C=\alpha AA^T+\beta C
+$$
+where $C$ is an $n\times n$ square matrix, $A$ is an $n\times m$ matrix, and $\alpha$ and $\beta$ are scalars.
+
+SYRK is used here to resolve the external dependencies of the orange block: $C$ in the formula is the orange diagonal block (the block containing diagonal entries of the full matrix), $A$ is all the yellow blocks to its left, and $\alpha$, $\beta$ are -1 and 1 respectively.
+
+![image](syrk.png)
+Figure 2: SYRK step
+
+### 2.2 GEMM
+
+GEMM is a standard BLAS operation, defined as:
+$$
+C=\alpha AB+\beta C
+$$
+where $C$, $A$ and $B$ are $m\times n$, $m\times k$ and $k\times n$ matrices respectively, and $\alpha$ and $\beta$ are scalars.
+
+GEMM resolves the external dependencies of the blue off-diagonal blocks: $C$ in the formula is the blue block, while $A$ and $B$ are the yellow blocks to the left of the orange block and to the left of the blue block respectively. $\alpha$ and $\beta$ are -1 and 1.
+
+![image](gemm.png)
+Figure 3: GEMM step
+
+### 2.3 TRSM
+
+TRSM is a standard BLAS routine, defined as:
+$$
+XA=\alpha B
+$$
+Given a lower triangular matrix $A$ and a matrix $B$, TRSM solves for $X$; $A$ is an $n\times n$ square matrix, and $X$ and $B$ are $m\times n$ matrices.
+
+After SYRK, the diagonal block still has to go through POTRF; assuming that has already been done, TRSM finishes the remaining work on the blue blocks, which are complete once TRSM returns. In the formula, $A$ is the red block, $X$ and $B$ are both the blue block, and the result overwrites the original matrix.
+
+![image](trsm.png)
+Figure 4: TRSM step
+
+### 2.4 POTRF
+
+The name POTRF comes from the LAPACK Cholesky factorization routine. Its purpose is to resolve all dependencies of the orange diagonal block; when POTRF finishes, every element of the diagonal block is final.
+
+A typical block size handled by POTRF is 512, which is still fairly large, so the block is split further into four parts:
+
+![image](potrf.png)
+Figure 5: POTRF partitioning
+
+Because the input is a diagonal block, the upper-right part is ignored; the remaining three parts are called P1, P2 and P3.
+
+P1 has exactly the same structure as the POTRF input (the full orange block), so POTRF is simply called recursively on it; the recursion stops and switches to direct computation once P1 becomes smaller than a preset size, as described later.
+
+P2 is handled with TRSM, used in the same way as above.
+
+![image](recur_p1.png)
+Figure 6: Recursive computation of P1 inside POTRF
+
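+The recursion over P1, P2 and P3 (P3 is described in the next paragraph) is what the `mlu_spotrf_rectile` routine added by this patch implements. The sketch below is a condensed view of that routine from `kernels/cholesky/cholesky_union1.mlu`, with return-status checking omitted and comments added here for illustration:
+
+```c++
+// Recursive blocked POTRF on an n x n diagonal tile.
+mluOpStatus_t mlu_spotrf_rectile(bool trans, bool uplo, int n, int recnb,
+                                 float *d_A, int lda, int gbstep,
+                                 mluOpHandle_t handle) {
+  cnrtQueue_t queue;
+  mluOpGetQueue(handle, &queue);
+  if (n == 0) return MLUOP_STATUS_SUCCESS;
+  if (n <= recnb) {
+    // Base case: factor the whole tile in place with the panel kernel.
+    mlu_spotf2_lpin(trans, uplo, n, lda, d_A, gbstep, queue);
+  } else {
+    int n1 = n / 2, n2 = n - n1;
+    // P1: recursively factor the leading n1 x n1 block.
+    mlu_spotrf_rectile(trans, uplo, n1, recnb, OFFSET_ROW(d_A, 0, 0), lda, gbstep, handle);
+    // P2: triangular solve of the n2 x n1 panel against the factored P1 block.
+    strsm_rectile(uplo, trans, n1, n2, OFFSET_ROW(d_A, 0, 0), lda,
+                  OFFSET_ROW(d_A, n1, 0), lda, queue);
+    // P3: rank-n1 update of the trailing n2 x n2 block, then recurse on it.
+    ssyrk(uplo, trans, n2, n1, d_A + n1 * lda, lda, OFFSET_ROW(d_A, n1, n1), lda, handle);
+    mlu_spotrf_rectile(trans, uplo, n2, recnb, OFFSET_ROW(d_A, n1, n1), lda, gbstep + n1, handle);
+  }
+  return MLUOP_STATUS_SUCCESS;
+}
+```
+
+Halving `n` at each level keeps most of the work in the BLAS-like building blocks and confines the hard-to-parallelize part to the small base case handled by `mlu_spotf2_lpin`.
+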
+For P3, SYRK resolves the external dependencies; the remaining internal dependencies are handled by calling POTRF again on it.
+
+![image](recur_p2.png)
+Figure 7: Recursive computation of P3 inside POTRF
+
+Next we describe how POTRF is computed once the recursion stops. At that point the input matrix typically has size 32; it is split into several 8x8 tiles, and each column block (a column made of tiles) is computed in turn.
+
+![image](divide.png)
+Figure 8: Partitioning in the final step
+
+For each column block, the external dependencies (all column blocks to its left) are resolved first, and then the internal dependencies of each individual column are computed; these two parts can be implemented as two separate kernels. Since this step is a serious serial bottleneck, the tiles should be kept as small as possible when partitioning, so that the serial bottleneck has less impact on performance.
+
+## 3 MLU-Level Requirement Analysis
+
+### 3.1 Operator requirements
+
+| Operator summary | Cholesky factorization of a Hermitian matrix |
+| :----------------------------------------------------------: | :--------------------------: |
+| Source of requirement | pytorch |
+| Target networks | - |
+| Input data type | float/complex float |
+| Input shape | [batch,N,N] |
+| Input layout | input/output: ARRAY |
+| Output data type | float/complex float |
+| Output shape | [batch,N,N] |
+| Output layout | ARRAY |
+| Mode | None |
+| dim/axis-like parameters with negative-value or other special handling | None |
+| labels/index-like parameters with negative, out-of-range or other special handling | None |
+| In-place support required | Yes |
+| Stride mechanism support required | Yes |
+| Broadcast support required | No |
+| Return directly on zero-element input | None |
+| Other special requirements | None |
+| Sizes/modes prioritized in this development | batch<=32, N<=3072 |
+
+### 3.2 Operator function and usage scenarios
+
+A Hermitian (self-adjoint) matrix is a conjugate-symmetric square matrix.
+
+The Cholesky factorization of a positive-definite Hermitian matrix $A$ finds a matrix $L$ such that
+$$
+A=LL^*
+$$
+where $L$ is a lower triangular matrix with positive real diagonal entries and $L^*$ is its conjugate transpose, an upper triangular matrix. When $A$ is a real matrix, the factorization can be rewritten as
+$$
+A=LL^T
+$$
+
+### 3.3 Operator input and output parameters
+
+| Parameter | Semantics | Role | Supported types | Physical layout | Size limits |
+| :---------: | :------------: | :--: | :------------------: | :---------: | :---------------: |
+| handle | | handle | | / | None |
+| input_desc | descriptor of the input matrix | input | | | |
+| d_input | input matrix | input | float, complex float | [batch,N,N] | batch<=32, N<=3072 |
+| output_desc | descriptor of the output matrix | input | float, complex float | | |
+| d_output | output matrix | output | | [batch,N,N] | |
+| upper | upper or lower triangular | input | bool | | |
+
+## 4 Operator Interface
+
+The interface is:
+
+```c++
+mluOpStatus_t MLUOP_WIN_API
+mluOpCholesky(mluOpHandle_t handle, const mluOpTensorDescriptor_t input_desc, float* d_input,
+              const mluOpTensorDescriptor_t output_desc, float* d_output, bool upper);
+```
+
+The parameters are as described above.
+
+## 5 Summary
+
+This document described the design and requirement analysis for implementing the Cholesky factorization on the MLU. The Cholesky factorization decomposes a positive-definite Hermitian matrix into a lower triangular matrix times its conjugate transpose and is widely used in scientific and numerical computing. The document first explained the basics of Hermitian matrices and the Cholesky factorization, then showed how the factorization is built step by step by partitioning the input matrix into blocks and combining the standard BLAS operations SYRK, GEMM and TRSM with a custom POTRF kernel. Finally, it detailed the operator requirements, including supported data types, shapes and layouts, specific computational requirements such as in-place operation and the stride mechanism, and presented the operator interface design.
\ No newline at end of file
diff --git a/docs/design_docs/cholesky/divide.png b/docs/design_docs/cholesky/divide.png
new file mode 100644
index 0000000000000000000000000000000000000000..e971dcb257e88a6817a71e015e7a92bbbd4ee801
GIT binary patch
literal 16033
(binary image data omitted)

diff --git a/docs/design_docs/cholesky/gemm.png b/docs/design_docs/cholesky/gemm.png
new file mode 100644
index 0000000000000000000000000000000000000000..7cf06c4d5e1c660a31ed50d5d3b49c1f2b002061
GIT binary patch
literal 12780
(binary image data omitted)
z`xf@#Paw}2glOPH0emuD!2d{vFq;SKxh#L5toL0cyd8!%tZPj{r-+dKk_PqwoC6iJ zu!aEjNFY1$1Y?}@Li+UdYwW@Hu%#ZL5nu#WCPaK_bo9S23F^;N2Cb1>i~UdG`|IFC zB!y17K2=my+%q0tIs6{BTd1?B{xN=|QA5tD9Ebk4Z-hlM4;DR-lJoN1O&>D1>F3%x z65kuA_6RW^A+$9W8O2OZN!v3GMC;gm(GZ|rfMVB#%avl0Bw5?}0zJy$1E&|J_OjNA z7T120QQD?aMv)4d9$gOz=F(XOVEg5ty2;*t{Fk$CGtytL_U>8P2S9(bIDKqsu%NZ5 z*PGuY|5YnbwpMXpkV^X801=_a6X#=KD`BHXQlmbK?R?nza0id(ZW}i$7z9rMGPVO5 zujxCCIT`g(J#1SS7DBTVJI=g^A}*qv*k!m;R;d!|fQ0Yj{o& zO{eKV1-3weA$lqvcQgo|b~lNRry`LZE)RG9S5{Bhb-6Zc1%+V!OXF@kv2Y%!mDfBS zv8q1@&OHY`R%9Nde@V}6D*~pVyj_C>D_=d_8Ka^haA45HA+I`_y6RD#u&g(|}JT)k?$ zJk&dBrEMiXAEYcDbY<&cvvX1E(PaSB7o1%BOr`V6wwH&6kio6ovcgJq#F zpp+zP?YN^klqLnjFB)Dx-fa|Zn;*!3Wz>OBXQTi}fAqQs$MPk}JP(^&NIaIVdw|}Q)*nsK^30dqd z?z_Wtzgt->M>1U8s?4uy6uwc?&=_E}tYAew6+N`o*PjH#&SYZGgXAHTrkShKE={!M zDZe{!2iJJvsIgKxKMN=Ot&|`WOAr8{tK7=%Xg}ed5v;ejH&5l?F#9w_9Wm2{B?=|s z@^|41Ej?$}@7VW%U$(#>2sH9kA8a&*ve|8tT)SF40LIol2pOHq36#+Mpe zA~=NX7LsFIaVJP7=9ToBdDi0c<)Y!XVPgNL;&FG=OATP%wrqPBIsy>to88;avtkTJ z2jGU!|mc)lH{RTJh@f#ZjXJG0Yoy{dym7AD@%JAjxkw@IqH0ASkG}V`dXNKnLd^ zPNN~BX#yxfk5xDo=wCAJ&*=@CCIcKa-X#&o|Dwi2HB$}J3;!lC&vW$+TUV1F&QuBjTGHVXYf4@L5zV-M~D07{KHW8 z1ZsAmX8HReYHMk*XVOX%gYgGNls;SE1ekwaC(RG!BpJ>>+aoa+_2I*d;^E;!V>VP{ z2#dhksmh(P9;z95=1}vrOG1;o^6#_q(Xm7cG8!}l;&>4^wJeQ9rWR-yA{6x)w5{IO zPtF0{M>E}df#`I9cpKbTiMW|rR{i9I!@oW4^9kZ!^qd+AjLp74F~MvU=ae#PAv%l4 zw$111*Av=fQW=}ot)^HIRxzJGU4o@@^{TfU8XfVei%1)Zz;ai~G z_V$0Z0sEe4Xlsu=$e3EMFa0l+bMq<0*4v|^QlN=7UY=fFF%G4i6~;nXL4%f$WGt?V z0%4>8I3}#yh4ilc%JM?TiMajSU7xbs)PUwQVyr8x=ck0v*}IMRj5;c2<6LYvHO|1e z?na-ML?AS5YRIBP_I1LXf>FX$gTRf z5F{{`7Sjmt2@q>Vz%OS>^B1^BCHr%?U%);RCgxM)xre2T0pT%Pl!yz2K!X-xoBZ`_ zgj>U0tixF8_K>OF<}HXIV(n8QqDM_^A)lrob-CiBq@t3{C<{+N5YfzTe}6Ty;6{)_ zNUE6j$oTkN#^!xkQE?k0ZOYfLpQoU*)OF{*7a8?})CN57LHrG35j{#zjpJ>Mcir&U zzXVbHSJ3nc*3_tjuoUmM{Tz#rjWq`xvNDp+*lY^i@&X_zgTrkl>@Mb3^PhdS*q2=v zdV)0(l%-pJTC?pjM#0s`M=teu_$@}#7@H|ih$3Sw%8YA0NeR2D@$TOS%|wq63M{TY z1*~6V<3q(%=D7Y6TVXvV71O>A=weZ716rPltXsEl8-x2KIKmn}`mTpt{bixTzOYhY z8C=M?%WCDs#0#3iQf#XJCD`m|`&%J^Y-`GLN(ooWN1178SW&p6z107i6P3avCq$$@ zpkxTnH{_%JngG;;Df_axr!lp(pX{>?ceEJ~Faq!-Dl9(U@{&*eWEa(p0$jDANFXz~N$!)W_y3i1fYD){ z>(2>rY8J`}gh3e%F(KE1o<)|Ax)VMdMfb2u_*ASnFqVa$v^Ehe*njh>fJ_DkG_cN7 zK>}59UYG(PiWtYX+)f361Qn=p{0XY&saW&0Nd9+n@|$*ZU9|vBAi*d61=!;XtZmtP zPhK`wWyr_y+m}CNWv#`J0mv*cZ0`%SO^RG`VXA|nFNGfUYQ*HBXA;KGL_0)?}f+t>S*DE0* z0X*%wOLuDUBHIJ6sD;-&ERvz)hp54EY89DH)Vw7;;N#n>+F$8uSC%^=iGc9oGLWZF z*adAYgD7RU`TI0%`iKCAa|vyejCKd42JXmyRQRMNS3_M17g9WGAA@#>T*9Ji&u~<@ z_hbdpO7QCrqjpi~I3Wj%!Hn4d*m*IeSIR|Leb7LN5#8;|bf8`bgAn1N3-%(Ifk)SM z)UJP?kbV&sb+>>ExPALJ!EtY62BIm~?b{$r0ELKS0Xq{?2ND(Cah5r~rRo zOpZ;kftJHJ6JfDhNa!9(NUWC5GSf7-wG|GUfp(enqy$;8gAco!_x|#|l%@G8J=b;b zcEjN+w{i=3fPx_ISS}tvTFA_eo`PBicmwSZxu1O5AL)D?9Xqo^i`_#klaq5ozOZ?) zx7of9n3l?{om(WQDDW+O^#0P_FZwDB%NLK>2pdFq*AKrdU!6G1BH}t(=S?V?+nt0r zUEjk-Ealo_S>3+fTCSVY&6Z83zt!-^t3mU4GKVup_ z$KjUK#B+5kJv}{nd3iH%gMbWaX=$}PK6B-0%OuSsiFlxMcLq=K{6J$( z$4m>szBBMDMD^d+j~_oWbIMvqvR~xfjySd+dtIdD?Ci`GtQCL3Q*bFxqrgz!#>VDr zwTk;sPCh=n_GmskD#GEb+Qv6}S;Sds-KMMQ>FGDO{%NkfOdBp&2#^zwI!FB7Nget$ za?+ZU8f%Q4T|^OSj$gt*e}3Nk^e-w5kJq+7r7_yKQ;~E~y``#Ccl+2^F@AJmOqkvk z%(x_Dx6=T6!9m9&4ULWJ+FICBl9x14I-TKZP$|2jCQ|CuV)JZRKqZ)T93HCiWfEM| zDn5R^7?rDG?`5+$uJrYICF-e#?xjutSGCgc7KS4#e#Epoq^4LcGDA-QyBhFWAcRy8 z9^K_=yW5DS2mRz`SkA7vkQHrLu)$$yvF8`^-}6Kn|DU0ikFb09K$B4s&1qwU+3TvY z-XHbuX3qX(v}C5Nf|AG8XDhV9tvRAkwJX*M`4bn?P*<%~ohce;8psFPMo_? 
zdLh@2(5+kV`p;J4p-4c~ZV>ZbTD`3Trs-k&O|Mx&&H}5Wh&BAb(soWm?$M)1Od`6< zv)Rs2$TrA)4U(N1u3P3XJh?V_da4l|wOSMT>66C%PsXQNpPV)|)>z@@-{y2vM>@ax z*#>Gx4XP0y85v2hB1DhFXQp?@RQ^9R2o)O+lFsb^5Yjgt zr}G3aa1C0zrkSr#)&%H+FDxl3$=KG~Uzw7wEPdP%fYesR)W`;ggyb$@g-^_4=hmsT zb2N|Yie5b9)_?}Tgq0vsD(cDJp3lLB4=z9)613Ly?Lp@tJNCDF*8^sB-vL<$< zDg)Wb-z!y+7_di$g*`KF{7q;W21`~QEi30=I7p`GsGmrJe1@HYg(V(T-B+Nmz}Eu- zYNi?x62f!)HadNV*O@#Ga-=kIurP27KZDNBT)%N6B_$>Dhc{(QBx5B z0DzpG?2!O~jEwYw6=bEzCMAn;0AO>Hv%T$Ed_Ip=mAV%Nh?~=>BAq`u1{<3J0Q$nF{ugh)Zv?ocSQ? zNJHnT`M}?qyc=?&6VozIDevMwU~R&nx-cyjBIstfyuBarK1rtU|H6!DAPSW(`mMyD z^iG?w^b={y`k|mt@el`oq_8LT{-F-sh`g`6x&vhVmYi_fdjsjkU{RU82DR@M%JgC)$$#&KM%H;nj{%xt&S zw0SSDl)v+CQf76$XvQzCPXlqn_Zd4Lc>Wrbbo|wh530VT(#iCfMf6;fQnUmxRQ(u)@kjdy7F{ z#N$@oN8|9%cb0Xy@bUG9sbmeEWZdlJLM2jJFJ7B*x2>6bdYCu7#wF_@)9veF_3;?Q z(%PQAcnO!>#95a$4}IfMz^nHgi4?4_b{+Jy9^2H~|C$83_5~ySCkl>K7T;F~=$<+; zCF!Y+EW9qcQj)nXiPD|F5cdJ+?q^I#anV2c@mD9qh!pry<4)%Z|Zy zYnh%GID2|TeTWSTcTk`F^4eE~R7!pwwvO-#T^GzH?!;R4y5XXVS;CnKnW=32*9YzsCPxKHcdZYM zLIptc=ov;G)vN}?4mz_qyr}?;cQTmtYS>OKFv{tm18+jCW6pXpAT4ZiPve&86`JxE zy*J@RpGeCj3GLo=Eq|C5NM2ZmCp_^ee_35eqX`INK0mO=BT?mI+Uuz>I5>#RT#Tys zC`j7U(o&BQXsm2(-F_o`=D# zaFWvC@ive*{^XHs|e??P6CRs*1wAxAI&Qgu5d$vhU zEDF_?K8GKo2S*8M(Svd-cK3QF9o&2fknvxIvAwaQSm$UFyY0+6?@BY=WP3iPIAc8w zZekWUhTfX)_#k$~zb98aA~I5pWESHF3;3{ISZLnFijA8ZSW9R$clDPE!FvdTeHDR* z6HXsjkd`e`Siv`(ed`=Sw5;pQE#xYW=3B)s-CB%Vn}wwqUw$U2{p$cg1{#{(of8gZ0dRz@q4q&>LNQpuqX9G&11OI*!N=&CrJd4>lj}tz z@%W=`7;4M@yl((8kJU`Z{Jxhx%#GsP&_rCA8{nlG0wtrrPcL` dnEe|Tn{3?xxFTPfc}ov7z}dmYzQWG?#vkE2#99CV literal 0 HcmV?d00001 diff --git a/docs/design_docs/cholesky/recur_p1.png b/docs/design_docs/cholesky/recur_p1.png new file mode 100644 index 0000000000000000000000000000000000000000..35d403e0e863d5c4c115a55d58845bc2b7ee78e0 GIT binary patch literal 4805 zcmcgw`9IWa`~R3E$mStp;h%DKPM4Aal_NcKFiEP>TLCP3gRAiYrgbIx%8N?8B z>?f0uhKwSNslgCqd2Z)CukY*m1HP|ke!1?~{hI5#ug|r--`9O7TbLR0@rdz25X5I} zgtCGlm^}n>=y1cpjz=?P8iM|nYK*#YCA?sXI+P+a>e|K}Ws3FG^fR(FEUxss!7wIlN#>7wK+D{gnPL`n{F+T0&zmp-S&)d5c%ZLTf)y)9Eqfmpc*|_luQx zR7?_VdD3fA4-zgK&)8lyH)jzeZK#Dq&l}3i7ppkg-&$Toj^tgI<$%^EojVZ_^dSKW z)91!>Lub*Pafb!rP#{=K%ORn!U~OQ5g8r08LWxB<=)N@y;=TaqI9iT_4(}N9e2_hOVLD3W3|RzLuQN)GEqG zdQyHVW7p5nL>anGkI1-IL`Z9@M3~Gy%Q;v5u70>e$oA}#bM!X5#*m{;0)MUE^Qs>X zsm~|x##zj5aS<=tU8kTm*}C~CUEh7U`hNbZa@|fNIWa`#lYX-`PP{I_S#(6x zqH{Wc^lKj`n@~_7xCl(iJ}}O`=2}e%0ua!fkHerYws-H*+DO_5;uhy*Gy5BWR1{^)Q=o^mH=>_rexDx4M z+Dt|8a;f<%kveDT^vYj1vSd1@tG5msy%n*1^_;H_CVnP^^su9&BR6btaIkmI_j$_g zh4}%2muT(f`om7KJL2RmdRSrN;k{~9PsL~)Np`(l(W6r2W-}FINyqdk7_B@SE}K(n z5+QH5%#m;FuE{|Qa&NbBU3V!a7

Z-u|MNe>Y;k{?e5e@1_LFe6iNhISS5jXw-)| z^=5mBP=5PIFKPX|?G9}|${#K9ZezK~^+SNDMU9;voA&c(KzHCZDb0G0$8gv5@W@EJ z2G+ksB9kmXhW%& zN^#FD+xIb+7>A9b~AtdV7!szxd_HfIT&0Xse#1nqwq#8IOO3Jq?JxF zH|h`E2atjhYn1ozg7M&77VB$cGx^u%`t=Ohm^pZtMWxvF7k zcdcY(wIbTx69_@HdrmUa;t8^L1_hThhL6V-b3gn8N8dkD2+OfTt=$ENyX+qH$gGL3 zy`46X6^`PLW;E=s^$o?Y8`CWV8vl=%P-}xJQ#+F@QzS%`p#G8^l5HdyU%ba*VNT5} zUY#Z6jrMwN-V)(sjX`XC>=s=Lu_L`pVH0CTJw>Gx5C213&1T^NyBDhNaR{jyx?^=C z7z|c^0|?-=LqVMjTLTR{x`{t077Njm9oTQmH2xC)V-d^;IR@MjaJ`N1T9)si4|t^Q zS3PX(W{xx4ntS@2h?~;Y^z)n1Ri_y$3o2PxcjlZga__FL*juLaf4FaB{lB96t8M}= zLICUu1XFTQc=(uK!}YWSagx;zaEdx07?bVavt$en4aMbcvpooAbHmqJku4emoNb6t zn=j^XsxPdw31;OZ-lbkmpK{zWAHC~^=YDQoqxxy})sFCUwoSfRJ?BQA@~^k+n|(2{ zT$yZZB8ix9;W3Kd8sbH1|6|9+cUj4)>tdhOSH0X!*fzE8*s z(Izm%qvvzG)J>H)v)Ww(!ox)k{#33xtE+n|Hc(y612c;khk9?C%05^wRC2-u%@aK_ zp@D&?wR}>rVT=%U<17g&siCyb=&Rd`ERoMo?yv<~S32~%5c{!f72VEhx|~YP|MYk* z%Z3u#tSiCaJ#(9Bn4rYyf4!eaOTa;!O=z5>^uK&eG$-@f)K&br(vQ?PN_BX z`fPawzp}0cQ?9(c{M$S3Hm3>(|28lq%6mgQ?0rNltyHVKyZd-XmQ2qoa>-j{)@SHe z*!i^ZRV_V|ZIbH;R~e!MJBE9Dvc!HvY6{Ehkn8w!#8>ooj)DN7wea9IOzCLe?C9Ie z3QB2CtFar0+yQRin5+rlX)8T>%B{v*^;ToPZpOdA0${D5kcJ^<_v&Xd}7(kY>F7IC_)_YlA%!R_&|h zQOcEfB75JmhLmi);O1s#HzwsLbx38jta+<2Kd8E4kE0G?e71Poo8WR}f;P7;wdt{oq@&-Up0o_7>xG2Ddqskf`xcIzQDo8+-o zx*9WzU~!GK&oQcq46UebPi7uL#@NA)M?bl%)^V1Y?fqPiei(GY6 z_V&)I4gjf-E^@>&bipmZV%ww8(c&3_rHf-91&XGt*+&P|SP%lIgwM0H5g6)lG_h z2Hc*Wo^wA|BU_eYHX6b7#OG84GEwSmKdWY9iHfs%WmlDk4ObC$up{OY^L8H5oX$4J zk_%;e-oJm}>(yUNr_+5HIYZYL^wstuGP=R6$M_9eks=7pe*OBCG%&|xGV>f?dGk~s zYN9d{+|n+U#FD3$LRX9@t%lW?g0(#YNwe{9_X~9LNe62aR+Q<%+FhsmFO|Gs^|Za7 zey#o=x|}n4cst25Df3O>(z>hnqvh?Zda0*V3tampxxNWWrKybGR2NITL|Lgajak_| z&^s_Ny^$gNpod1SS{YId>GT5e8OSc1w|Z}9LC_#Kh8LB}`hv+yI#I@$>rT6iic!|r zVrBvM2IKKnMDNl8fxsqT@wY8jo=)2^}1J}`2ihu}@C$@-Y= z0Y`aVz0M7tL+K%bfwdn>lFAAYLoWC zJ4)cGj;`*2bHAeHEOA9s@ap-#{w=)7L#JR???w&YlLHL%d1c3c(-2F$@$uZm(wd9P zWB#Bl8MS?9&Yyp>u~@lPeXODk;>+%KVIFCo4mcCGKaimijaS7VfOAblFpE}z&OqX*y^gKkj@oSr$dpposTtSOcztXE!quOH4qLJ>#%L=09842EA^kgVeeyC28L$5-$9y0#!+Hui*jfVintj8&_fn(X*l z4~hxPBirs(J$R5O$5^*r_Oifa3x&np>XrS1=dN}Q3K}NK2{-um6>^2VpQ}?ep}czB zk(;l&&>w+{8xRNAo4M)D?by~`R>IVak$axtf-~Yh@FsWn)Z3h>#$V|jWd$f+V2CeB zN=Z~H#W35W-p~n=A?}TY&(b&OI0XOJz4D z@z6UxQC`IP=rf3ns1~Ol2{Y_a7@18q(FH9Q*qejLH{??AyjAbPUOaJO;wM()efbS{$=OR zkFB5P(usRmaa(k6w8p~2L_D{K_iJQ^_znTf2|#4x$LE{1cMfvxy=H)i7l7M;0<-_mPd2*(v~9QX9gfny2P?=J LZH9Vj;F9ofT-*4o literal 0 HcmV?d00001 diff --git a/docs/design_docs/cholesky/recur_p2.png b/docs/design_docs/cholesky/recur_p2.png new file mode 100644 index 0000000000000000000000000000000000000000..eaf0736c58e294a54230a4301c3e827c0fc3bd8e GIT binary patch literal 5666 zcmcgwc{tQ--=DNfIJStQGDef7IrcSD(J+>fLJY?k+aQ!B%Ov91l9`bvTe2QZmI+xW zq>T00jio6WvJBY`X1u?i^StkW&+}gI^w8`I{r%m)@A|nv-#gC2%!v26_;C;j z#0$T1%@PFS`V9m+Ksv?+tZ4m~YzhL2%)zf+u@25)PMRlXxP`x835pxYlesxHXnn*s zB}qU2pyCtbM>fiXw&%LKt|UCtf|Ud#*`E~`T7^)@W8EhQS} z9s<3X4tM1Tfr?Ea2hM`=pkHus&@n0)6pa!D87Ogs&JhnidWPWAzww{mzF)j}07D?B zy&>oxDk6?w(!0I+i<3C{pVr9Nz4;)A49ooxgk5E=(W3m*$WNcY;?1>`{$mXGYO3o$ zZQcJpGC7%j?$(=PYM*E(w>u=c2D_MP*>a)BU`e!p2Dh+7@;&+?Bf*!jy<~nF+ho~# zA?OyA6#ReVp1Yfhc&ctpx8G)eYGo`)HDH%93;2U@QTwYn z{`{lzXgSf5H6tD^T##|tmYZYvAgsk%$8V>C&RJ5X>^1Kgtxm7al8-VJmC;hX)^1`JerP-< zQ@gNd?*|(w1z~;NcJTNOXyjuQPT|4qAWX)`kqXX6igu&|ALD5vf7dIg1-~_>A?cfFzXrvA?dh3D=%nx%6o)YQ5EH0>9uIrM%LEalg+~A6t0p2^s06|4=Lmyy)e%paV7vu^ z=eiRQ72Kr{bsldAcNs#RO8|j@I)?({D%9Bo5Y(8!a$c0+jN4W1(O@NRo}0#_7$xq6 z$_+UTj2O)*&pCqu6QlDh04Yg~cJ=@ zmLt#Lch2QU*WuxJ6znMmBr3QxgO~ZLDbO(YCe3B^wA&>IIBxozIlEvn@?NqO)uj({ zCJ_YZ8P>FeqX=`o6|=ZyPFG2?xZzNYUlbzAiE&O*IZZDS%oz9gQ#$X 
zB1X?!8ehmn8014`2quu6d4ixZkd%YQ+^#9`qNDlImhIrd7!uN%jS`#((pipH;#S3{ z_N(cZODb`11k4Vek{D}#2q>jmwul4b1)w_w=ypFjUv8~Mu#cO7@#T?#>^zSzCoW}Z1$;}9tDcKIOa)_H&SfWiWMOH0g_hz>8i05J06=FMbqp?-upjBfDSIqZNME zta*05#pZ=qW?himQ^ea+3Zci~zlm)Xx5IFiM*ce!xo}(qdu9PhIMh$fLKKZJe_)nf z?_|dP^T)XZvx;Fqrhtw+JoxDPNNVt3U6-q>+gj8ED^|wkI)|~9sV-Q~wp#=%$}(mJ z=slAVv0NL=Ug|pw%m4xPmcK19F1yTaf@knuH6?{p@E&Q)^k5>bR^T{``f|rBT>y9 zPDK4=4+J&9zz5vB2V3~D>O05`bPG>G)K6xudiSi86fV%)ZAxzKpd28P;EJ5d#)33y zC9xAPyU{5bi89_Wu`cnPz2iTYmX;7@&!}l0uczo^6a6aP>Xr26vZXUTIQSO_wYQz? z4;#-jeix>~BMd^=I~&B$G8ByVr&6!s<+n6U&(6s<#{Q8vj*WG7HF7;}`1hfeyv)n~ zTQ;-2Hx)GwEpFBTg=~^jg5^J zu#9aPt0S65H=kbq<&3t`%=cYm$dFe`NMK%NC+SqeoA>X$9mBh`O$U~@3XO{T+)u=| z3L)LP&daV&-@eNe@zu|kSdeWrw!CFe8Xm<1h5585{XFHleF|eZ5y2&d@rHgXP(92u z=&rJ=sdCboW=b3C)#QuSD z+Y>n(AdAgM32HQFTDke?_`7P?%+cs{@R0rIp%+Jb<(IuT}k42Y*%MJqm6dn42yu)srkN z^|Uoj^ACL%LD8#*hTK2E5H$a6Tew$S$}*fOEBb?vcnJNY|7v$KXJ;7gd->kR{1DWt zZn0*SCnjOS&;?_Z6%`c~cd4orAB0w2Y58-?OovgZC9#A+^@+Ub{)oJwWF%5K?!LIV zIPGS3U05^q?F2q(MkB7~ov}CMnJoT&S9J~n=oX-H9#X$J!CvGNt}DJOos0JpAp7h# z0ch6c^38+@h8GUZZSCgvxoeybte-4>DH{{eWcjoa!+G3NX>)hP>FMt8Bs_vEU@XQG zEuLgpI6y5XJ_#tz1e*Utr_|c&Y=|WGi5*f=vx8dZQMQAAp70Fr6fthq(eRC#;j*yS z0M%Y4_}5O)BPO_NDox6ixa9#f1!5_l#M=B-&!;0!C*7}h+K)vGa7}Y`6}&T8?d`69 zlcSPhi~Vi{lpeBh*orDGl;&u1ih3XTI?{5cyFzO5!-J8%_=U#@OMZ_tDKnuE8B>@` zAbF$e4KGs79DLw@DctL`>wSz{)Z)qv>768XdAgiJp%5CAp5K5(x)yK51OlMsVrmD* z2dvAdJDs^?#2LdG%b>2%%=lcqNZsfFfQhN*=>Tnk1MUuTK_O+P)h3Mho^4_F^p>NZ zkBiOSWNYl)El4DzWbVm%R-9e4gh#%|OFcHK$%8(+_W)z+j|X(7)l2L**q!L}`1i(x zZwV}-X`nRedvBq~*uutY(p}V;sQ0?nR;P?KtJ4E_(ZLXke_uRUK@eJg_`=&Oi;0N< zhZfK(}Jp>gVQQrEOBAw6id=S7ENV+Xk0|A9QiUFGG5qTQrV5Htw^A(i?3 zh+jzY{KqcegNYs0`cv)X-pir)_e(p|E=jSM`o8(p>@Jdf`C?nm3dyAp`dV9CE4J)9 zpl>*}FfNs@I;y+CgGH*sa06624&iuA@K8-+v9g zO7fM_V|$25?vJy#_8G1Gl(nj|OmiF$lGQ;En-Y@R?g^4?2(LfxBF9<@qNI1J!W}-C zG;t_Nl+Fno2TFk)?8Xqljq2vAj(P$w}OEMl*P4abG;%4J{l6N9I642ofi6!yr- zGG~ejhX3SKfR%vjfeJJl+~+gGF*xUZvTxJ&Z~kjDR&aj>IA>~=@hM_GS1){l6rV`i z`{d=;Cvu@V_VOn6sMKYL%?mg%90x?N@B3snMI<;Qg0pt%a|6p-?JYQDWu|+d!`YHk zsy5bf)FxA1wO*P&*ZpNFr5mERNAs|EwT!Q!hp~r~eA9H}J?Azay6lQ!@K-8@ zy$}rO`Awoi-(^D{mZ2iNL(u{#x$upMn3E zspVg6|D3?y*ZqU11n&_iCno{wJRA%vMnFaiM;4gN%f0I|BpWYZUtj1clL6q~&)Rm( zG;?cKA2aaMu9TJhFtz-7c}o;~X7tEO2RVRREG{prr*~&&W&#&AJt5Ddj!X`iN*-Bk z9AgH)ELdw4Xj5Rcb9tS`S5*K2MkXtJwT>}25o^iKPJd$5#+jRmxh@fFXKbBxQI%+b zv8-el0^9(gEGvpp&HB(0Oo22{X;LqwbI`NuHT@;722IvH&1b%cC>dF}eyN`s7!(wQ z2UB|(ni?AI`_yKqX9a&)h0c&)OOLy&6f~>~ud#i(fxz`B|%ShF`Lw2~K3ulfqJl|ifWbvAanec!ish0@j84apTImvQv$39)W#t<0XE znB(wbe8MQuhV^9Yyotbj0b*&SVQ$KPh!*>JbOl`@_<%mgfsc{|0HkuDKI%=Y0_d|`nOfNlD@Szti}?P znWXD7`CuqsiZ%sN>wnG#I|`_T&A%S1tw`D~&J zQw~wUIh>huz)9!us%rEm*8zoyr!ooY8||Lp(x-@F!S`yfwf)V0*C9@e<`^-Fk>)M2qvqTF(W|O5$S7xG^``|H_U0jfahK>QMlh z?##S5T_$q_A#>Zq+b9sxoSw>MgsMbnYiqBCJ2#WMcsjCWiHJJ;V{If>yP^7(^gC04 zQQF)mRx&-chFRHd06X}Dj)gXhnSciK|A z5e@nI|PM_@|40H1BK4Qs4hkXNM17Fo1MgqsXa4 z>=T)zH)ez6uh$x>mC=`z!|wuZ;}FXlUGnC<}ijQ@!(2 zK!>T*!^C${MTR~N`M%!A>)g#MwuwmI`#=sIs`NmrLjuhOdA)m3{~fw2-FDV^dkYL?=m4%{(GsWnV9+ zA#PtVqwz3cBD%jliGU0`bK(dyP}|PRbYRfb$?1yXF`B+mO|D6Nw+^I#WL}If)+Jk= zADCC1o=rZJW3fHwzkdDN+}!*Dd!zsd6CzHWOt`D}<#Y5<3MB*uNnpeweQq8=yY`8A z7SNYPYqMRm3|meE1852!DMd0S`O@+TK6>_q;;>3yA&yy@39?f3RTs9ZK*(9~F4}1;MYI LU3+8T^!UF3JE(8F literal 0 HcmV?d00001 diff --git a/docs/design_docs/cholesky/syrk.png b/docs/design_docs/cholesky/syrk.png new file mode 100644 index 0000000000000000000000000000000000000000..6faf3c4abb6cf9e9dacfc0849ddd05b0f12da44c GIT binary patch literal 5088 zcmcIoc|4TS*MG)NqL3|H2q9}BW5!MrLkwk)kbNIA)+h~SEwasI4PzN(%QjgOxsXQLYQc*-uh*w$prD9l;V~qMQ*B7xQn%p+24Q^c>LC6OXm(z{ 
zvgm*~)inR0OM80h0QwGqK3cvK#qgCVVXB9`V4 z4-o={Fx?jd*?efWMw_3`0C7nwH}7^gmgbk6!$rV>cdKPdnnR0hK)}+Ro&&a|x5`0U zl8j*yX!lP%IFxdIb*+OSe#FgKP6d3S6*C;tI$Bg^`^ZCeKg!bqVj==%UP;)+U~4L; zuO9OHmXWJc$C4vdaDP072(*7#b6{<(Si_pdWO%-sKoXf*FK^rm0hA#GOH~Mii%s8D zE784c8{a(cGck|H+$edAvRz0W%g?vQ;#ER+hPi4MO$d#1NXx@SD%&1|yZqd^oae1L zKJ_ODpVH#y82xWl%J6hO{V%SULfBy6nHui@)MA8>Vpfw3;BYhyH zB_E5JFznI91GFWnxp7oPY;32O#uZ=1EiNJUlH}aDtaun>S~ld;ZCaZsD!w-$5DR0( z320myK1X7a(8wgKeG4HQ+_$CYpbDn8hh!2XCZ3d}kkWjcIZYr^1ZtD0_JJYoSD&!A zFR+$q&=D@1MCF1Gase%@4}Eum08xi9ZB>xfmITqlMpExC{GsNfQGm?SxI7|b?v4;S z9fV9kDW#y43lyEU;AqdE-0uLv0T%7QjtJ(5{Bj&Bw2!E)o#U?9lQE_(=U>tcK{7S+ z$a3cNXh$I*z$q$$7(*1BcvzxD>AIgzwFm&zQ9|vLx~kyz<(m$|fP)-C&!6+-OPG8< zt%eNXz_69RBu%q*yd9LI_Rrb;wO!05fFXi#k;niainYYBMAVsCY*t*P*`HTA;y)qJsR1={t<;AYD10Pa*$`R;BP_N}(3Ry;*J!XR@u z&A;hk)|HLm%%sIddv}@8(4wAexl^BqMs0lI@|oQyX@InEM%5H@Tm6^oqh2=|72Mk| zA`QVcI{Gj>!db2M`~AB39pC}R%Q-GSjEp%{3$$)2(ly^u)`v)|-eb08~*JqsGb;)~S9Kd@Ki=&D%bVi=EgkU19vdsv9^cuK74*He&{*c0g*y!%`M2 z;jkdSS4hU26rIF_%Jn^d0t%_7p@h$u66n=W+$hcUX={**>ZX?(f4xnxLqzY1mZhsA zoLyW(asJISbvS?3+`n*{fA7A0xjXlM+B?`x--G4jd(=ki^hM%m=No(k{5sTli2(&K zt%yAPhF=c=GhQTYrPLlAr{^iiXu_=IrPTw_2f z+sNbGvx!gq?I(l^X5z3+HD*~R9QEh!k4c#RkWF%~6dMhi50{aVk&Ul-5|x_D2JrCk zkTJs;g&9=*2Q#YK)R$|saH(pqcKs>l2krZ7IDE|y@`JS8(YhGjsw5BW(^X98{O5sz zfaVzs{3T1&NB@9;-JP911FojFu*fQz#lq<)USZ>?pgUAT)veCj#{J2E(d{Z>VjhFo z?-SPywxewtuDoJder9NBSYd%jz3UKR&zo%+Z#5xt7ug3^!gntN9;3h5``gX!rPU)Z z!WA+5*BPs?*Ok2%P|2-BeO)ELzRYH|vc>v1{pqTKl@)I#In<3_61>LIQoM~+PA_O< z$5q^iI)}D?a>SDy9HQ#rFt9tN%qdQC-i*yC&f>a~T9(42li8$wq)Wv_t!ne~X7d)_ zHkx5IETg`embi}cpcijyYEm=3Y#ebX-^~d#o1C^Is~o6eVwzh=dB53@@3<_dMT52@ zp?}~T8{a~0?;Tvj6gvlgJJL6MovoUlTh?c_J7%fA%l0UZ79F-Q^H+;zJ#giN=%eTe zZCioC41%Z~z6PBb<~sM=_R<57i1>}H$C&d=^}k%CNZ63zRv-j%A=`4TaqTOTFEuw` zvtqGwJbjkx!qhxr`CdoBRK&i`;@q5YQ&6w0q_IHaq&;s&V3M7OwcATItF1tlxnKTG zJG;9vg1ro?cEOflMg|Wtm9c#sRJoc}+=gHL+j{X*c;$ja_vvV3A=+s&h*7uxbAdLZ z+`_}u)O2f+J61+YMQi-8kjJayuacPR+|V3Yb^C(G%J0GK|ZJGA{=)OYv4A!;mf1r+(# z&UxP^o0Pzj9)_Kr9Zu={fWv5^K^yerppDQj=1e`j%xj`8Y5DhWx-ch0xy_=UY!$!! 
zZbu&Vh%iuHBcnmZuWBOD6jSbSc%Iy4W$)?HB16cRoLd1D@&Zb z!;ebJc-h%^kJhS^xaGA_e78vKY_$`fWo7jq;~NzXv;BR2Dg63Wlvv)YA|eF@f*c$k zu|8QBL(iJdFFs9TcQ4=D3q&HD16Ov2pqtFxavG={f+M1Yaii_n8^nXw>fOe~nr9;B zwE`Y6<6wEkE8p=MWo59?Jrg9-^mSR8OEpmxt!0#+l~q=@YNR~#A!yB_d1j;Oz6jLx zv+iO0VZ#IY$w_y*;Oe8h-O=k@L%5nQRJ`Fw<&V$&&b)QD@O*yoJfQA{+vSX$A2mU} zU<-Na*ihD1MI36%u)XOx{+;hf)0p4&CeUUZurreYiq5!&bGDM#ojZ3FjFo(Ip$ad$ zy1E*dZWwxYG|WYKG)(Q2CqUu13E~{(e>mm0``q?tg6D>Ep_gO3;YI!8{3+w`U* z*OAEZpV9Q7^$H~VgXhAL`}~k5x?^*!@l3RP@EDQIF_%Q zus;_WNsNLxJ3FJA!>8)KYS>pmQ{-Oa;CSTh%nu7SH+_wOAL^j^!U^Jq?SI$CS$Pze zz-Wrk_ni5NMDg*1YO&!hDtP4`b&$w6Gv1a+Br3s|3#Lp+_t4A!`4&Iz7fK-|&lTkQtv%56HHc zUr*0SORM$&b!Wnl>J!#PMJ3nR3%NPp87FJsW`oKRb3#bm1=9`=y>Sajf>B7RR}I0M z&P$2ya0Q=2wq=Vds@kbn#-`B(TE)E_(lyj?Q@kO<`XfLVSsZPBUhNh4o)VlN~ ziV(5MHwyHVB6bIrAxq24h-@Vz6kjBXJ=N`CYaVY|MfcNJMGS$Jh8o8-&N3OKr=>Yn z-><2u0Ylcr))p+hCn~t)Fwlo(BqRXBDkjBy#EW!w>r~JS#JK4DP&s|I#V`F3cQDuKNtj*e&7yyRob0>_E3G$}A{5I%S>*H0pi!6B) z>O{JgP1HSU@%Kvv)(}>EZIj0~3(pw)=Z9hVHzitZc*7IxgbKtwcQ$ zcBOW*(Qi>CNzI>3lGinbAN?svj^G4!tw*g-KJ0vsw6K_5r?9I3lM58=!X`IW*fjZn zD=}Rj%B`JR2g6SD1^gN8%9wRq*daSY&2M|58_aVp`?Ki00|rX0@F&n=dy)iM9exs{ zFDrQthlhb+SI4t+;zn6CLQtr+HTzwCGNW$d;qLPCuU`;u)xeVa(L%e%L>k@KO##b2 zU0rHt|8~KbEiWw{@wh)E&f)g^OJ+eu^x45;k|qBfe&B5mLV{{A>*j!CHR#E$HRP#3 zj}5n|{b8T+jRKEiOOL$1t`sycri~_^>_uv_bRWhFTX-&Cc7)Eo8!*X1yyyzyL;N=e z^Z#q)cl7pdfGR2~uEvAC>ie;kV}(tRf6TuERqMtZXEs!m4!~A+Hskwzcr?!SL|k5& zH8XwC1;2mx&84=5GU+`7==c5(`FQTaYjYwA3YxTLIKV%&xcPQNK}b!L1oN3X?I{o1x>Fw>^nuPX9#$q>`YGRfGRl!DKj7xJnSlLv9^wgh)A3L z11+La%Y5vkBcc>uix7)q{Hm5*Ou@cEwc%5I%1`V!&qyETd$-%f*;-O!t8QGRbajPQ z;T|u)IBCaIbeX+kpV$9%7&>>Xu;?ZuOX-?k^{jg}Ephp9!J$&|XmnXn8q+kEmf`d8 zpLr3*?e^cs z)tiO#mTf0|1;>%Ed<*v14QTlKcUydEb|QnW-ku$hK0kAjJm6mEzAh;vIfs@$I>?wHr#u@K* z<#6peV@5m0od1PCA#H5R!{bKQQP5oD_hXq@Y$fc{=y6?EXsz5Ac+qEgiUy?Yq(a)h zBVi7L+uU(~%swixcNVTUmV2-e=4fGouPZ4jF}C(p%C2X8K3_lWaEdkgIkwrAoB$H=MFH?HP)%ra(Mb5!0FI^ literal 0 HcmV?d00001 diff --git a/docs/design_docs/cholesky/timeline.png b/docs/design_docs/cholesky/timeline.png new file mode 100644 index 0000000000000000000000000000000000000000..3c90e4d8e3416f9121cd1d5ce246c5552364be28 GIT binary patch literal 43155 zcmeFZbyQYc8!x&538h4k7Vrf`8brFKK|-Wk>6Y#=X%Hl&K|s2@Q52Bw?rwbP=FAuO zxA(r|o_of*f1E$=9p4&?id0i1!0 zDr6&v<WqRWPr|$cU zg5q)7?MG0Ptjsl_snFhj^Dd}XQ5(0ZjPF`DVEgQ_WKue2Q>p%JIUs(q)-NL;g39k- zKhT%5$TbW6Xk{@?;_WvB?Kk}aQ&Ur@>fewdE9#yXw`Q%Oxe9DQ zJvh7YZfvBC+t%MJ!gS_KY;GqDYt2`)yov%}x*7QsyUQp2l)vE>AKw}Mbx))HjXD~n z!TZOt$2dttB-5VVqNe^bW<_xr&8S{Lpa`Cm*7w;o|Ex+ku&sR$8dx)BmAx#XWpb5kfKnhBQBNM_x zF55cdow0mKzIXf4k;o^01`!2`E=Mt{o5c4jJMx*EukiR>o-g~QNn~6t0)`X zu;B>gk8)tY=F{4#?F0AL33ZFB=gydMT|oVlV`KCR+C9YviBdJuL98D5gpKv+TEDyV zl**I4l?ku5O74-9S$>Nq5U(Ahz;IUhk#6 zS>=TGGd2-yU^JOMewkZDK50T22Jb^BskobQ%A{C60vRkoJmncT3OsC!0opbxtv=?g zH0yR@BYqq&pUaFiY{f>*L~ODC7E;?dKiN}_QL4w1VS_%#u0$X=Jwbab3Isno^Q)h2 zT3b`gEf=mYsjO5#z2jaw1~8$$A6fP5lqQTCS)-Kt4Abj&Q#*4>hYgqZpd|z`eb-Wk zl*uSx@*vRDAB6t)TS64K%e0#*ylluKq;C=&X6JsLDaT^TPS?MY()CV}m^p zdAA!poI7=5i{fzFrc#^1)y2i-+mh5L{vra0na;j7N5XR=B<&Xp6Z{jJKc2F3TxE`o zqIaFFa8ZN>5yj37vi9dBD^WgfG!^KD#h~`bSx0}meHR5fctDHli}Lsyba9xW{=D%d zgd?kETKOpIiCm;1-i`Wo%k%gRQJAo)tF^PvHooUivZKCrrY^nT-R&Nx4$ju9q?y zC3KW_n-{$5u9Q?^%E|VN!Xm(+<)4F9WoKlwH%QwP#U96Z_21*MVm(6PjPGhL5U~6< z_ZVurc^wFgP_^=hFQ0yUE$8;g3eu>%X&#m2SJ@V+3EXFDD)D0PR~I=Jnumu65vyKZ z^v-fm_mX^pz)7=BrAP1(ZmCC}MCx*eN&AE0SClEy-HH6_0`>Mw?VG}%utG|OFf>x< z)w@ECu#Izvax4|FAOjGa$eHKf`kTZi0mu9q-@E0!q8KYrm_$L0O^A*5)}+*JbEl7z zIGk{6dz-}r4r0WYZw5uBrKPi9`VS`2vvL=o@15<7iqM9_yNcpsCy^5IASHE#v&3~> z$lhsbN$bms-0RX$aWm@Gctng9ig!CyGEE`^fT3uRt2%?WbY!;#L8#bp$4=Mq<{6d2 
z1<=4No=)=c3axy&Yr+W*1d)J}#DF(nEqDD6gF#$FBnEJW3_W4Dj#oV7icYmt{X6|7 zn#-4&0EIS{wLVL8uQWVYXMfB;TbiDpzErE1BJ^C`6CZH^Ch143lQF~hEsw>@t(mg< zfEu|#@-vyj1JB1U-Gzpp2~uK4V5v*eCnJw>;<1i-_!Mj0I1)SDTLo9RVI%6mpnarl!H zN;w2!o~>td(JHAhI!rR{PuL603+=?P{92a1nQ+5SpM8jD*BJ=(a@%#0qC#Si<4id* zNLbJ1(#K#;Y=xk%;tVQ(e5Nr&eRNuY?{!;*R#so9G=Tuh;zp<_Z)771)}OVuxCsJj zw0TU zLDE)zuN&mB(v6Lo!-B3cn2yq37#+R}ZVa;)R+%d;+ZpI^#BqTHJ7U#+d2CT^|c} z{3br*jkmsAs_ySr4A>qDZ8eNpMJG86pG9{KnmrEekO4Zz{($6ris)O=8Mjc>&Xqmy zWZYy5iiL(HDQ{Q>Nv0L6tn-rNaQoa}d$*JhiEM!|@B0>%OF%*ZrdvVT;%hh{=}UZF zlN*pRuf}aE2OgwtbSIKfJ0GQ7E1x+~+R#MXK%N~^5(Xhn<2Ch)xGX616tW_Q_p=+8WU*bGdPsP)`LN)m7r^QVTwHdz(C6*)HQOjjsOrdL#fehlA zsIXFG+Eb_os$@q#P`e%5fiDAVc{G&Fde!^J4HD!FLO{dw+I)%KuUQ!zOg176ASw<3 z{5%nJhrSaeMS7O;-yt$V_^X&UyxJTY=BE-i-<4~{K|G#%V@dBc06c->6r@9l2km;- z8reXX$zGAnx(q>?&$vZ7*ZIV725az&PCG~K&i468%&K}{9Lt|^M_zYb!skCRcQ!Emq@pVIU4gIq1w-~dP)^=PJ zW{J?5g{MFz$ov|puwJil{|sazZh>NrcD@}kII9^OaoL2m$6-XE5C-5c+9_up8A^Eo zBn2?HLq2M|>zA*#Q6zO{6$$ESkez%$jUNUlPPbM+JxjDY=UFK%bR2-qw}lv0E*-s~ zx2(kl+<;Q*{PB`_vJBcr<~6s*2zO;1*f|Q4(_02!tNpmgl3wvoE#hv34C?od)3c(; zgGoQhH1JIrTrvyW>o#DB3DHpJ4(=SuO;Au|O>Eip?K^gEwrh+(<^&~m@nA!3h@|1S z-j^HRl+h&>XH*YS|7=6lri{aDjx7szl@rYJssJ)-xkFpdtqX89D|-zU_ol`0j)xl- zhobRPLgytjJBr?vCWhayVO0q~nj;=|pnM>@_Bt6^X7w%L!$6cj>DLW#1FrO!e;N-6 z`u}m`pZVA9V|}?ZNMG0K=FCx(U`!4R5@fKT0c?4F{cFZiyl_1#Nr*(>zHn8g^jhA$ zr|Jc2ujSI{|Gm*jLQV}Bsl4Z4^>!Mf^_)RD(#fg0;mbLewai`C1fFJvW%)lDo;XjU zLc{BrE|cm*QZt9}74?HYD9U4YI!yg|&9r;^*<2zTZGukYHCC|AE!>3uQJsOLU$a|i`eGDINZ`m!$u zh|};x->z11x(Sh(dxIIstjHg0k1}N@pVcKFuW@&}eOnPe3D2FW^07XEpuw)PCGF<1=pj%Q8mDiLB}NcWP1PbsD5p02S4)qC#}>V@mKt!fz_Y>rdFr) z7FW9VHS#b}$&Qkq;jJyI{QDD}pZ@zbWHxn|gCX}HUMnCJVNI*oFU8$2LKN$_&NuY; z)%AVAJ+|nz3X24QDvQ=+@z#R$PHN(ly*9df8V)21Jrh%|9rZdx74_i+OVbJ8`4>-I zBMJVFPyTxLTLd>sQj+QTZ?0`@pox$NpP4vZ0Yes{N8b$4SNMjMs)+Ob#TOwR5R9vX z9oB#IFGi>kjEIKb`+JYXHPn1}5X!2bqpV-K5X65CvnEq8e;_qT+Wy2zC zkK4}G$zB4m3xQRuyJ2@r7QD@^o9QjX0kOh~1$s&Bh}%$*1Ur-ukMWsT|{d7JN# z_0+>T^T#N?w#a5qHLjkQ`Uth=YIkXO3wRv%NF`tBh-*B%w$?kKo~+4^o`&B=zjrxY zd|r%2?sF1dwj+?uY7_(CIQAAaix#c^Q$}Z9cJJMNX4b+Qm~U`d3uNg$b-*pK zrRlTH?{mK7@4BB_*sz{jczHN($>VSs`b+dZqr?Nt>YV%+V(m)JRwS>!VEH1gw-TS7 z8Cx|~DPWVSeqcY0j))l6&)%q602_zLEqk4^UvPBvIu$EuE$qY;cM4x!xg8N*wrFqb zte?gRo%i^xXH@1FeZVid$!R4`qGhPi-6eb;F7ir)3B@OKYAoRE8FcMd;0El zGO(w`C(wNwbb#0I0yMG#<%!D1`LEsgl~3VB(`fV|EpBv~ql2-C3uqbhLe-yT(6eX_ zty0j^)2kzE3MQO%A=O?4JN(o&0}EHAcK7EkqL!e=mk^ec+)nVSd{{{FYik->`AiS`*}C6pU@fN^h=Q!}1ri`T@ti`^TktM6iZyW)V7x0QA5C zp>f-(#3@|UW`GAWdUNNWYpFdOH4wuWgwpi=j}zD6l#)9B$2tgQ z>`DUB_#U;ORRl#w+K)IP_pW+*bzx&SI?`k20Bp%n*f4L00w$ zt^)29k=@@LYN_WNpI$Yth5>_k*31V&cm89>+~DHZ%=c-RJOJnVK~=Zmv4M=pZPLJ3 zyoJf^c<1Ty@Zeqk6VyVgb`nrPT|3}D6l}?-qFp+Z)g3|AB@(#{oMhvO122X;i2@G~ zR|2qadsqF&M}H(X5oY<8QEsVsK$+R;>F2)v`@LI0?F?oiNW(i(h#e$&cryW2xBCjK zV7}tA9BaTp2W%vUS4!fdS3J zh~P0-R#r;^5ls}hjl?z8RDbJ!HHNa`CJ=x}W6cG^+pO)5`^KS)Q zNd6E`pzxuYKTsV;{umnT=&N<=3xGxlsJ86iZhCMXQO~IP1{T63>%Icv`>&M=;6*Ae zqnQEVwTIXlg9uGEcsmIc`hmxm*R&9>>O{?=fOzO9Dq7>8Ku44V|Iv|)fEW}ng3#wqpli>56(zjaOT9(WyR!Wa^yoiK`~nz< zwd;;t0F2+d_O*L>JMk34hi%-HHCOG&0j;@A7*zNOpo5^N;2#W; z$zesr18hT%57yv@VdwD5oyOhSx43GWw?!U+xKX}8?C&Ee$kl?TwxlwIs$^;#Qgwfx z83eFV>X|r4-5$4Y2L={QDw2HmkCTX5)oyj>)gq(%)#|pt>p8!VeaD{9CW-KA;D$|A zc7*JM?z-clyWXb_@(;zZl{^43_IXP2cSvkazt~!)D{n6Fac)k0d^8H-aP~DGGWSiYIt)?|IK>&akaSC0{K{6@Zb3 z|M5U>m>|3gUCEj@pFLOAt(goj%=@eXQMmkqXJ;mPw}f5(+_3f3DyJ0<{}6+(>c(#c zbuJI!lsYlkPBat8hh$pMusBaH{g~PpW97M^W{oEozFZZXX}B6t6gW(@kVBTfj_BNi zdncK;ZYPV!wLb0#sb_@(@X_X>=o=sX5cUU^1EW!^?(p1IsmludS=0)0YqHFvv7`4N zMeaz8+}6C&EP$vw+{P_|)buowL7x zc|jeMhuo{l_uEOATCJ#1%M&g%gQspD>XrvX^?tfb_~e$4s-srUsbIsW?J3my-Vf}P 
z;@BTke)zhD-^DzE^BC(b@%RU%RSHx#h$rBirIlUCqlvo~a_auyB)+G#Y^Q3_a^AMl zovhKJLnFHRR-5RtDA%?!*L1~MQ{Gw9TK|gA?(%cPZpvLo_A2FDFtSe}sUFj!UNdbc zb4M^(#Qg}G-RiazQgl>ko!u2_a!Bt6YSFgg?OO&Z{7Qx>tAq(vmShZ+xkkwjv$ijfio)nrn0hARseV8147AQ4O> zLr7@`D|~Rd*>hJsLcGF^91kMK0P?Kf;pFly{lP##ID9(n6sS!EnZ2{Zg4X5k$l$~A z@WNk)(}deT4#j}5-XRl1jsc`(O=AX81f;XWs8E3~Vr4v2#DhBzVX(U}8JNDs-*h9C z4bcNI0CAR=Q3T_!%y4NPHjp9ak0D4RUijku;@`!^(-mlBXjB*cp(YV>zBPg-CJ!T>wQEHu@UmEK`C<7;>r?ej}gmPiF4C?DYouIuTZa5 zdOO?+w!)=8VnrhdYKY95QzgpiVntQ5)KiV^X^fOL??h@nAl5gK_T*S)1K z*j4{Oq=Bq_0jkPXw`_Ew*<05}#MYEd|JqP&~D**{v=sBHV?$H8uIv2~{ zB6{*SRU+mRg~jj79kt?m|DxIlGlvmf%V!n@h!c*B{fGD~z;MiC{b{-q@oU7arUT;d zgVLWFqVNFF%-N4u_$3V>_GgI%6CwC7zq|KX{BJ<k)imWb^;~`;#sd4 zcC)VK?3F=>bRPw!12vny1zfZI&2>PJc!4=Z50OmYBf`IIq7D>C4iv23nveOHf{()6 zoVpfhVMmrEm=G}z!p!|ruo3RC>|&%M*+2^4!`o05GD0fbjJA-jg~ou%d}AiafXA^P zc#WF>hIf2?H%jGO3Q;$A*>bVTLw@sHCT&S8V$U01feO(Ya{#kPYwv^vDm+j zE{1&Pu1$O)wm7}a&dFu)G%wfD}& zU=!cUO<&ChUZjR_*pO?lPz(^561WVLE|E^W-ca_J-wDWo^>@JfZ~vH;;;vhT+xdIx z2Iu2GJOl>4NhLn z5uL!kWg-aecHy~IyPYQj!b$&d-G!$)jxd{Jn5y!s=*D5M34%Hvg8B|cxDQuYWe`o! zqS?QFzh!E0)b|CzPGZX6IjMhN5C}ViV8@050x9MHh3Or_@XGks zGl>Xs97_aQ%xj=A!*(7cfL6cT%Lk?KPr+UiR?CQs9|q6_c(Oc_UPytOUID|Gl{>EW z8+Sx6?M!6o#62Z#DXF1Hb80;BZ<}2%ZtW}ddNS|XJU@hu1O+b<;wgrcNnD5BMKrw9 zHYXU&UYtf|kt6&C$Q3F4#j+L{oIhjBGc?g`aRdA}EpHhPW<4u4?P*v^@)*QM_r+t@ zt=aG5o|&xoaOqGCP6&|->{L6-MIs(3G462Q8qaVdDEO|WuBL5a8btslgedRFKX;Lu%$i5xg}EhB5LrqgdAX`#dBNnhMXf#~d3)9HddzuJjIvrHh@W;kK=dt|w@-%wwUGWKDG z(Md>pxT3b|nc3WIMqTH7X*LIYk%B%~pZ>uhpK*ZiMojY;RLH3}+eKs-oND=o>N4-T z?Ts_rG&C#CNL+>+Mxhi!&>{G6+5v9HmcqLDx#}gIK z?eJJ-DaP2ge8L(wzr0OC?s6ZMZCrvvGR$ysk_85~8INgiapn4do)mbqz>?k~sl9lZ z^d8>@yVcAK=BFL=2!iB^nNcc-Pu&vx^O(;ikbUEJ4=oPc3_3)O;(qj=iYAuiG3OTS z2(#Z;x?m#^Q#*Q&Mp}WT73Yo?CW|wB7<^|dgfLnf zsp)Rnxy1JYVdV`CsOjOR^F(J9$&N0Wu(83%VGz6SyPitC+b7eGzNpXddxC@o)v$Zl zrDy!wUsG11a<0vPY}ug|o~ZjD&*d9i9DNz8(PIQSe_i$VZO28OA_O9&Sq>R z)^6Mpc6wC=_+TS6By0+_o-o?vVT~^bZ{tQ8+pcvEFc37=go!@%eNAE0Nh~AxsgLM% zl&Ku{NPf`}@3N6}HFdHK2U2Q&Pb8}qR~743hI0e@?ZA};NRNvdkE5YLXmi_ZxmcIS zbHzhva^=`)P_SVq@y$@OT!N-{1a&9{B~4OBdU7%;eNqbwPK`(tuUb%*&C=ALcTslB9w!qE zFWFDm;9reurTnE!0fupP@%47hu&vXU0VAgtqGIt|ki|N4!~=V~y(K{yE2t=aSwJz| zvNTFDu0{n*CLAmR2je;BnVO9uCV~);x_?-!U+Eo|P)m2={WSd~PdB-Siu*{ucKf^B zg>g}A4_Xuv4=8o^T43Q?Qh(qQrgwAY=$?1LuA+gFPAlbIVv>@7>G~^oKEu z-ce`gUj?kikf8+(I2-{SorFgJ!6>5T&FNWsHN0YOtsn;+RH=h762y~RlLsC#^W^y0 zb>H_$Q`LZ>LR&Zd(*ty-Jx@Niae(ZY@*I@espZzV^eBX~NFwr&H$X|RM_~ihSY;)W zVqZNQ|GdXWAmdt%mr2_IP&%ZPRPCYtQ!}4`GQdKxozNREQ%cw}_b630KRf*1z$GPH ze@Ckf^~P07=d?w2#e@Y2q#E-;-uH6T+4%epl`#K#kj;aEvk^KP+*`^lF`n^9`yX_! zCHuYmN}H2$Zuj+Tn6kRwXD><241~UN?T9oPcPExRGQ}^yASj~dru!OtWT9eafVbP? 
z4uE61n6&f|-^mNUI@G;fxDw2^766_Ep3=S%cWahI;B~BDQzz7JfAV%G%7I4D`QCxeM239)*(J`HH0hUgL-=w;3)J zQp&Mr1HB$Ams{>tMxpM|&M97M`|-ind)%+ib5X8P3Wu%F{L!-CC3ogK>&fMeZte<# zCa0zMtu8(bFRr4lAC@BJs=m0q4DW!sfGX1W5QO3~+uv0}2+XKyac>|sDN%UbH|swC zcsKbh`)bH1nZjkD?JKhDSm>84Z4aFbVo1axd*+zQTTn^dV&kccAl@mQPv5DbG6O1L zWEHqHl)6^92yV&KMW6WAtJ~G`9U4$sv)Bd%Nr@Hu+xOG?9gUGrudLuU8hPlo?xBZm zP{Vl910@y*-jL$O4zTvh6eB_8=U(m!V_jwIpNH)r-#|SrilS3YeLT(I76#7(ChED> zbI6;5o*Q;;fAz^E3TUgX4a^TYR5$sNO^yPYnVp3t(f3@=>k25IGiPP}*gDiyW$-+k z$!v|ClDFMqQX`Z#lsa#**i|Bw?Wx}%_w^@!0gUUOG9u9lD-85+Ym{Ju*_xB%`6d^~ z5UgVG19XeADumSAD}5Y(2KmZPpLOo&@0?ZYG+Z`ZEwEaAL?NR4IVtqu2}xsP=AvS5 z>0lTV^f=sYa^c&1`5iBRG73Cl24xafFMfkdMh4nNqQc;MPIjF*m&lqcdntOkbf{OX z9%iqD0EK=rV0>G(zZYd?@*qQTih$+Gog*s};5#^XD~gkEDP8HH#R}ah|mY)pl%-7Uq3_y7@tC#ieTDrD(vw zow|!ISczrh%4YS8v#`&q7;A-}&N@km=)3r3r>x|A+Ka?*QBzHbwJrypm%E);OPBad zfL^`@?q4!p&DgDE#Dp6>GGcS_I~ZST9c zw?FYP3(>SIir)&HvQ@Zw_VQC@`&?K2?SrXG z&2YK26?y+V;WfM8@jV*sN;NfL?;FCmUffpn@3f`J44?@b1l^f|BWYd)-&vKLY2E1C zm>`jFPv{}B6YtECX1O;}@IBibLPxAPWaz#&hqK>5z9liiWA~^|=5GjVwypQ<0!o7| zyam<&C*D0VHP3=W4n~KTgUgM@ujLaVtqi6GompAxOZA>2jE1eD)+Ci@>h|tSJY+7@ zN+-vHOYJsEh1s?4%>(U#HIU#3*k zceFU{;F+U(tS5qR~l z;~uvKQ~s!$)bJp!pdEfg+y%FCFLQsx!6wW8W)lz7>Utds5! z<1JCTt=om{wiuXxG#3j2L zUN%#Xy#A+W?m3C_Pu{jj;vOe|Re$#jkX0+WibZ1rweAXhIkd;&oqb`i#ekiJn zPp6=QW?hTUjLz2oxM}P&Ple4%#QbWTBp!WFp$s6W*V}#5U##UQgz|27a7@EJwyV3G zcXkDd4&r>=^o>CW(92ySh_xgXt{&Gq6GRiR2&y&%B~lk&UK9iCrXZ_>-0i6KiapT*Xm^|_+WV)R$~D4z8E6!I>!oc=OvEe!fyk?@@0J+p%0}%3l>trT%p4;aU^a z_9v1%3#bp7s@ag^oYj^eW-s-$d5=*;9j)%q5TxGYQ)spGaKtoHL!BIQ?8b@4; zi$z;U0RDv!uUON%cEmK1N2A|atMg+KUJUrvji<2PC6k@_)gLIJCI0g+!1M3BL6eF1 zCV2%-=sVthGqSjzUkk(&PL|9eyV5y94ao;U%{!ue^91x6mlNkWt-vl-b9% zc3+K!sB_+{^l+8i?%Q&+UU}DggGM3b2z&QVIEqvkcMtYd_001wszOS<({a^{NO<^{ z8cTH(a+aLuz^vM6U7&azU#F!hbfS35EqN@@VI^n#L|plj{ZGd8=*00g;(je z^JVXvF!b?$@h*8g$d0yHD+pU>91Jj&QKxVKtBo;*t9C7>P2*JxN2HZv=UYpYR_?z;VERxEaTz2p?cnC1sR88SC3xL!St%q(mT-%$)`+YP%2 zl$}bFryqb#I}4ns+I){fGBT_ZQf5~$Z80vNORYdBx}mOCdv{6YYhf3;5WO0&@sa5R z5+UC0-_bft*k3XAu$dT~YLCLAa!_Lh9-1vVAsYPwlgbrUEl>%#0{;5EcnzTLa{nDI5&92=6Cg0YH!J{ zm@LLX%X}lleT?*%joKhKYm|m}!`r`%1N~)Wj>}jgQV{W!Hc4Yc{}r6*?I6}J zXdXeY>grY%AZ-`ov>X`BJl3sy7t(r^*)KQ?NoE<$kJc>71*Um$21>hm%b&&t@4j(% z?!hIQ;r3QZf24Qn+;pyU-?i>Csj8(ib3Q6Y-={Wyg$q*PgN(&*Cy0Mzj zGW9ex-Y;mwH9$qw14HmxE!`-O0cB%2>hYMK$5*2kV|hB4tusEljn?lK1E5B4FtDi0 z4D<81n%Z*On=DGx9H)GKA4qo_NN1%~XM8e0-`7kgp5D+B*YJQ^I4x24>_Y00&-RJF z@%b_Y5)km%qb5jN!fOG{@yNomE0gb{4e~kIOxMMt5T}ODy=Cr* zCcve%!V4d7|FB;k`+n@MbtWy1Bo4)h6iK1Zg2xrd{g6<)uP&t>XlMm-QM2;=$TeaW ze&2K-L_GLXmf9f^A#s^&C+?e6EzVdvCFW9nMKavrl$o&q3(S9`8Vi>kUFf~7j3#(v zta>a-TuiQQefitlL{%)U|Jo03UfjdYx}hW6ag!@w(cOqrSS$WYMAKDzp;7%Ls7-M~ zfpbu!yZYhy3k9<>pZURKLH&D$4B7l~TXrDSIN-+U@|2v-U}=RXUtHAoIGTJd^wZXL zC{41$B9dZ|Z2kVks)LZIN0&IEqEAZv(9_9NDWB8Xtg5*Tq_m`l<$8T%Z8EDFn z?CS5|PjR}_=bXfhs)KZP+~;;Qqs+S#`qe7BzS3;x!-Dn7@q1S~1v#Z@T`jda=ZWT2 zP^o2+F0l%-SL2Qm7lf$FXY5?*lC~9u zlYGj}L*xV<`IuAsSCmKnEjXkHSca zzvu{+@wlEydFM!uwO7~I{c2gIm;Sk$?pCQj=>vWPwc`IV-F=S33k^Kfb@FQW1%{Em~<{;V62>O>7{$f@JLkojcizpN^-W+s@*N!Mj5L#RPlYz|oyG z-Sxq&mLJDnB1l;EARG8f2Bio5c&y)j`$QTPCSLXd_%81kmgjQ#8pU&z#>GHz-7}JS z>p%Hhbh%{aAlcidXaM;}Hv;sx5H)DTgTW7ekAB7=RYTdW}YHs8p*(u8NnNqLdi#I88lq|f1OTvWWT#{j;43czHe5DKm>G1HCw-Pi* zugt|loJP|%a21b+9qK^W^c?@tGb%Az)21~0-dxeNPR(OH^dq5jZLTi}}P}q1Qq$((Iz@ z)BD$ZbrErKb^RLO2bgeW7E>R`%>}P+FEauj!_5V$l@N;QB2PPbiffR+O3ZL(;jf26 z&&Hg`o7kVN=n!NC#~^yFmtCiiKy)Mk(Q!Js!$RNVD@{|+1DFJF)S4xSW&1=O`Z!H9 zU2D0QOEw>xlq+Hq%a*ss9w(gN?e3!(@E}}dhFot9CChd>ij!4XZ{55iNcvmx?mf## z>p4s~4wHhD-O2rhJ+oh3VouRb8xpmb2D2Xerh>vo#4ZU8FlU!LG-0;ja4?0tpd?0x 
z$A*Gsxf7sR)?*cS$MjM~hYseWm1tZ%R!BxMo{xk)z)R>mCgpvuEjmEo}Fip3&Yvb5L60|lw7a32*4+Q1^WB}8SnaXja(*tvbQmqr= zDI&4@ww}Uo6Q_D6=+8OBZ?9);J`_|^VuZPPMO#~!&9EsCE%>B*fWDDN#>@s}G-J8I z70~Z6zhdn`G*s+X%2rCO`hMnOENdG}mTJ=mr!4_FFN3x&t2nGU9VMS5or^9jN)R-y z!rKmC+5I_y9In~b&)+`_a6HHmpYeD6UF<_}lJ;o4a=e+pjgF=uh!S_!l%*`DjM?U0 zx4|Rx--?$FL+!$Go*o-)d?(5Mpd4g~Jna3bC)B>An{3=zn|o{}bo^YHbhQ5*)5wKu0#50uKL(~E zYfl}Ohd4VmFmjDL4CTGUdnPZ3dR2LNTyjrh-IEi_GXG)FQ9}E(8C_dEy`tnr>v-q) zP3LLu@Q=j2t~!*)b$pG>#c9S0U9C4$^>PakFFj=DGIv!Bn_t~Av($C7Hm?99LBY?M z`PWhE#~5okgBfnBlmVr1chfH~V~N(dP|P#)iEQebm5M}s(ye43PCYl5?3hNWtE-!P zn0v~c?Iza+nq4IsuYLOP{i=laH}f92Q3+X=Dk~dbkUMh=6suAc2gLSpzb+3Fm}cz_ zK`CE_!b3pzQmo5T5GT#AnA#{+ps-p=hkw5Rf|@0WqdHdH$WUgg-cR-xS;FU!^x$6 z7x)~9fBWt{S#+;%GA<9w*>UYiTR64L^_lS$>y>my`Sh^Is}z5u9p;?Ea-4YL4YpR+ zAN+P&-=_;pf)rJ%v%t7ETrzC|?RU{EZYIxDNlBAtVzJpi=8BenBF z-TdC0Wp>BI(I;HlV)yTO_nP8Ixta{G&9sc?lhdJSliKbD=D5AzIPH%P$VzJT zYa6}NYPBuO{P>n(FNg`xjuS1``ZM0Y-y)aJ#r-)pH0-WDgZB*<;Zl~D+QMh0oM+!| zgIOALmbG~7rN5GJnyn&!ii6qRg=3yAq!8nm_|9sQo#$kB;q7%^;QzB=ohQHBKkR}+-4jMea( z&J#t!Us}@LTP&m;QF4C^Z@7-X2z|DqruBvn)dms$~*zK8g<~$@H1kft@~9x zhj9KL@XCNf0#B+Ffr3hsx}~+*CO_;)^T40XwESD9xShT>AG`i>OUHuZ+`QX0BJ6rlROIyhpk(zcTir^#)cv!N%NLfzIVt+yV0dnBVPkbVzI> zrAj?DzF5!k$h{r^tBYFWSH0>9OgXAif0L_G@2%>w1vn~{m(bTv8xubM?7v9pUJg60^XbA+A`|t`Pb->=npKVAPDAwx1QEZc$jPO^;k0|&;3(>uu;q!2 z)3^H;Rws4h3G_Rf4VJ(p1fl; z-Ue*<cW-|{G526d z*m8_3vhoQ3pmECSRN$7Py=yrBBuc7;A z-pBuYE8vr#toRRMo`J3|<>zZv@UmL)E`UP0_riv!}Pt>K+u^7|sp zG)BmCbM=dHkk8UgffIyQSgcEfWwA*SsH%)IH$23EWo@4^Wpi{h@T=e=--}lp$TbpP z<(!rtG64RUb9&LIEs5E!pe(`X?1iruRn6gnp1YnbG{`^3Hax!8cyTAj&DD7#H>LKC zNxkKJQ7*LZ>7M}y9nq1Uy2PGLt!X7$Twd}mU?_9{nK#Y=T1d#@KAh)NEXjY7(z8cP z{f@&ol$;x8w?b>PHjgV#19kA+mSats2s=u9Kv$`b(wo#bjwej;JZebj6zd+8Z#Ng!`UwrV6rD=;iIHy8kkA3>h zj>rvFk};-7V6RgQS1=tMVyfWt?lUY=*6Nm>wz{!vFjwU5N07t}FPtsr8Z$W0D^x z#_FW$P>ec0;ng1z{k#29BiiAwu~U#=!&{NKOYlG1d&{UQyPyH|03wQkfdYb{C?Y5! zoq_^VN|#ECfONNn1xiYHDIqP=AR!1ycODKQ-O>$ro&&G%`>p%`uJzsZ@dr3(KYP!f zJ$rhdVGWn!F$=EtChvBTd_mw_Wi;^JerIDb-NoPE|9Ah8rh;kNE(be@nJxqcbHGmk zyRuVU^>>eGvTNXdlBaE~E0o@;<3FTlay(UR>kcEQZKu&%PI=nuGUAKiD=BTeP$CnovU7T0f!pEn3$ zU_5;y>w}oMi6TToQv8eZ5Cf;(RQN4R6K46zd~~;h_cyX$(#h*`=FJu}4Jn&MJHJcf zyWAeQa|4?>k$6M6n&88Gj7fdpmO^W*H1Ed4;>N|~5Z1M3m%Zir;svE@)6z}d`sy2k zw-On7FnMv-GhN&}&jH)%`$HRxBbS54NEn{{G%wy#l5fpVNdspmx6IB+NH0eOM$1P! 
zRuBB5?+XOMPd&p@=9sgg@}xK>*mfvAo9F{BsavO7uXBHa`RN)|nnsy3a(?@>-|d$v z_7u>LM_Nk0WW8c7r`Hczv^Z;Z4N?hvA0qxbM(|Zw*gsnzl%CVDDI&Vf z^(}?G@@bLP)b9FVrq`g;I%X!LID=e(qNOGAQ3Q{GJwzNMOJ5wZ?)1A*H4)n1KrLsM z_Od42tBBsSlHxEz#LA^m*wDN9a_`4F#W=A}MlJ6jpT;M*^t^FNxg^0>Q0pV_ODM)O zs}5#V>s#v6H!zB2RiT+9R%gVWfK1iM)7#)xIoK^(Mh!J-~wIrnM>Q z@A-2pX~U-T1k?AHZ;ij-5Zx^p4>v4YU&eXlMb6m-ZzxT)F4b?pv37w>R7h>o^*EE= z)&mc*$Kamp^YW+&2Vzof!@B40=DPHLP?LP>tQftG`4GkQ;eBqzw>wc*L$}kP=4Y@P z#>gi=%tFr@#lJF$cGnD*24&zVSCW6jd z6Q@J}d^ercJe!>ULQEszaVVy#{nhages5}4WvfNVVsxDL<1r?wjCbv3w`qf@IGz+S zkhjAH^IyJmq8uFyHsV<9=9^MFrq-TTo>{g zV;R;KXR3Dy>O6edb1j_vY^Hy8$wM&Y!8Waw=Q`2qMZY4Cw9kVLfPiF!Kx(M2=(}=Z zM%=6+p-!{e#g}SOxx?oEN^Fat^2$1M@dvT0WMhQ6YFNt&d-(3sn(kYi&3aN~&y@Uo zDk;W!sW!1|zc1G?z~6s*=CbrrHykLJ?1`{0E$iQswtjiuHndkUYNy^=Lc-&lYTpwH z3BnTG%B7VVe$9##_gb|@;#&>SC3j~=fB5IMKXZHQU2wZ;RQH_Uw}1%eJ~P{{F4`&+ zd1rJ9?>3!l9D~#IZS#@p+}%&W6U-BN*j7OK5?Jk!uM53IKIz(|EM82)MDj|$l@gcY ztFMOilCQ$}WdjGfyN=Ss1E*@xO^YWMr6PmRbaiV8r=7Y$HkcSlxi8UqQv@VGIO$ZR zx4iAIKO5?-+xQdQ{~5j9*K{aiOh1Oa?+B#iE?Ru`x^iJ+)=qXaopsHhp3-G!5&y)+ zn@-#p^-B-c~>q65t)le}Ttvr3V{fWlBmrQMSi#ub&vw&KwqIN6sDu*_`pHZ@v zCi4G=jx_&Xl5f+Zc9_ah&JY`3@S)=;&x$o6UgkDb&^(*6Gp# zArpma@TR$Q{10!MMKWInHKrs);T|^8KF*yIGRJwU4;U!dg}JNc;O1P7=gZ`naeo%k zc$=E0P!XIusS?wDQsfT$56jM;t>dz7HY{{hH(YP`@Nu60?#$my`odcwjn?l?hUaC> ziiXVmaGRQbP1-5%(}*98oxObJjh#RX_DnxH6^ou2|sIQ-mTje$QzyZf~EFP zt$CJErO0&vg{em}@_UGa`9jHh>DF{=ogF&QSX^A3Rw~kLEd=ttDn^o4skY;50e$#2 zb+&1>UsAtuHJGoP@w!Nq4F^1P8IYXLF?xvP@>;CGALvxzCR)U&Wy%F?ch+=UFjpx8-If`@hk?VNJLbxVd2FIOP$0xgnsI5qy6`BIZJ z_9Sfk%?a2!S{RkYaY#fQk6}EWcFgE(A{<2)hgu5igX+Onf4-~IWAg+F^sACBOwC=T z4#Ma4b!JV^UM8cE;J}=n$`d9jUR5-FK9bq2VtMcE zwZ8!vVD>qC*BlEAi|xO(g`1ksr018ec0H}LTNo&TgAnzhkR5i!5j^j#C3F;^=d}9z zzC(#EKC||AL&gi;RSjbs`K*;4@ zhm^U=qn=Muo{)te4!@7bPU!}EM9znN=h zlLzo_AqmTpK;2K%{Bar6SJP)&;)I~!YVzUASM-+GMm9^+fP8^%Ln!5ZA#KQR}fbc(0TwBg!WyiBEP_xw>xOmw%6kGn_u=EZJ!N0lg*Q@j@|D{DYN z0OE1#%g1xA>+gSLQFr{GTmb%UUnlzOx}PlUA;{PewV6&E+W%03Luo?%9#V5dV_;N{ z1BsrgBVG7{gVaaW&vTBS)rqvtF#&T+HS+nW+s%-Hz34i@sMCIsSuI$z z` z8DtJR{^Doqpsa+_th`TgpT2fAVy;swu*PZ0{}?KX5N#!| zu&{(hYFqbnrJ#=7D2|o3Q0HsZe0-SZ%iW@z9}FDDFCd7zc;-&q=y;r0$!_E6GULZI zis|fw&$d>kl9Q8lvmXmDEiD0)D&3yv4bn!zI|xi26B9>2zPGb0>Q>}1Qn#Os$X|2W z+=@0G=quaY6+$cRcgPrCu%=Rp`v)*^fvlW_5|`0Yq%vpar@*tHphj&Rp?NQ6t*-`A z&fn+f*nSiEe7M9x$H7%6U6_+noFESA;#MhfiB~M^wQV94guA*)SyJNvv+oPD+^0VNFWgJZ5ggsqEJb7y_@+<{S0+IeJv`o z3QwQ*Y9^Z+@yd_U;v??M5E$Cl)&_-4O5(35KSR=OK|v@~1V$~Meoyu%Vgir)yvR?v z!2(wHrN9;l|4^3{!uCLlLAV5}|Wb2wnn5N&7WvoA8_-&mynokvSq}0Qo zDn3?ys;mR)Cv~QZCXM-_eD)Sko;;C}`S@NzN$CbHt;=E+Nw_dBs_7$W`@b7<1vDc1 z`S|ckc0M}^9mN+`KSQw`@pU8F?@gvJq0;k%(wTIS=dp0gQ7RzRw1O<+ehjX+bVi&`Y z#|Cn@VANj{aHiW3DyyQRqO2Se7`U=I5u;&07WAYFJb($mJCqh;H{MDH+u3d1dfbhf z&9s*B36$FMB4e$qt3!%NIT;H_{X04o42($0QsV`QMp`K4_H--KPq;?^^pRTpbDDu` zr}X_!&YXf!`fB8f&lKZ8Z~!AaEDyzfE$;pNnPDZB)XgPviiUd=Irfc z#PokdCUT!qUb1{rhI+|Xe%Wq%mpM-Pb9O9F6-$IX@OmZnOG>X5+N#PaDc1Ib)bt~z7bl|FCv@@8W5o9@_R~W$3ee!;V4_n zpXGz^)*{XkJ7u~2G>eI~%a(qd%HT(T?)>@owzl8De+Sm5W)vB8rYQEg9YIy!L(Wau z`xH_)H|dAm*OHV9{<r8uos(|-^txJhR;bM32s8X-7%~}1Nj-EM=Q*W*#b#-n; zRMEk~9Gc~*u_dsooj`czoH~y3dV|!pi6>_foTm7qDyLn^1s1&nJ(P7=&Dw2QUa&`t z{A|Q1DC=yF&*AOwc1LH1EDU~MYU%QI>C58C{`5sDEP0rGi5R$n%wXX}bOfyN{MVVW zB{y;tmC~1m*pdQl^(4=Fzoopqd~%i+8!%^nQtoy>dP4^9;`=yo{mGQPa6we#tLWUgCl9=6W&nYK0{8qgh7 zj|PuJkg|F!$8{9*X)LEssHWt1$gzMrLd-H$&Y|=q3#}i;+~w!zR}(e?_k?GaMYo!* z{$&!B)nn^LoJ`kH%Nr%XO-`GwgDpYuzBb-kCJI}7DIy}0kdV+AdPf`lq|_Gc7~7|} zgSY34rDqCI$exmTx$Jku!Iw5)rK%E zy)Ri>6o7o5AAbu|Nr;I9;mGWlo{zD>dl&Y0Hvb2uy#Z%j3lXs1gZkgQR8%o=BJuIp z$Cg?}sdi^H_EWH>6j-?{;T;tjnFd586aqwK_3m#?Yjb6#kiCn)#pY;$^xdUj*AA9I 
zwCLK($`R0?h}b!-ycFdGP5+i^-nhHF(muF5Z(C>g<261o=)P(weMB_x6^~2gpmo#5 zN`qiOtc<2|-WXo(*2wegyjX)cD@4v6KNnZ3E zLx==w_?Ria-HALGucoS1cd)pZZYRra<rg*S zCc4dZsZ9-s3yQ=6~rJ(MArT z&^E0^j>j;Q%Vs?ox1jCrj9v)oQ_k%A<8py-2=5JxE{DnPTkR94Y}xg(DoWA^98{zp z5I57lqcuc)&TfKzpZR^Mu32EeAA?Z?-XO%Ri__0>^xDD{NeG>*THMwJpi+l>-lr~W zGSO+?K5_B(pBZ))vRnBX)zuG>QX@QU3H6R+)C^VcibGOIM@M{od|%%aY#CDQfl~EA zx`v)klk4Bh<9znbj$=%cSJ}=|Jy^mX9)$m6xG&r_P2v9xAD;OBr{YB2MucgpxRewl zBV*UUEYrYfMHLv#(W<`;#&&;qok^`ofy@Os7^v~~kcIOfS>}zzamow+y`A%*E`o0~ zrf$QX%2`Z8g2#EM04qeWb?6vyA!e4Wg|j49Kali%;s4741}ctZ;*_S2*=!R z^=)vU-5g0vO9M1t=Cyf2ehvkCZ0p^0AsM_wX5~N#H=S+x?;9+Nr&Ch9(jJeDkKdrC zm5zQWB{fiFoeDRbxv1`S)X~Q`6An++x1631YFwZ!UKuLVhj>o5#gh1e8k3lq=wNUE zLG0eWdvbDZ;5NuPP3YRH@hIn4pt^QH{|FUrprnv!MMgcO1}9t~ zXj(2VE@$a(-IA1$0FylvnIMk+Ns4#2j0B>Pu)(0595w@^0;u@I;R1(bIX+p+qf8L6 zW@oGQTReaL{D7!4_wU~aw{j5=H@7{e$odHZetF9!hE7q=1k&q7;QT{DL3o=aGEPY7 z-6ir2cSSkW98|&oI~l%PAG~xJ%55YiB|W{oi2dB$+4ZB`kCPoQmb8?W<9?W!! z(?r$4fS#R0^`mRYV*484rZH|6>j?W2Kb2{U#4bO?~cSM~$ z7gynp0tL7_wC%2oo~Qg@)p(HwgopEJS9?LK|NCDnQ|$tK8zZTirBC2;a@7+#@t+(_$S7`ThwNzzSv%MF$OL_D!@h_j3iN72sJAWGhqndv&e?zzWz);yC;?-x{K8G% zFzgA*etrO|0caJw5I&=}Xk_nBQ!QL7n#pMAVMIyY;iYM?r=lcw79^v& zOxM+%O7tM#l819~mkyb6T;SmvjPH;&d|-{6u*Nz$W@|dFiGo!ZyMx%P+VjuP`E6|2 z(X7w+hl&p5aStH>sWMYxuIY-ikxEQn<1LctSk5tvv+#)y%(#;Ys-Xt;HZq>J1 z=oQelHIGR3)mjBp8TJ4%gO>N1%{Pa! zapLd0}H*Pf_^SBx!b^+v^YVbE* z`-zdOL7gU;rB}4e7!4=R*turB9iz;%B>O4h{~{Nij(=3p0v4ttGt64SF?8%e!mqyD!U{TR)3tf`{|d3@-gl z#Qyr95#M)r7cPJEKOLw<5xCdQ$-nrk_1#SmDXzu0*)fIdqw=<?Fk59Ix%Tc1flqKL4@?azkZEK9$4;idtD*HrPR`7==v%iaTV83% z2n{6ep%WrumJFWa{7hc?y`R)TVOkz-*K(~<#MAeamfV# z0=L7;A|CtuXU^}}Swr>*7 zT64Qy`aY*yVepJp;)`v^&}TL*##4Ay8|CwuVkEdOD!W4KvP6rZcCnh=lacVt=@MJM z^O~`diKit*;_p}*={}Gkd|ugtALLiVvMaPs<>)lsp2%(4z_R<+O8W|}!vg7opt>&c)L8l7-w% zC*z&1#0V3Is$1%^M*M;_kLmmTO-??STEMiyqyyb$d#j5xL{5`JWjpm{sxX3mvgAA# zwWlSddB+xSdi;)E`IArCH|r@9|9N10V12L4bXRGok0K&cO#IdD$CS7hI$_TDi|%~9 zT4H;D!J)fYNlEEsLLXWOIDAAtG#XCV}bN_sU`^?8L9I}6R6u7cZR9<;O-OJsrNfJM0vVi-49GEY zoP=Wva74qH?XvytMMFy?7HDC#^vwR)ua7!g$y3#+?DcWZvQ47BA)?>iF5CGo#oxcb z1o!hZN!(-Za8V-V?oE+&l`g%@&3rFLyIx$jT|D0N9NM^y{#rUCNH&Q~fG*obyR3-1 z5SQ&C-;a%rnYwJFtAdZ5IgRX_=1p6T&)vPA)|zW)r))yC1E7!NM5b7OXjy` zGJQ2%*0N*No;-2bnXJSA$o34*u&blkvp=9?G%q-fb|fGu;>=o%DzFrtoC7^*La z!`1mdt(fWIb4pDwreM3qBjd!C+eLS@9HuRu<0j7$rVXzRr?=z#UYtgn@$C+nUdn{*rvclG^ zf@TMnHHk$enRppIaXn(5UgWCTt8v+>QKNOzS+FOGDVXSB*`W_*Ix#adBhoEcx`JOf zLf~F@kw{IJc#dZ($8M57FTN*#Z)!v&M39}~_U)4z0%J)Y`#UnR+w*rhY>9T1Wser! zx!UqdyU)~~@8!k-LFt~rNH;A+`4;R$W_k3dFJv{m_&!=^ab!#j_vGVNv^!ypK8d;@ z8Q)HM2~oeHPooc)IZ0>~?X1%l+5Vb9btV=6(ooreYp^&+=- zHdeKL`K&ggg?#Vs2xigXyd6327~@=Bc%vMpXt_i--ZL9`-u6`(4-9XuI8M~RT{tN! z)IG4wIAFa*Cen?6<{~CE_T@~B$>5gf(j7jB)GwFLJvI2ezfy$vuHy`O^!n2w@6O6} zBK$71ar+R8y}m(UL2#lA_LKYli+;R}MU$U;L$pYaorw)-=Hm2t=z5j)YIBx$ZJ5o~ zAPNn^7Ry=Cm`19YoXwy6G%Cc$;W4wP+ms6?<<4&lWnXf|C?v0~6 z8{ghMKu(Y_Gc&j|&KVLCvQRMMQ&m;^I-#iIr}(iBjK#CxN+uX6 z_xki2v%+4(K50{i9Se2qX z<+F3#t7)B`oe%`)hiRPGi(wj<-EJ2(D_K`COzwtHiCdvkMhY5AwbWr=r(2n1B%8mOr;6y0={nT3gSbJ@>px_b0l)&(?V^+F-B z%=LdRWqv_HK~a&zTCX19A2O@PnwJ0b{~R30IX^@l|+!XkYCEVBN6g5T8Gc#f>u?!nGli1Sp)-6cO? z;sd6KLr)j2EwCKc>@^s!&FTgl!><=Y>BN%bH4J3O?9Rf*^!R5UFQPRzgETgg%^8WK zhQIosmJx4uUdvWiR&LDd^@qn!{C&)Kf5SI3BjfIRVYBz(USH;JpEu{gzmJXgJiISK z82J95-SF=3EWu%H48=IDYStHzbr3L*Qb#b&P??-5l-MM zlatJj6OoopeSLj2A*$!foaPKQw&!ijihKM~6&^@7*ty}#2EiQxYf{ku z@7$PT+syRzl1ULJkHOO2In%*17nlE>9~liy-FPsy;8nP2J|W*nX%=Td{G@amTzE=^ zh%1~FP~EQ6(J>R8R69H{jlD6A(peoQPYqk1D9-Q0r>eT}&JY9fNwC!Dib)VQ5d1?E zwn%JtXN<{LqijZPVXJX)r}5bg&M}YU%z`vXR$QiC)ve#%<2_eFHMFe!M*`W_Gvt3? 
z3d;1-fNy?FG3pr?1X5js-&V-z`agE(h4Td~Q*I0xA{JCw0;^WJA?pPh3Jov>i{XDJHsKRD+&ME<>|0M;!sI9Mpu>=U^@%ph8gzl5d*|!rE_^`M({%Y-DH?Hfg~X0 z)?b8x)3agq2o@EAs$gMkxdlL^~JOhh9w%~zJ5_g<=I=IzjTgvLZn=I{QpszpmgMjMxKO_!2d;vS`XZ>e~ z&~XmJDBc<_TkSExN=oUYw;D+qbW*IZuLn<<#(&_b zgs#){75@;a8ic|C(Ly_0O}q;LbP*Vo|DE|5%zXLp%$=FetB*J_fIP^o3l0e>1A4)V ze4stJF1$zJLdyL;!3)Z&=0F}lGZ(#DX^u_FC)@3k3aL0J<~-*RG>W5|*_>e2@$bPH6Ac*DFX$&5VtWAt^$Z*a_73r%#?h|6fF=5guIC3{t?A(?7V% zRL=xn5oO)=ST=Rn?&m+b0K}KMEuP!kZvt0)4l4dz(C-n}pDHXtc;v^yfwL|H=g*ux zSEtim>*&e&szUYUVpVEtD)3GzViB;LHxhu*M~nC65OH(B)@JLVv0@yvm26rCk=pIG zAyLvc?=bIM83oU}YxRiLL>|xbYdtbr?;yzQC?bJBpgV#> zdx+%9j87*}p;2~|MYG@P=OKi5)8n+uToY@P(_XD!q`Z|HN~_g7?A0xBO{c6ou5_F4 zjbpYw89|g!CN~#;pFCHQKHQ3F<}+`$rF0y11a;7CVFF~nNOmU9m4`RV6`JK^mQRR; z$k?6AC(^j!Y-{&*87-U5R7v4Dz>t=iNuq|gNiXjBddY6(*Bke;t@Xv03nHzLo1I&o z_P2eWNq$OUEEsytP|SR;8(!il|c_Qn>+Vr57}%Dq5IF=kI-bY+omRf$E8S1BGJ-w>inNri=h7-sPPt zHQFdjkyhksvx*NQ%ECHo=2aI+RL$E-@0C=Q=cgUE-QjCe-5wfCee7RE35>{X^UcS?(m3?`TvYS@D|GnOJJ44R=OLNDD z%=@fK4)6}cnkg%uRfm#J1LHM(=-jHF$r;gjheR9`B%j%xuUPrx4S{1Y&AlbtPLq6$-;tltJvMj_y4gcJ^dFkc^%d2Ub6pa^Y~?j<8rFR_ zKM4M^Y9&*>ctshyhHGiXLEHN&1!$D2o;k7d%86(5kt(~TYi(8buH6NPr|0(`Ud*p) zX-KA|qoZgy;;1=!RlG#qbVtR%sLEpLhY>UBbb_clZ2uSU^TRaHir zL=BL##63L96&)3#_rWDwR`_wf?5fOR>4s@%#aLXc)?NTb06;KqVL_w)ST@r9&XGjm7QaPH2L!uUTzQBrzV>;_z`zU#I(%C1mK7GdVyl*utt zMR$8Y5Z3mayOWE{L;vf{?uf5m;_d-KyQ zWWUsD`^L_%vm&xPKQte9$VO}8-6T?DTd2n!HLiT0)B0i-oo~9q(U}VqCbiT3$!b49 zO_V2kWJ8M@T{YJ_^I}{9YMIZ7T%)adAi%@xJSGZV(DTucFid1FUcIy7@5t5Cv8#fRJ!*5O!Q^_wUGLZ}V+#TRZbbzZrV zB9zJ8bKs%Hxx1woq%B73b(~JKCii)M>(;I0 zS<6ZyqO6YUx{jLoQ#!?dIZCPC7yD;cS$d0}U!(gP*J@LwW%;Gk0As?*J;G#j*J-jP zPV3I+6N0tPC(%Ve%DQJx35XeO5Lxvze*EOwH1rK+FS#}foqBsYqFRY&@dxgQ4E|JP zB?!GAX_)xB?M{8Y{cV$^jLa~3MLD_mpGKOC$f%^d401&iYoBYUJ@41^NT0ERvk%s17)TJ1w8v=j41n%DT^^PYG zKKqDo!#GhrlXfuJ;N9t)GAUG2-(2OQ(I*2sO}cuNm6Q@EV=9A7doPEZdq)2WB#*7I zY7J9^8-m__p^?HWVi68J1SKz-UOb*<#XMa2=3p89hkLi@I|fZS-j3$`u>}uLWOT?V zr);agDr|>N;{tLUgsorWXx96Q7TP0uJjC)=M%7{}W8^jjC_*)^TD3+po{%}WC~x8B z!EmGOL#pm+=3Kt;(NXL!g09oN%XvtN4r8meX;bC&e-O6!+2}g{H$};tDQ+9O4X31i z$p(_R)hvB7SsiR1TiiD5XZ(8p!u+RAc0E@)Iw4wLiwFc}UbfB8LfT?6Kh0pBCIC8}*TQS&;~Ovf-V9BJ2CJ5;~mZ7sWQwnqBYKoqvt$i&tdP z3+F~MSJ?wRKQsy!9xp23j}Ma<$5IhNC@{-f%dhz~xlwGZ`LzPI`h|R{t#=f^?@9|+ z#fO_IO;me1D*$vd;@sBAmdn5F+{g&$E+@Y6vO#aa6Di;1bfE|j-5hVeC!#Em;C}U2tFdjePgW@pT56!P%zkXEZ z^vUbn@?ZbG_EA@=N=$q>)Fi*S7voo=&cyxYq50ye`~cl=epcB;_g5YG*#kK)c|*%> zjI_8~%>y~j;yl&T+9|fsxl3v&iD+`x?-9;~pQ*_GOV~QIaO7|ouw)blQZbA+oV!;4 zcS;!Eb?f2~-?kL8Z^ejH!EHWr4$hk~lZ!cscx{=PLgbX7U zKKP(b7Fvc+XWQW150-ba>1RCiKCFJXg!!<0AYARqEvL+!Y7*Fp!clJbA&d?4Tj5qMo(HR-&=+IEgZtd>5y{RMC8rCi5 zX5~I+D+FLVIXQXEY=KlCZ zwE7CxmNib_vrVourp3=QtfYys7_cb#_U&6`q%|vVGo`EM6c+8hVryA(9hYmz{Mr%z zO)4=b`&dImI>Glb{|Ia4-$fij=o#GZ-Lz?IsPXS`pkMim92N zX(Gvwg9G|%oSgOCV=lfn-+mS)cIpe{N&XuB_8BdnU8jNV#*7=%zw6&VJ#+eCJhQhG zWU7y%^~^Y)NZ}b6@aijV-a^5T;E+Y%uK%~nD&sHb=;$6^V^<9_lN|f~``MDQTT%~N z3fq9$J^Derk-RGO6Fp%3^;E@i*NB;>Va!iiwF+=l~pfF z6mH-XC(Ke|T##zNTbQ5!eFf>zdHv30?+BVK4372vyZ~IXx~W7lSHa0osO6>#HWVEi zvn$tGWgL16G$h;iKj5y?ZUz_r!k5pF?K=QM#viGUdHkeM29`* z^@bXUT8%jNoZPLGpa8c|UC(a&q1j@@AsRxWt3ZXaq|O!9X;4ePEs%MaU z@!a(4*5uHe#&445Jb3;3Nk-XCIzM1;^@=NZh23P_k+*qIjOyCY?iKP;@#Zt@Cx7h zUU14jz`ugXx)?JzP*{l3+5OZbMc5z*th`H~VYy8bM^^|rIpcI@BTj1kD-;2ywCR6- zbBsdiT8&jP1~^l#S(`~daFq!iV)4$f;r2qfgB-+6i>AF&LIK)+`k!0h zJ3?tV`!#}|kBAWj${xgmKZ~2PL&XR#726c~qqPT+X|9?{u?!r>r~}p@lvCTzR6Rp}|yi^p{9d zJJ?T^`Mb_#N~srAq=znEyyymo)q{LA!&lOSv1V)ja`&({(p54)IeM(*{!uE2{WA8Z zoLov`uTj^`>ZepqLolCujV`KTo3W$LK>b5dn-K@dhG`=7T;7PyiEfj8n0uNCKFaQ0 z`K5xLL$f+QUZLiNaNi3y5Cf2mAAEM|e#Ld#A2-UJU;fJfiWi~SHf43AtRnAp9%g<) 
zD<`zknH-J~V`A&%RDv;~-T&S_)4PqdMn$0@tRLt>>Mgdt>zt21_40&%YNtW&lK|0Z zG&|`@|7I8CeDu@33eYdNcAck*E|(lWz|G#gWHSeU?|7ll_w+5K>9GR@fj}Ed zIw*e{>sYgPEv$FvYlBAOUvIKD2Xwc3rsZaPws8_#j7I^c9^=Wl-+b;AEHh0|3`*vZaYBqEC#+ znybWguypjPH0=Sfz$5EkO5!_n_?z6!t+$+zMqS2;@vJ7up~ZHk0?psXr}fC^T}$3kSrK0eNbPo($RJLIc(j~ez;paZN*ay1SjBWaj>o~G?}Ex zo-(r_Afjv-<*6n3@t~Kc-TT3{0C+zI@`HqoIyp&CI+P6jnd|V2_daOhE66S;9*0o+ zUxks|`1-+)L+=1D2XpXU`a|0>V02?a+KsxF6Z;>roK*fSaD-5148<=0_5H2~2b8So zJV#%I(oDJ`ZHy&<%%*fCpTWU&H?KO1UnM@&oUkCJfT#7|ds{!M%Gf%_j>#z=$8WwI6&aM}1iN%`J>mMD?Oo3#~PC0{}1|fas^=kJ5P`oM(bm)M3-BK_u<|<6F&t5SF9&jyH zNaxnIAnHSX19c_P7mB)P zE~toeC!DKsD2eV2V(pg-?;q_m$xq5Wet;k-2OCx-2StEen4V$@N~hy^xIIPprkxL7r*M@q{S<@ zeLG1$wbLIh{xn@l2`vts!KMHsupG-Lfzpw-ENcV&VW5Bv2f zly+U19V`>r?1`5RHm4l$1lL7iboZuZdT)Aw8!F6q zAgV;+Xc)YrkS;I4h9^n^;edR8#MpU5!``i~u6{CfHDZfsRL>0A2>?HW>EPXog5VlW zW-dr%ncRN}K9(>3*x{}yaq;B9)>yV1?zdEv+I#=RYwWjUzd?ZtpJ4ES%8qPieY?qu zp{1gtVv;HkZOrS0JpL?P)&pcWuM$>u!zUR8`T7K?aAo=1!Ka*$Ve!9FnNVNBi#WQXD-st zADk0P>u(eC|N9H|f7k%}_&+-M{}uoYNmURnYx{mm#u7jOl~#1G71pT}HU%GYn=yQV;P0kRT><#=y1k9oJaUo1TH=2mVPBJ^1G641**LV6oC(9t*Z5tj!EEX&O>GxvY@6T1t_D-P3m%}o*px%DLTFAr$#i$l5NS- ze75x^@pqs%i>5Q|R#Uq&aD^M*4p5~87!=fpL(-Ijs3Ue737-LcSMI}|hepF@$NdiX zCkk6)wv9hv?7TF9uFb>H-!E5W-1~!ZSI7Ymp(KZ=-hcqfD0Xmo-;je_I>hmF1jjVA z2RQzPtW*asbp(iyCjhO1!H3f^!E`ri59!PYK`XS-Muxe*thzUKoOT}j+39~HlmC3{ zwEu^5ezRiESHOnN;}feC@0d_`+1Mt@jxfZ$1zQd`!tHUbt%j2S+y^n7ShmB@?9ck) z3UVJ3hQ(ECeAhNeBt_PI4!u86I)?=HEA%}vN&@mMk#9cIoCTm`4uNI`do%RJAtXlc z#c(v$4{^Y91*YRZL|hLYJk&sPNHadjxcS3ziL@}z9#TP&$wU2NrST3VB<}icdr6>Q z>5!b@Y}eEDR9R}3J>XgF>7F3kohcCsfv3u6Be#B9(r&UKi?AQcPb1&=#A7e%&!bSs zD8+^EDWJxqO5#ypvwU5KE+0Pia;5C@_qp@?PX3w1fCzkmNK@OBWLKdTAt+|v{Sm79 z=+S!Nc$Cbk!!U5@djODL=Gx3^P6)HUc<`T{;|A!?*Ou7BmeRd+bGnX<~&>yDZ0aQ@OC6 zgBcQfB@bAL7eIIa${`!@K|uY7x%~H=jjI6n5wI2zfX;{jsK*XBamC}47##et%7NXB zeDC_&X+Fc{BCNu72n|Y51tO2gcPP|bk%L+OfBu(_GNgh7Nl1q`>h3&)Vl^;2D;6%D zJSt24Xm1t1`FIO^)&cMl*eK43VR>A>`G=-VX7}ft)9c%>k;AVm|484?e*Y48-`-9JP z`9k0;*T~0Sh{7+rm2McIv8E!%__B?BzQazk#O31YOw&1J%O;PRnDOZEo` zFt>6E-)f;AFnRuP(=GgD6wr1w^ClY`bH9>Q+?y~`{-4*=YVS(FTN>%+S@=FEfpF0! z>-pMP@Hh#SIfptmb#|Vq7>0X3lKW63`H#_O5=aAq3GP5M zOL~nm_z@wUb`PO$!DqCF$+V4+l;D$sw+Loke_rg8ixPqF%5(XWDB*MDM-#%6@XN!T zA|g??3p_|y&T}BTJYQhxLbFqXet5{gEgO+jS8OYp8@lgQzYT!l5K9swiwROZkJ=6B^wqj0ZFYP{3Mvpf94)U#@O?MOm{+5Q zG{GPK=A*|xK;=Ui)y|d>4tqzLfK?6)CKXCux_`5tWe$1L>{=7Tn}*~c2VZ?VXH(o4 zhXC>QF81M=l$5?Xo8%<;#1)MdLmySVvKF>G11FnLcXoSl`vUusZpwELK0f;~I=Hqr%Qv*kh8vY^|tB8}s1~ z<4&KgMnQ6-{&dk@`=SkG2sb$V literal 0 HcmV?d00001 diff --git a/docs/design_docs/cholesky/trsm.png b/docs/design_docs/cholesky/trsm.png new file mode 100644 index 0000000000000000000000000000000000000000..b2121a37fe730fd0221ebc75445ae9b7d055e2ee GIT binary patch literal 9343 zcmdsdX*`tS_xBW1$(od1ku0I?ON`1`vx~^S6DBg2v5crJW#6(hSt3gryBSm>q{-O# z?8d$fvW@5Rd;Tw;m%q3F=Y`M5oclW0b=~JW=X<{ATz90NjvDPnwu=x5gjPdc#Q*|1 z^AG&t&z}X7J5pAP5C{)ML*>4aU*_ugvp}PMT-)ZYm#+@BjTnmc`pguqVW0XX)qf0p z<~6u4#FKkN$RIwdPla9`rhMUsrQX|*%v8)q%F4s7FIovI*vFsITovcdT(LayjTiV9 zemKU?FeIeiNvwA2UO!xF{weFfQlzv}x?-wOR#^$Q5>Svq2Z5;Hj^cvcR_mb%nG--l zK6Ak6At@gg&$M_kD?*;cps68U`Xq?8-2b9MJKEIBf2vmrCI+zmHPSoPLJUMB*Z$bH zm+^)^>Em!r1sT3zvM%G=|7E|&Lfh#KgG7yoURYGnoI*3wim+D3UVD6~-RlWyeJ70U z

~&J47#jci5cj~C?Z2p+pNMP74P*xNjZfnE(@H*MunmAZR20t_u3*1sjAbmRvQ ze$X)F#s;~IUSv#mbXPd3g+fIvoW*HEhV)1g1NKBQGS*1mYUG(XU5K+jDdKAxfe&SQ zeNMAcyX_3al_;(+tiswm3n%ej1@8PakdLge*!SZCv`k7aL#4 zKg%~K(g2B3I9g4_LRfJY4_B`zH~}qLfhewO-)fQ!Dp<=aU-wbl89k{euJBnemuzVB zEjhwT+B=Hei_D62{X8ytP!!*sCOqRE1rHM}R<`bm*)*QLz&!FhKICuiA|v-zP5k}| zYUH*Y;nSPTXNF?YYGavV^#&9gguw*fe!Cg9o8cQqMg}Feg%?k(c;hoNWK9)pUKjSm{>BG6f3nlj9HOsttde_ zYe}pk$uS*F!x6=W$ivNQHgBc!=CcBW6u(BRampEZ6}X|~G}Ar6G!F!jeObl{!aEzO zcKL#*LqgGNS*}9$1~JW8({yJrgb56(e$XYv9Xq$=dY^Gi=$N*La zk@Nli4+9iRG_LLaud_q%(BT?1`CrNREZ7{xX{<$}xZZRpzd6{x(c23!19+k-L`#o!J~NU)iLw-xGf3-@RVI8`WK@|c7(MncND_W*yduSV zUfq#d@$8bLOFFb!AlU<1;AK_cp{z^*>zdo;2OQVCEyz5TDJvd2!~wAv5IcX!r#`L^ z9IP$K7;u}w4%_qaTqt;KY@Gf(METihK?U*r2o5n*)4hzX5B@$c5-az~($NkR9}*^L z+C1d_O5MDN_ktt{E&sz_tp9qpJD6rgc+^IkDNY5PTDYo6tjlI%FV25`Z?x9K9J6=m z?#O$l1KmV~sI%Q%PQ7-t=H5th8A9U-gr>t4gvMc~kM#cHj{<@qg5$$BxxL{1@#ej% z9MyM_w(~sYi}8=xq@l=SD2g9A@cVZZeAH1~8ilo0-%j*NG#&Al(wXPa%SUl_w8xtl z$MdK%yy-@c`LauwcCbHELK5IRYFs0x06F)XTjyo3T%j-+<(~0CMQBN=LE| zDK#v=9e7FTYjpU{m_&1;J<7kH^ckF0h#u*)8g)lf|F1?fLZxhQ#ZNLDdSCC3E@7PSWE4Tps zmtUXs**M7Mb>FDfgiy2uEA^c>=61cKwIq)A z9XjGQ!6eh~(A0b``I5akR;gYEcmSHyk!b2&c2UJ8{BpjzOBm1@(<8Oem&eT?Y%3)2 z;^5+R4BX6$8@yo!ROF+)B;hi!Py|I3S45wc*(^Rsvz}m<=o#`5)jAinU1%q92My6kCy=T%ZIjz#^s9N8ksa@lG z>JVLYro|lA=*i|jZ`m$78-nv`_(qFb;BiZSXsGG0Lf!!(eTkB zyc9{Bj_f)^xQBTevZzN=zO*FcJW%q``1o8mZjQmF1`K8fgQa)GGpbhJ_xD@TC31zjC@|HZ&14x#DM1+&So&Z0ZqcwQhA=`3a zpQOAh1>F7~%0E%E!=5A8m09t*I-6#>cT+PKcJLOQ)D;2bU%dh?yvEgouF?DqaGD@X zD+k_=7u{VDB;)|M3VwyAo_5TW{F$?!K(-gB**HD8B7edhIWUK$mS`Gs47U86%x^J= zMI@?&`V!Tjhkk?}>mt|sbE8kJx) znwrVEL~i|x2JLThEQIbd3xr&v3|V`ss^ zpxg)~5rg0~{+_pNz{&y7-i*k6`I5`0^y-RbV*5_UAnd3k%#UTCZQfMx;@y ze>_Qf0Hr5a%&JeOy$kEJEnIxz<@Ml2a-+;t1|Jm_2l*oVn85f9{v_@{sXp$(FXo_F zF2(6LYkmV^CfN z%$-kZ2g^xLzLHKqhP&YYXJKxx--16^$NXmZ<^l>t*Ow02&pBymt7_NBK0ZFqtX~2y zkL2#{y4M|_+pT|;zK%ZuOd)`Orp4?2QH#;X;o;hsKSblRw-#ax%!yV&?#Ta^Y&}X^ z8RdZ;-3P281Mra?AUETG+tlfP;H(sM@yW~sqJMEV9}x-y(N>o6wteN|r?J9?HqHEW z^NwUT&4i|S4wvdFmCw;>XnObG9CZPg5180V>YUAq=Ee1VLU1FP?m*2xEs%|Wee z+W%H%-2U(u9I1L0Ob}RHQfJk>_}aaNhQ4lC$S^j_JYE}pzDR4Jk}|95ep4d=z)Ufi z`_+^%QiF<$YNDyTL$+#t~K+S9Mm^sRgh!$9DIt~ zPh~ES;x<3SDY}j-O^S?H(~Y-7%T9(j@e9(gPC7Qp>yCq338IOe`NEf3nOG!HunO+U z0N!19h=ze0NEh3wCK@QfXEbgzn2@H4H`KWrYutu#2s_DGiZubL$j$B((Z}1rQAy-Q z?e*Kmlb;}TWv_}JZh3hka8L%$~^Z}r9xiyLOV&9@jrikp?agDLE36@5sZG!ih2QA4fLh+ zpJ>G=6Korg;C)*^b;j|Z=BaM3AK4A2bl=6@Y8jx7do<#sq!3vDVb;P>BTl6btyTr_ zs(2rE;qu6ryX-SVlShk8?na5l)V}c*BUDt*W`Nz8JwAeF8Mu@3nh{d*PG|l zpFJv4ey&fFirD#^wSnP~t)h;U4*d5V?zfm1US2nS5o5#fggJ=z8~udf?c0|WM{6`0 zt>*r4@s0dSlX!S2jXE5id~=D_R+>IE5?yh>^{YZ6zjn#|4@RqK+3(cqufV$NZ`eIH z=KZ(lX|LUBapR(7fLUQ8+AVfghPPV=nXg{mTN##^wt14d9Ma%9XyN;M_tI5S^PyV! 
zbNERJqx7U@S_(_|S3{_PfWXJCzrQ?Z+Fr}}+1S{anVIS6=r}v^9}Pdb4uiq&&QrO< zJCg*{($X&9(n+ea??|8#t+O7kbofHP1$jDbPTEY!%Cayr>MMI%y8*p@`*!KPO$|^q z|NS9m+-KCCm6dg-JvNmz_5v>#gYhO3&e5{H|4p?$)DRgpxTD|nJ|$eRcFe1b(|2sD z8(>j=eZ7Lu${qF#_mz9pgcTGTD=RAx6!>W_UUc?d?>c_{;*fGp z%AcOLZTCzOdFnG8ua^Fs%5LaG1#NaxXh`E)!?L)zxazxW*NN4GmfO3_qqT6j{MBF9 z%jZ7#N)4KC|KyGJ+a9TY_Tt40=rPF<6K{PHM*rm5&}W-iD7j9i!m4?~Gm{>A^v8|w z+Wj61X?Ah@pNYC_(1Cwp^h(b;7P5RN?y_G%#GKc<1W@d8Dax8}@Cc{l#5zni1?3=@1Tpb(vLLUab(Hq)2NA$1e(>RD)XDz# zLPqo}%DlyweTeDETMgv_1dUt%U0=hf@o5t6ZhU;Fo2;^nRUAN!TOCbbo$y-xlZSEa z7H7ZU33>poYA0N!ndmrS^ncZs1}lJ7@F(QG=Dn25{Ex0F_>>N- z%c?Ldh6vp|T)`Y4>}=1E!kdrV*x`~H)Zw9MYQ5^BoZy25Ez!++m)=4P)bWnvcJ2V^ zW>8C^$VXkGjlVM1aeEWbDsTgCTzzyf`ds-J^S|o1I)!ojU#YpVOC_Iv&EFT%BSBm@ zHonuxUc37(gWG>LzDulm%#Eux3ao&k<__G@MI9u{KzE1i!p_l(XHehO_*7adnRVvn zK#i7FFXP#KjpaA&#!yx>pCzTw=~rcb1_~?Uj0j=@2-p5{7paP|gB;G2K##EesHl|5 z)GULbHyaRl`MpSfAFP|0ztf-$xBrXpENZp?I#?0=#JL9%cUK2iAHbNo|)?y=u;Tc7CH*^b2aX!cvg!_}LhAQU`ywfmqN{f%&@@z=LO^WjNaRS1|Zrs^4CM zV++skvEc+kw%QHH?WD}e<^*m5r3mF73R$Cv;dEIt@NAjpy@`OB8SDx9(2@5#Xj<{m z0mQ3%KJ@@jUi=Wt(e8h^THCGq;5>MuB--?E`p4V7$&Mxfz++=$GLwP5ZhU@VCC}a> z-JH3u)S{vytr4t0W=6K6sq^lIdEoEU#U>S&Ak3ZrC=&*)P^+>!`ug!iJa_gk?hXL%z{&^K$Hlv~K0X;`xf=j3-;_?;CqQ)*sM@c!5bY76y< zl@+%^#AuZZZ!s1-=9vjr&Y(*M?F>kyR~xHSUHK@$4ES)P?>g))OImbNI(TxSulkwE zAVLALG%Iqrvs`Qa^(-d-wDjZoBJaC~wfzxwS!d|Px#T4U_3tc&LAQLPhhU|;%Q~`O z5#vA-&;{U0()%9W5(F4s^oUdhMOf568$4FCpktSIoBx&O$Xh@M&XCh(vI)1`(0tHe zft`zB`9^VYJyS;B!UFs!-$( zZI@4w>DI&y^`R?O-w{hT% z9aC8f$4D>SJO0Q+7N1(I)7n@)aKoK>@WSr^R=9C>J%`bA2vsaYMLR!@neYqZED!YC ztKEhqc?)IacB`cG{P8hcVoIWALne8B&i?q7VT^>hxMO$vO&4H)2M32(rGt+|y_DR= zm$J^_U1l1zcXpo(?LH3NC;D0h__z&Ak8~XYI$7g7*q5QD0xWhd!aDv+JBUB`E-0_#FU zL%-OyBe&78x5&LUcWK>9P#w#Df&o41bV(P&s|x_4_P6F@W|C4O<%p?#DF!2eD zFnT`CXbasgwHZ8$ybUmXxf>+Mn3=%iok2(5o19Tx>bb5B*(tFTAN4ZzY|qe4 z$=Jlwe=bw*1$KY6cG8Dq;X|yFIJ=-JFpkCf^@E_fD~Fwio3Fwi`VKp!DsHx3o;U(L zR3T=2B48avE2x}vV^lIDg7-N}v;WZS z^KtxT>>f{!?VX2<1Ag85l3#>MO{=#wFf~n$iejn$$-tGXy`WU>M@xRv%RjoB zd4IG9*@6%EPPNPo@k@UGy&ZP$^;OGj4OExWi)V)Ng1@trFLU{hc}{FizZIhkssgQR zh*qXIK||QMR2CcsZZ7wJ7+1i`T}=F0Knbj4XJVlfi19>SrQxTR!TWQ0AO0ACE=G$G zr@Z%bi@mB|o$VA8t02T;#7q(}&S&jLpH=fg#pO~Ab`DTdcBHq4D_Hr-Vgzpt7!Q)u zZVE^P$5k%9zf{EmDYGtO5izSP4_K>TKz6aiv3E%yH&3DWv++z@w8bET10W)&g3q^C z7aVz60Y&|6s;b%+bwwBi&Z-E=YXhD$mooM2L6@&3SSwW&w}^g=_1~K9@b~w3-0r&z z09n&-KF#UerE7n?eJA`E7JEK)&lhU}8f3HyoXQeOESLzyEAGAHkR!Nd%b$Y&^;pGS zz>vRV_g_+TT~i9I8@`7K12#9~0%Yb{d3{@k6Cmo|i+ULU(`3LiceGQ1;siup29QIi zjlW+VxB1wB{K|O1Iu6kI&3Qm4_tqyT-LhW+TjYNP>}uS*S9{{9&ro-V*xT-w-3m@E zzg%IhZYJfa$L73KPEZXaRN#j()>aW3Q+d-=93Ax{5pX4`3)c$ zYp03-$guL8qH6c7flqo!8=NxD+@qr-P$6_>SRsiw+dZ3(cb{V7r-8{K8Vyr#*g;;J z>K1E0s+^Iv3f?amR^J4qyd{`dEe1qxp-eORU>N`e@GD2&A5`Er3kodp4@B}(_wqp5 z3l696Z)hEPIYCnx5cL0hC4eG9=$1lhM4wy8NJtnGfaA@{>Ues7A?n=)rpT3-lam8f!-b$i0L2!Vv)VP8 zsP_ffHusz&lJwl7)}bpU!DnS?N><0lCJ$(VteopO-hg;vu?I?}BqL+vdD-8f_HdFn zssc$iqBG4Yo7k|{qsA>#Tv@pVmVQ!cA4$~H2Xpe6fdpBy+yK0jloE8fYf<++Z&lUp4_Ak{EeLyK8JRL_z3s7H+THp1F^`^t}{Ob?joT5D4 zW6pA*>6j&t@UqxTGSt~jF){c2u?B}qu@|#}DD7+04c@vuh|q-|d&BEjq43oarav!4 zha2vL?j_MqKGa@bHE{rH)+7#3$m2N#7k7hh5M816k3+up;OcFzu||D_{G%Dbmf;?& zs|v{EwKhA6g}g1FNys$NQ$4-b0M{6_>D9%l!>!(EA;9CGs}l&zwJYzdpLN~RHsglN zYxNU^goU-8APF7CMcECY2J98&D}FXu-VyLzbcw0?9hI$w_u`*&i^g=FJVPBh2T&Fj z8uG}2#12WB`CoMT`Cs4KmIGP^nXdSs;@!d6*x1LX=IgyXrA*2W4p?%Skbr>fMmJ~( zppV)?kMiQhi&0Tgu6uBOBcr6mL__Twfyw$#>o4RAlSUL3-`S@!$|1}wEb|KszRLsd zOMS&($QZ?d-4)=^Ha1gVE%latcSAW>w0`{fQSJAz?`9O&c{GUgs;a6*g(|@H`@KC| z6+kxy!xB%oE2Hv{ixKo>s6#(Jt@Y^fK7J{ZlbATOde;AjrPY%s*NzcE&SK8}C66CB z@j#)I?C8n72Nn`IUrN508z=T9W3MYL{V0qv!=Pc}ZEa%QEN5xjF9ty$Dc+Yl++kcd 
z*0x~2a^-dN^>L<0AYa{&&&tY5O}(n03K}E*R()#b&R4HonVXyA{8vdaDc32Iy5akY zya4f7xd86U&!?=ctVF1;fftjipr5P~1g)JycMJyeeZAv?dwt)>^zn>8lPoKvlpx(u z>ZtH8|Mjg=F}F4G7vc68tMlx^y0<<O+GVB2o72i~NL^?4_9wqgxZ+!<_ zQ@1rWHJ$2eT}|Mw)8;1QZrJiT1BMfU9{UtoCzK=UVi1~e+Fs3$qdf=#%#A0iF8)jU zsLQv#b{ZO*XTI3`k~e(d|CHUBK>wgcP*jwz89b*DM6}ds{HJL(0=nWeP0^Q|0u1Mx zXu~J|vr{ckPtVp{`44EH768joR21XFgl0znO+g>OT4($MzZ}S{$iNJaeM0~H6zV;2 zotgIBt*tFOpMLJiHu|8{KlB_cqkL3RTptZ51~ijP?rAG^>gtr4*H<3=riCAwg+n)- zXx3^qK*nbIO_C#bEqx3;iFuekp)8m|xYO0AK(KdQxY<_&eur%HhB*Ug$fG`4{)=&Zu(NZvs*rQM{F8$@Nhqf_|@LY zslvQoe0rW^El`({S<%|E)d4Y0NAvcdO{igq0VUYogH|!=%f9zaTXpI0iSS(kgpx<) z$X0@}J@L8&xQ*aZ$+Xv-k@!kK$NpMuxKNks)MSiTcchFrfW^NXi5KoE6 z<15H$B~e|@Da(IGpeR1WXogGyth-tg*hTL6YB~J}4={AH*0T+2<+n^IG8`q*hv2UHR!zhCK3#EJN32F#g z&@-Ppwlp_4@76);>+82-G_elgIXo9)_L@M9hHp+r30CwIYZi0|1Q0C0~pQ(;LZtOTxMn_s4dLy z;VC;r%hI_ooZ%B(_vf8LeOtFM1Yrb80aN_{^^wYdjnn_tCpc>-XWVTnh;Pti@4$ye O5Dl1)O8Eop(EkICm?tg( literal 0 HcmV?d00001 From 1b25d9c7d6222df91ccad0515a87d3242c667946 Mon Sep 17 00:00:00 2001 From: dglr Date: Mon, 27 May 2024 23:54:40 +0800 Subject: [PATCH 04/27] modify mathematical formula --- docs/design_docs/cholesky/cholesky.md | 66 +++++++++++++-------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/docs/design_docs/cholesky/cholesky.md b/docs/design_docs/cholesky/cholesky.md index de436f129..66fda5836 100644 --- a/docs/design_docs/cholesky/cholesky.md +++ b/docs/design_docs/cholesky/cholesky.md @@ -9,28 +9,28 @@ Cholesky分解是科学和数值领域中最重要的算法之一。Cholesky算 ### 1.1 厄密特矩阵 厄密特矩阵,又称自伴随矩阵,是共轭对称的方阵。厄密特矩阵中对角线元素均为实数,且每个第i行j列的元素都与第j行i列的元素互为共轭转置。例如: -$$ +```math \begin{bmatrix} 3 & 2+i \\ 2-i & 1 \\ \end{bmatrix} -$$ -对于一个矩阵$A$,如果其是厄密特矩阵,则可以对其进行Cholesky分解,如果其是正定矩阵(对于所有的非零实数$x$,都有$x^TAx>0$)则Cholesky分解的结果唯一,否则结果不唯一。 +``` +对于一个矩阵$`A`$,如果其是厄密特矩阵,则可以对其进行Cholesky分解,如果其是正定矩阵(对于所有的非零实数$`x`$,都有$`x^TAx>0`$)则Cholesky分解的结果唯一,否则结果不唯一。 ### 1.2 Cholesky分解 -对正定厄密特矩阵$A$进行Cholesky分解,即求矩阵$L$使下式成立: -$$ +对正定厄密特矩阵$`A`$进行Cholesky分解,即求矩阵$`L`$使下式成立: +```math A=LL^* -$$ -其中,$L$是一个下三角矩阵且对角元素均为正实数,$L^*$表示$L$的共轭转置,是一个上三角矩阵。当$A$是一个实数矩阵时,Cholesky分解可以改写为 -$$ +``` +其中,$`L`$是一个下三角矩阵且对角元素均为正实数,$`L^*`$表示$`L`$的共轭转置,是一个上三角矩阵。当$`A`$是一个实数矩阵时,Cholesky分解可以改写为 +```math A=LL^T -$$ -下文中为表述方便,所有矩阵$A$均为实数矩阵。 +``` +下文中为表述方便,所有矩阵$`A`$均为实数矩阵。 -对于一个$n\times n$的实矩阵$A$,Cholesky分解可以被写作如下过程: -$$ +对于一个$`n\times n`$的实矩阵$`A`$,Cholesky分解可以被写作如下过程: +```math \begin{align*} \begin{bmatrix} a_{11} & a_{12} & a_{13} & a_{14} \\ @@ -59,9 +59,9 @@ l_{11}l_{31} & l_{21}l_{31} + l_{22}l_{32} & l_{31}^2 + l_{32}^2 + l_{33}^2 & l_ l_{11}l_{41} & l_{21}l_{41} + l_{22}l_{42} & l_{31}l_{41} + l_{32}l_{42} + l_{33}l_{43} & l_{41}^2 + l_{42}^2 + l_{43}^2 + l_{44}^2 \\ \end{bmatrix} \end{align*} -$$ +``` -根据上式不难看出,每个$a_{i,j}$等于由$l_{i,j}$和$L$矩阵的其它元素组成的多项式,例如$a_{32}=l_{21}l_{31}+l_{32}l_{22}$,并且多项式中只有一个项包含了$l_{i,j}$($a_{32}$等价的多项式中只有$l_{22}l_{32}$这一项),包含了$l_{i,j}$的项另一个因子都为对角线元素,因此为了计算$l_{i,j}$,可以由$a_{i,j}$减去不包含$l_{i,j}$的其它项然后除以对角线元素,这样就能算出每个$l_{i,j}$。 +根据上式不难看出,每个$`a_{i,j}`$等于由$`l_{i,j}`$和$`L`$矩阵的其它元素组成的多项式,例如$`a_{32}=l_{21}l_{31}+l_{32}l_{22}`$,并且多项式中只有一个项包含了$`l_{i,j}`$($`a_{32}`$等价的多项式中只有$`l_{22}l_{32}`$这一项),包含了$`l_{i,j}`$的项另一个因子都为对角线元素,因此为了计算$`l_{i,j}`$,可以由$`a_{i,j}`$减去不包含$`l_{i,j}`$的其它项然后除以对角线元素,这样就能算出每个$`l_{i,j}`$。 ## 2 Cholesky分解实现 @@ -70,19 +70,19 @@ $$ ![image](timeline.png) 图1 cholesky分解时间线 -上图中,假设矩阵$L$的左边两列块已经计算完毕(黄色部分的非对角元和红色的对角元),这个流程展示了计算中间列块的过程(蓝色部分和橙色部分),完整的Cholesky计算只需要对分块后的所有列重复执行此流程。 +上图中,假设矩阵$`L`$的左边两列块已经计算完毕(黄色部分的非对角元和红色的对角元),这个流程展示了计算中间列块的过程(蓝色部分和橙色部分),完整的Cholesky计算只需要对分块后的所有列重复执行此流程。 
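A useful cross-check for the blocked flow above is the textbook element-wise recurrence from section 1.2: both must produce the same lower-triangular factor. The sketch below is a minimal host-side reference in plain C++ (illustration only, not the MLU kernel; the function name `cholesky_reference` is ours):

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// Unblocked lower-triangular Cholesky (row-major, single matrix), implementing
//   l_jj = sqrt(a_jj - sum_{k<j} l_jk^2)
//   l_ij = (a_ij - sum_{k<j} l_ik * l_jk) / l_jj   for i > j
std::vector<float> cholesky_reference(const std::vector<float>& a, int n) {
  std::vector<float> l(a);
  for (int j = 0; j < n; ++j) {
    float d = l[j * n + j];
    for (int k = 0; k < j; ++k) d -= l[j * n + k] * l[j * n + k];
    l[j * n + j] = std::sqrt(d);
    for (int i = j + 1; i < n; ++i) {
      float s = l[i * n + j];
      for (int k = 0; k < j; ++k) s -= l[i * n + k] * l[j * n + k];
      l[i * n + j] = s / l[j * n + j];
    }
    for (int k = j + 1; k < n; ++k) l[j * n + k] = 0.0f;  // clear the strict upper triangle
  }
  return l;
}

int main() {
  // Symmetric positive-definite example; expected factor rows: (2,0,0), (6,1,0), (-8,5,3).
  std::vector<float> a = {4, 12, -16, 12, 37, -43, -16, -43, 98};
  std::vector<float> l = cholesky_reference(a, 3);
  for (int i = 0; i < 3; ++i) {
    for (int j = 0; j < 3; ++j) std::printf("%8.3f ", l[i * 3 + j]);
    std::printf("\n");
  }
  return 0;
}
```

Running a blocked implementation and this reference on the same input and comparing the factors element by element is a cheap way to validate each of the SYRK/GEMM/POTRF/TRSM steps introduced below.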
SYRK(HERK)、GEMM和TRSM均为标准BLAS库中的操作,POTRF为计算对角块(完整矩阵的对角元素所在的块)内部依赖的kernel。下面将按照计算顺序依次介绍。 ### 2.1 SYRK(HERK) SYRK是BLAS的标准操作(数据类型是复数时为HERK),定义为: -$$ +```math C=\alpha AA^T+\beta C -$$ -其中$C$为$n\times n$的方阵,$A$为$n\times m$的矩阵,$\alpha$和$\beta$是标量。 +``` +其中$`C`$为$`n\times n`$的方阵,$`A`$为$`n\times m`$的矩阵,$`\alpha`$和$`\beta`$是标量。 -此处使用SYRK是为了计算橙色块的外部依赖,上式中的$C$代表橙色对角块(完整矩阵的对角元素所在的块),$A$代表橙色块左侧的所有黄色块,$\alpha$、$\beta$分别取-1和1。 +此处使用SYRK是为了计算橙色块的外部依赖,上式中的$`C`$代表橙色对角块(完整矩阵的对角元素所在的块),$`A`$代表橙色块左侧的所有黄色块,$`\alpha`$、$`\beta`$分别取-1和1。 ![image](syrk.png) 图2 syrk示意 @@ -90,12 +90,12 @@ $$ ### 2.2 GEMM GEMM是BLAS的标准操作,定义为: -$$ +```math C=\alpha AB+\beta C -$$ -其中$C$,$A$,$B$分别是$m\times n$,$m\times k$,$k\times n$的矩阵,$\alpha$和$\beta$是标量。 +``` +其中$`C`$,$`A`$,$`B`$分别是$`m\times n`$,$`m\times k`$,$`k\times n`$的矩阵,$`\alpha`$和$`\beta`$是标量。 -这里使用GEMM计算蓝色非对角块的外部依赖,上式的$C$代表蓝色块,$A$和$B$分别代表橙色块左侧的黄色块和蓝色块左侧的黄色块。$\alpha$和$\beta$分别为-1和1。 +这里使用GEMM计算蓝色非对角块的外部依赖,上式的$`C`$代表蓝色块,$`A`$和$`B`$分别代表橙色块左侧的黄色块和蓝色块左侧的黄色块。$`\alpha`$和$`\beta`$分别为-1和1。 ![image](gemm.png) 图3 gemm示意 @@ -103,12 +103,12 @@ $$ ### 2.3 TRSM TRSM是BLAS的标准函数,定义为: -$$ +```math XA=\alpha B -$$ -已知下三角矩阵$A$和矩阵$B$,TRSM解出矩阵$X$,$A$为$n\times n$方阵,$X$和$B$为$m\times n$的矩阵。 +``` +已知下三角矩阵$`A`$和矩阵$`B`$,TRSM解出矩阵$`X`$,$`A`$为$`n\times n`$方阵,$`X`$和$`B`$为$`m\times n`$的矩阵。 -对角块在SYRK后需要经过POTRF完成后续计算,这里假设已经计算完毕,于是可以通过TRSM完成蓝色块的剩余计算,TRSM执行后蓝色部分计算完毕。上式中$A$为红色块,$X$和$B$均为蓝色块,计算结果覆盖原矩阵。 +对角块在SYRK后需要经过POTRF完成后续计算,这里假设已经计算完毕,于是可以通过TRSM完成蓝色块的剩余计算,TRSM执行后蓝色部分计算完毕。上式中$`A`$为红色块,$`X`$和$`B`$均为蓝色块,计算结果覆盖原矩阵。 ![image](trsm.png) 图4 trsm示意 @@ -171,14 +171,14 @@ POTRF这个函数名取自LAPACK中Cholesky分解的函数,POTRF的目的是 厄密特矩阵,又称自伴随矩阵,是共轭对称的方阵。 -对正定厄密特矩阵$A$进行Cholesky分解,即求矩阵$L$使下式成立: -$$ +对正定厄密特矩阵$`A`$进行Cholesky分解,即求矩阵$`L`$使下式成立: +```math A=LL^* -$$ -其中,$L$是一个下三角矩阵且对角元素均为正实数,$L^*$表示$L$的共轭转置,是一个上三角矩阵。当$A$是一个实数矩阵时,Cholesky分解可以改写为 -$$ +``` +其中,$`L`$是一个下三角矩阵且对角元素均为正实数,$`L^*`$表示$`L`$的共轭转置,是一个上三角矩阵。当$`A`$是一个实数矩阵时,Cholesky分解可以改写为 +```math A=LL^T -$$ +``` ### 3.3 算子输入输出参数要求 From 0da4788131f4a3a8c4b0e913e779453e74d82227 Mon Sep 17 00:00:00 2001 From: dglr Date: Fri, 7 Jun 2024 19:21:52 +0800 Subject: [PATCH 05/27] add complex type --- kernels/cholesky/cholesky.cpp | 267 ++- kernels/cholesky/cholesky.h | 30 +- kernels/cholesky/cholesky_union1.mlu | 1979 ++++++++++++++++- kernels/cholesky/complex_cholesky_union1.mlu | 1089 +++++++++ mlu_op.h | 281 +-- .../pb_gtest/src/zoo/cholesky/cholesky.cpp | 334 ++- .../pb_gtest/src/zoo/cholesky/cholesky.h | 2 + .../cholesky/testcase/complex_test.prototxt | 37 + 8 files changed, 3534 insertions(+), 485 deletions(-) create mode 100644 kernels/cholesky/complex_cholesky_union1.mlu create mode 100644 test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/testcase/complex_test.prototxt diff --git a/kernels/cholesky/cholesky.cpp b/kernels/cholesky/cholesky.cpp index 09b60e4d3..f0e263a59 100644 --- a/kernels/cholesky/cholesky.cpp +++ b/kernels/cholesky/cholesky.cpp @@ -1,17 +1,63 @@ #include "cholesky.h" - - - - //dA:输入被分解方阵 //dC:cholesky分解结果方阵 //trans -> false: col major; true: row major //uplo -> false: lower; true: upper //ldda:leading dimension -//batch=1 + +mluOpStatus_t MLUOP_WIN_API mluOpGetCholeskyWorkspace(mluOpTensorDescriptor_t input_desc, size_t* size, float** workspace) +{ + PARAM_CHECK("mluOpCholesky", input_desc != NULL); + + + PARAM_CHECK("mluOpCholesky", input_desc->dim == 2||input_desc->dim == 3); + PARAM_CHECK("mluOpCholesky", input_desc->dims[0] > 0); + PARAM_CHECK("mluOpCholesky", input_desc->dims[1] > 0); + + if(input_desc->dim == 3) + { + PARAM_CHECK("mluOpCholesky", input_desc->dims[2] > 0); + } + + 
mluOpDataType_t dtype = input_desc->dtype;
+  PARAM_CHECK("mluOpCholesky", dtype == MLUOP_DTYPE_FLOAT || dtype == MLUOP_DTYPE_COMPLEX_FLOAT);
+
+  int type_size = (dtype == MLUOP_DTYPE_FLOAT) ? 4 : 8;
+  int size_a = 0, lda = 0, size_c = 0, ldc = 0;
+  int batch_size = 1;
+  int dim = input_desc->dim;
+  if(dim == 2)
+  {
+    size_a = input_desc->dims[0];
+  }
+  else if(dim == 3)
+  {
+    batch_size = input_desc->dims[0];
+    size_a = input_desc->dims[1];
+  }
+
+  if (dtype == MLUOP_DTYPE_FLOAT)
+  {
+    // *size = size_a*size_a*sizeof(float);
+    *size = 0;
+  }
+  else
+  {
+    *size = size_a*size_a*sizeof(float)*2*batch_size;
+    printf("size:%lu\n",(unsigned long)(*size));
+  }
+  if(*size>0)
+  {
+    CHECK_RETURN("mluOpCholesky",
+                 complex_malloc(*size, workspace));
+  }
+  return MLUOP_STATUS_SUCCESS;
+}
+
 mluOpStatus_t MLUOP_WIN_API
-mluOpCholesky(mluOpHandle_t handle,const mluOpTensorDescriptor_t input_desc,float* d_input, const mluOpTensorDescriptor_t output_desc, float* d_output,bool upper)
+mluOpCholesky(mluOpHandle_t handle,const mluOpTensorDescriptor_t input_desc,float* d_input, const mluOpTensorDescriptor_t output_desc, float* d_output,bool upper, float* workspace)
 {
   PARAM_CHECK("mluOpCholesky", handle != NULL);
   PARAM_CHECK("mluOpCholesky", input_desc != NULL);
@@ -25,11 +71,16 @@ mluOpCholesky(mluOpHandle_t handle,const mluOpTensorDescriptor_t input_desc,floa
   PARAM_CHECK("mluOpCholesky", input_desc->dims[1] > 0);
   PARAM_CHECK("mluOpCholesky", output_desc->dims[0] > 0);
   PARAM_CHECK("mluOpCholesky", output_desc->dims[1] > 0);
+
   if(input_desc->dim == 3)
   {
     PARAM_CHECK("mluOpCholesky", input_desc->dims[2] > 0);
     PARAM_CHECK("mluOpCholesky", output_desc->dims[2] > 0);
   }
+
+  mluOpDataType_t dtype = input_desc->dtype;
+  PARAM_CHECK("mluOpCholesky", dtype == output_desc->dtype);
+  PARAM_CHECK("mluOpCholesky", dtype == MLUOP_DTYPE_FLOAT || dtype == MLUOP_DTYPE_COMPLEX_FLOAT);


   int recnb = REC_NB;
@@ -37,6 +88,8 @@ mluOpCholesky(mluOpHandle_t handle,const mluOpTensorDescriptor_t input_desc,floa
   int dim = input_desc->dim;
   bool is_row_major = (input_desc->strides)[dim-1]==1;

+
+  int type_size = (dtype == MLUOP_DTYPE_FLOAT) ?
4 : 8; int size_a = 0, lda = 0, size_c = 0, ldc = 0; int batch_size = 1; if(dim == 2) @@ -76,77 +129,205 @@ mluOpCholesky(mluOpHandle_t handle,const mluOpTensorDescriptor_t input_desc,floa int jb; const float s_one = 1.0; const float s_neg_one = -1.0; - - if(upper == true) + + if(dtype == MLUOP_DTYPE_FLOAT) { - printf("start transpose 1\n"); - CHECK_RETURN("mluOpCholesky", - transpose(size_a,d_input,d_output,handle)); + if(upper == true) + { + CHECK_RETURN("mluOpCholesky", + transpose(batch_size,size_a,size_a,d_input,d_output,handle)); + } + else + { + CNRT_CHECK(cnrtMemcpy(d_output, d_input, type_size*size_a*lda*batch_size, CNRT_MEM_TRANS_DIR_DEV2DEV)); + } } - else + else { - CNRT_CHECK(cnrtMemcpy(d_output, d_input, sizeof(float)*size_a*lda, CNRT_MEM_TRANS_DIR_DEV2DEV)); + + CHECK_RETURN("mluOpCholesky", + transpose(batch_size,size_a*size_a,2,d_input,d_output,handle)); } + cnrtQueueSync(queue); //TODO:检查拷贝开销 - // if(upper == true) - // { - // CHECK_RETURN("mluOpCholesky", - // transpose(size_a,d_input,d_output,handle)); - // //print d_output - // cnrtMemcpy(work_space_h, d_output, sizeof(float)*size_a*size_a, CNRT_MEM_TRANS_DIR_DEV2HOST); - // //print work_space_h - // // printf("matrix after transpose:\n"); - // // for(int i = 0; i < size_a; i++) - // // { - // // for(int j = 0; j < size_a; j++) - // // { - // // printf("%.2f ",work_space_h[i*size_a+j]); - // // } - // // printf("\n"); - // // } + int stride = size_a*lda; + //printf original matrix + if(dtype == MLUOP_DTYPE_FLOAT) + { - // } - // else - // { int row = is_row_major ? lda : size_a; - // int nb = row > 512 ? NB : (NB/2); int nb = NB; for(int j = 0; j < row; j+=nb) { jb = std::min(nb, row-j); CHECK_RETURN("mluOpCholesky", - ssyrk(false,is_row_major,jb,j,OFFSET_ROW(d_output,j,0),lda,OFFSET_ROW(d_output,j,j),lda,handle)); + ssyrk(batch_size,stride,false,is_row_major,jb,j,OFFSET_ROW(d_output,j,0),lda,OFFSET_ROW(d_output,j,j),lda,handle)); + cnrtQueueSync(queue); + CHECK_RETURN("mluOpCholesky", + mlu_spotrf_rectile(batch_size,stride,is_row_major,false,jb,recnb,OFFSET_ROW(d_output,j,j),lda,j, handle)); + // cnrtQueueSync(queue); + if(j+jb < row) + { + CHECK_RETURN("mluOpCholesky", + sgemm(batch_size, !is_row_major,is_row_major,row-j-jb,jb,j,-1.0f,1.0f, + OFFSET_ROW(d_output,j+jb,0),lda,stride, + OFFSET_ROW(d_output,j,0),lda,stride, + OFFSET_ROW(d_output,j+jb,j),lda,stride, handle)); + cnrtQueueSync(queue); + } + if(j+jb < row) + { + CHECK_RETURN("mluOpCholesky", + strsm(batch_size, stride,false,is_row_major,jb,row-j-jb,OFFSET_ROW(d_output,j,j),lda,OFFSET_ROW(d_output,j+jb,j),lda, handle)); + cnrtQueueSync(queue); + } + } + + if(upper) + { + cnrtQueueSync(queue); + CHECK_RETURN("mluOpCholesky", + transpose(batch_size, size_a,size_a,d_output,d_output,handle)); + } + } + else + { + recnb = CREC_NB; + // int nb = NB; + int nb = NB; + int row = lda; + float* r_start = d_output; //实数首地址 + float* i_start = d_output + size_a*lda;//虚数首地址 + + set_half_zero(batch_size, size_a*lda, r_start, lda, lda, handle); + set_half_zero(batch_size, size_a*lda, i_start, lda, lda, handle); + + for(int j = 0; j < row; j+=nb) + { + jb = std::min(nb, row-j); + CHECK_RETURN("mluOpCholesky", + cherk(batch_size,stride,jb,j,r_start+j*lda,i_start+j*lda,lda,r_start+j*lda+j,i_start+j*lda+j,lda,handle)); cnrtQueueSync(queue); CHECK_RETURN("mluOpCholesky", - mlu_spotrf_rectile(is_row_major,false,jb,recnb,OFFSET_ROW(d_output,j,j),lda,j, handle)); + mlu_cpotrf_rectile(batch_size,stride,jb,recnb,r_start+j*lda+j,i_start+j*lda+j,lda, handle)); // cnrtQueueSync(queue); 
if(j+jb < row) { CHECK_RETURN("mluOpCholesky", - sgemm(!is_row_major,is_row_major,row-j-jb,jb,j,-1.0f,1.0f, - OFFSET_ROW(d_output,j+jb,0),lda, - OFFSET_ROW(d_output,j,0),lda, - OFFSET_ROW(d_output,j+jb,j),lda, handle)); + cgemm(batch_size, false,true,row-j-jb,jb,j,-1.0f,1.0f, + OFFSET_ROW(r_start,j+jb,0),OFFSET_ROW(i_start,j+jb,0), lda,stride, + OFFSET_ROW(r_start,j,0),OFFSET_ROW(i_start,j,0), lda,stride, + OFFSET_ROW(r_start,j+jb,j),OFFSET_ROW(i_start,j+jb,j), lda, stride, handle)); + cnrtQueueSync(queue); } if(j+jb < row) { CHECK_RETURN("mluOpCholesky", - strsm(false,is_row_major,jb,row-j-jb,OFFSET_ROW(d_output,j,j),lda,OFFSET_ROW(d_output,j+jb,j),lda, handle)); + ctrsm(batch_size, stride,jb,row-j-jb,OFFSET_ROW(r_start,j,j),OFFSET_ROW(i_start,j,j),lda, + OFFSET_ROW(r_start,j+jb,j),OFFSET_ROW(i_start,j+jb,j),lda, handle)); cnrtQueueSync(queue); } } + + // printf("after transpose, d_output:\n"); + // for(int i = 0; i < 2; i++) + // { + // for(int j = 0; j < lda; j++) + // { + // for(int h = 0; h < lda; h++) + // { + // cnrtMemcpy(work_space_h, d_output+i*lda*lda+j*lda+h, sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); + // printf("%8.3f",*work_space_h); + // } + // printf("\n"); + // } + // printf("\n"); // } + + - if(upper) + printf("before finally, transpose:\n"); + cnrtMemcpy(work_space_h, d_output, sizeof(float)*lda*lda*2, CNRT_MEM_TRANS_DIR_DEV2HOST); + printf("real result:\n"); + for(int j = 0; j < lda; j++) { - cnrtQueueSync(queue); + for(int h = 0; h < lda; h++) + { + printf("%8.3f",work_space_h[j*lda+h]); + } + printf("\n"); + } + printf("\n"); + printf("imag result:\n"); + for(int j = 0; j < lda; j++) + { + for(int h = 0; h < lda; h++) + { + printf("%8.3f",work_space_h[lda*lda+j*lda+h]); + } + printf("\n"); + } + + // CHECK_RETURN("mluOpCholesky", + // sgemm(batch_size, false,true,row-j-jb,jb,j,-1.0f,1.0f, + // OFFSET_ROW(d_output,j+jb,0),lda,stride, + // OFFSET_ROW(d_output,j,0),lda,stride, + // OFFSET_ROW(d_output,j+jb,j),lda,stride, handle)); + // cnrtQueueSync(queue); + + // cnrtMemcpy(work_space_h, d_output, sizeof(float)*lda*lda*2, CNRT_MEM_TRANS_DIR_DEV2HOST); + // for(int i = 0; i < 2; i++) + // { + // for(int j = 0; j < lda; j++) + // { + // for(int h = 0; h < lda; h++) + // { + // // cnrtMemcpy(work_space_h, d_output+i*lda*lda+j*lda+h, sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); + // printf("%8.3f",work_space_h[i*lda*lda+j*lda+h]); + // } + // printf("\n"); + // } + // printf("\n"); + // } + CHECK_RETURN("mluOpCholesky", - transpose(size_a,d_output,d_output,handle)); + transpose(batch_size,2,size_a*size_a,d_output,workspace,handle)); + cnrtQueueSync(queue); + CNRT_CHECK(cnrtMemcpy(d_output, workspace, type_size*size_a*lda*batch_size, CNRT_MEM_TRANS_DIR_DEV2DEV)); + + // printf("after transpose, d_a:\n"); + + // for(int j = 0; j < lda; j++) + // { + // for(int h = 0; h < lda; h++) + // { + // cnrtMemcpy(work_space_h, d_output+j*lda*2+h*2, sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); + // cnrtMemcpy((work_space_h+1), d_output+j*lda*2+h*2+1, sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); + // printf("%8.3f,%8.3f ",work_space_h[0],work_space_h[1]); + // } + // printf("\n"); + // } } + + + // printf("matrix after calculate:\n"); + // for(int i = 0; i < batch_size; i++) + // { + // printf("batch %d:\n",i); + // for(int j = 0; j < size_a; j++) + // { + // for(int k = 0; k < size_a; k++) + // { + // cnrtMemcpy(work_space_h, d_output + i*stride+j*lda+k, sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); + // printf("%.2f ",work_space_h[0]); + // } + // printf("\n"); + // } + // } diff 
--git a/kernels/cholesky/cholesky.h b/kernels/cholesky/cholesky.h index e6ada4f8b..88afe06e1 100644 --- a/kernels/cholesky/cholesky.h +++ b/kernels/cholesky/cholesky.h @@ -21,11 +21,13 @@ #include "kernels/utils/cnnl_helper.h" -#define REC_NB (8) +#define REC_NB (16) #define POTF_NB ((REC_NB)/4) +#define CREC_NB (4) +#define CPOTF_NB ((CREC_NB)/4) #define __CNRT_FUNC_TYPE__ CNRT_FUNC_TYPE_UNION1 #define TASK_NUM (4) -#define NB (16) +#define NB (8) #define CLUSTER_NUM 1 #define M (TASK_NUM * POTF_NB) //POTF边长 #define ZERO 0.0 @@ -34,17 +36,31 @@ #define OFFSET_B_ROW(B, i, j) B + ((i) * (ldb) + (j)) -mluOpStatus_t mlu_spotrf_rectile(bool trans, bool uplo, int n, int recnb, float* dA, int ldda, int gbstep, mluOpHandle_t handle); +mluOpStatus_t mlu_spotrf_rectile(int batch, int stride, bool trans, bool uplo, int n, int recnb, float* dA, int ldda, int gbstep, mluOpHandle_t handle); // void mluOpCholesky(bool trans, bool uplo, int n, float* dA, float* dC, int ldda); -mluOpStatus_t ssyrk(bool upper, bool trans,int n, int k, float* d_a, int ldda, float* d_c, int lddc, mluOpHandle_t handle); +mluOpStatus_t ssyrk(int batch, int stride, bool upper, bool trans,int n, int k, float* d_a, int ldda, float* d_c, int lddc, mluOpHandle_t handle); -mluOpStatus_t sgemm(bool trans_a, bool trans_b, int m, int n, int k, float alpha, float beta, float* d_a,int lda, float* d_b, int ldb, float* d_c, int ldc, mluOpHandle_t handle); +mluOpStatus_t sgemm(int batch, bool trans_a, bool trans_b, int m, int n, int k, float alpha, float beta, float* d_a,int lda, int stride_a, float* d_b, int ldb, int stride_b, float* d_c, int ldc, int stride_c, mluOpHandle_t handle); //side:true->right // false->left -mluOpStatus_t strsm(bool upper, bool trans, int m, int n, float* d_a, int ldda, float* d_b, int lddb, mluOpHandle_t handle); +mluOpStatus_t strsm(int batch, int stride, bool upper, bool trans, int m, int n, float* d_a, int ldda, float* d_b, int lddb, mluOpHandle_t handle); -mluOpStatus_t transpose(int m, float* d_input,float* d_output, mluOpHandle_t handle); +mluOpStatus_t transpose(int batch, int m, int n,float* d_input,float* d_output, mluOpHandle_t handle); + +mluOpStatus_t mlu_cpotrf_rectile(int batch, int stride, int n, int recnb, float* drA, float* diA, int lda, mluOpHandle_t handle); + +mluOpStatus_t cgemm(int batch, bool trans_a, bool trans_b, int m, int n, int k, float alpha, float beta, float* d_ra, float* d_ia, int lda, int stride_a, float* d_rb, float* d_ib, int ldb, int stride_b, float* d_rc, float* d_ic, int ldc, int stride_c, mluOpHandle_t handle); + +mluOpStatus_t complex_malloc(size_t size, float** workspace); + +// mluOpStatus_t complex_set_half_zero(int batch, int stride, float* d_a, int m, int ld); + +mluOpStatus_t set_half_zero(int batch,int stride,float* d_a, int lda, int m, mluOpHandle_t handle); + +mluOpStatus_t ctrsm(int batch, int stride, int m, int n, float* rd_a, float* id_a, int lda, float* rd_b, float* id_b, int ldb, mluOpHandle_t handle); + +mluOpStatus_t cherk(int batch, int stride, int n,int k, float* rd_a, float* id_a, int lda, float* rd_c, float* id_c, int ldc, mluOpHandle_t handle); #endif \ No newline at end of file diff --git a/kernels/cholesky/cholesky_union1.mlu b/kernels/cholesky/cholesky_union1.mlu index 3fe714804..669fceb35 100644 --- a/kernels/cholesky/cholesky_union1.mlu +++ b/kernels/cholesky/cholesky_union1.mlu @@ -1,24 +1,30 @@ #include "cholesky.h" __nram__ uint8_t nram_buffer[MAX_NRAM_SIZE]; - +// __mlu_shared__ uint8_t sram_buffer[MAX_SRAM_SIZE]; + __mlu_func__ void 
sgemm_fixwidth_device(int m, int k, float* A0, const int lda, float *sC, float *sB) { - int id = taskId; + int id = taskId % 4; - int span = POTF_NB; + int span = POTF_NB;//span = remain > POTF_NB ? POTF_NB : remain; + //这个m和M不同!这个m是前面M-i的 __nram__ float rC[M * POTF_NB/TASK_NUM ]; __nram__ float rA[M * POTF_NB/TASK_NUM ]; __nram__ float rp[M * POTF_NB/TASK_NUM ]; __nram__ float rB[POTF_NB * POTF_NB]; + // __nram__ float rC_inter[POTF_NB * POTF_NB]; + // __wram__ float wB[POTF_NB * POTF_NB]; + //void __memcpy(void *dst, const void *src, unsigned int size, mluMemcpyDirection_t dir, unsigned int dst_stride, int src_stride, unsigned int segnum) + // row major if(id*span POTF_NB||remain <= 0) ? POTF_NB : remain; float *rA = (float*)nram_buffer + id * NB * NB * 4; + // float *rA = (float*)nram_buffer; float *rB = rA + NB * NB; @@ -150,9 +307,42 @@ void sgemm_anywidth_device(int m, int k, if(k>0) { + // if(id == 0) + // { + // printf("tmdsb\n"); + // for(int i = 0; i < m; i++) + // { + // for(int j = 0; j < 7;j ++) + // { + // printf("%.3f ",A0[i*lda+j]); + // } + // printf("\n"); + // } + // printf("k:%d\n",k); + // printf("m:%d\n",m); + // printf("lda:%d\n",lda); + // printf("span_b:%d\n",span_b); + // } __memcpy(rB,A0,k*sizeof(float),SRAM2NRAM,NB*sizeof(float),lda*sizeof(float),span_b-1); + // if(id == 0) + // { + // printf("after memcpy rB:\n"); + // for(int i = 0; i 0 && if_execute) + // { + // __memcpy(rB,A0,1*sizeof(float),SRAM2NRAM,1*sizeof(float),1*sizeof(float),0); + // } __sync_cluster(); @@ -190,12 +380,13 @@ void sgemm_anywidth_device(int m, int k, -} +} + static __mlu_func__ void spotf2_sminout_anysize_device(int m, float *A, int lda) { float factor; - int id = coreId; + int id = coreId % 4; int finish = id * POTF_NB; int remain = m - finish; bool if_execute = remain > 0; @@ -208,12 +399,17 @@ static __mlu_func__ void spotf2_sminout_anysize_device(int m, float *A, int lda) __sync_cluster(); for(int i = 0; i < span; i++) { + // if(iter == 0) + // printf("before: %.3f\n",A[i*POTF_NB+iter+id*span*POTF_NB]); if(if_execute) A[i*lda+iter+id*POTF_NB*lda] *= factor; + // if(iter == 0) + // printf("after: %.3f\n",A[i*POTF_NB+iter+id*span*POTF_NB]); } __sync_cluster(); + //TODO:可能要重点优化 if(if_execute) { for(int i = iter + 1; i < iter_num; i++) @@ -235,64 +431,228 @@ static __mlu_func__ void spotf2_sminout_anysize_device(int m, float *A, int lda) __mlu_func__ void spotf2_smlpout_fixwidth_device(const int m, float *A0, float *A, int lda, const int localstep, const int gbstep) { - int id = taskId; + int id = taskId % 4; __mlu_shared__ float shared_data[SHARED_MEM_SIZE]; float* sdata_A = shared_data; float* sdata_B = shared_data + m *POTF_NB/TASK_NUM * 4; + + // if(localstep == 8) + // { + // if(id == 0) + // { + // printf("before sgemm:\n"); + // } + // for(int i = 0; i =j) + // { + // A[i*lda+j] = sdata_A[coreId*span*POTF_NB+i*POTF_NB+j]; + // } + // } __memcpy(A+(i*lda),sdata_A+i*POTF_NB,(i+1)*sizeof(float),SRAM2LDRAM); + // __memcpy(work_space+(i*NB),sdata_A+i*POTF_NB,(i+1)*sizeof(float),SRAM2LDRAM); } } else if(id*span < m) { __memcpy(A+(id*POTF_NB*lda),sdata_A+coreId*POTF_NB*POTF_NB,POTF_NB*sizeof(float),SRAM2LDRAM,lda*sizeof(float),POTF_NB*sizeof(float),span-1); + // __memcpy(work_space+(id*POTF_NB*NB),sdata_A+coreId*POTF_NB*POTF_NB,POTF_NB*sizeof(float),SRAM2LDRAM,NB*sizeof(float),POTF_NB*sizeof(float),span-1); } __sync_cluster(); + // if(id==3) + // { + // printf("sdata:\n"); + // for(int i = 0; i 0; + // int span = remain > POTF_NB ? 
POTF_NB : remain; + // __sync_cluster(); sgemm_anywidth_device(m, localstep, A0, lda, A, nullptr); + + // __sync_cluster(); + + // if(id==3) + // { + // printf("sdata:\n"); + // for(int i = 0; i =j) + // // { + // // A[i*lda+j] = sdata_A[coreId*span*POTF_NB+i*POTF_NB+j]; + // // } + // // } + // __memcpy(A+(i*lda),sdata_A+i*POTF_NB,(i+1)*sizeof(float),SRAM2LDRAM); + // // __memcpy(work_space+(i*NB),sdata_A+i*POTF_NB,(i+1)*sizeof(float),SRAM2LDRAM); + // } + + // } + // else if(if_execute) + // { + // __memcpy(A+(id*POTF_NB*lda),sdata_A+coreId*POTF_NB*POTF_NB,POTF_NB*sizeof(float),SRAM2LDRAM,lda*sizeof(float),POTF_NB*sizeof(float),span-1); + // // __memcpy(work_space+(id*POTF_NB*NB),sdata_A+coreId*POTF_NB*POTF_NB,POTF_NB*sizeof(float),SRAM2LDRAM,NB*sizeof(float),POTF_NB*sizeof(float),span-1); + // } + + // __sync_cluster(); + + // if(id==3) + // { + // printf("sdata:\n"); + // for(int i = 0; i = batch) + return; + dA = orignA + batch_id * stride; __mlu_shared__ float shared_data[NB * NB]; @@ -309,6 +669,17 @@ __mlu_global__ void spotf2_smlpin_anywidth_kernel(bool trans, int m, float *dA, if(id == 0) { __memcpy(shared_data,dA,m*sizeof(float),GDRAM2SRAM,NB*sizeof(float),lda*sizeof(float),m-1); + //printf shared_data + // printf("shared_data:\n"); + // for(int i = 0; i < m; i++) + // { + // for(int j = 0; j < m; j++) + // { + // printf("%.3f ",shared_data[i*NB+j]); + // } + // printf("\n"); + // } + // printf("localstep:%d\n",localstep); } __sync_cluster(); @@ -325,40 +696,479 @@ __mlu_global__ void spotf2_smlpin_anywidth_kernel(bool trans, int m, float *dA, } __sync_cluster(); } + // } + + + +} + +__mlu_func__ +void small_sgemm_batch(int m, int k, + float* A0, const int lda,int width, + float* dst, float* nram_remain) +{ + //dst和dst2形状: m*width src1形状:m*k src2形状:width*k + int ldk = k; + int ldm = m; + float* src1 = nram_remain; + float* src2 = src1 + ldk * ldm; + float* dst2 = src2 + width * ldk; + + float* dA = A0 + k; + __memcpy_async(dst, dA, width*sizeof(float),GDRAM2NRAM,width*sizeof(float),lda*sizeof(float),m-1); + + if(k == 0) + { + __sync(); + return; + } + __memset_nram(src1,ldm*ldk,(float)ZERO); + + __memcpy_async(src1, A0, k*sizeof(float),GDRAM2NRAM,ldk*sizeof(float),lda*sizeof(float),m-1); + + __memset_nram(dst2,ldm*width,(float)ZERO); + + __sync(); + + __memcpy(src2, src1, ldk*width*sizeof(float),NRAM2NRAM); + + for(int i = 0; i < m; i++) + { + for(int j = 0; j < width; j++) + { + for(int h = 0; h < k; h++) + { + dst2[i*width+j] += src1[i*ldk+h] * src2[j*ldk+h]; + } + } + } + + __bang_sub(dst,dst,dst2,width * m); + + __sync(); +} + +__mlu_func__ void small_sminout_batch(int m, int width, float *dst, float *nram_remain, int lda) +{ + float factor; + // __nram__ uint8_t nram_buffer[MAX_NRAM_SIZE]; + float* diag = dst; + + for(int iter = 0; iter < width; iter++) + { + factor=sqrt(diag[iter*width+iter]); + factor = 1.0/factor; + // __sync_cluster(); + for(int i = 0; i < m; i ++) + { + dst[i*width+iter] *= factor; + } + __sync(); + for(int i = iter + 1; i < width; i++) + { + for(int j = 0; j < m; j++) + { + dst[j * width + i ] -= dst[i*width+iter] * dst[j * width + iter]; + + // nram_src[j * POTF_NB + i ] -= diag[i*POTF_NB+iter] * nram_src[j * POTF_NB + iter]; + } + } + __sync(); + + // for(int i = 0; i < width; i++) + // { + // // if(iter == 0) + // // printf("before: %.3f\n",A[i*POTF_NB+iter+id*span*POTF_NB]); + // nram_src[i*POTF_NB+iter] *= factor; + // diag[i*POTF_NB+iter] *= factor; + // // if(iter == 0) + // // printf("after: %.3f\n",A[i*POTF_NB+iter+id*span*POTF_NB]); + + // } 
+ + + + // for(int i = iter + 1; i < POTF_NB; i++) + // { + // for(int j = 0; j < span; j++) + // { + // diag[j * POTF_NB + i ] -= diag[i*POTF_NB+iter] * diag[j * POTF_NB + iter]; + // nram_src[j * POTF_NB + i ] -= diag[i*POTF_NB+iter] * nram_src[j * POTF_NB + iter]; + // } + // } + + + + } + __sync(); + + // __memcpy(nram_src,A + id *span*POTF_NB,span*span*sizeof(float),SRAM2NRAM); + // __sync(); + //print diag and nram_src + // if(id*span < m) + // { + // printf("before sminout,id:%d\n",id); + // printf("diag:\n"); + // for(int i = 0; i < span; i++) + // { + // for(int j = 0; j < span; j++) + // { + // printf("%.3f ",diag[i*span+j]); + // } + // printf("\n"); + // } + // printf("nram_src:\n"); + // for(int i = 0; i < span; i++) + // { + // for(int j = 0; j < span; j++) + // { + // printf("%.3f ",nram_src[i*span+j]); + // } + // printf("\n"); + // } + // } + + // __sync_cluster(); + + // if(id * span < m) + // { + // printf("after sminout,id:%d\n",id); + // printf("diag:\n"); + // for(int i = 0; i < span; i++) + // { + // for(int j = 0; j < span; j++) + // { + // printf("%.3f ",diag[i*span+j]); + // } + // printf("\n"); + // } + // printf("nram_src:\n"); + // for(int i = 0; i < span; i++) + // { + // for(int j = 0; j < span; j++) + // { + // printf("%.3f ",nram_src[i*span+j]); + // } + // printf("\n"); + // } + // } + + // if(id*span width) + { + __memcpy(A+(width*lda),dst+width*width,width*sizeof(float),NRAM2GDRAM,lda*sizeof(float),width*sizeof(float),m-width-1); + } + + __sync(); + + +} + +__mlu_global__ void spotf2_batch_kernel(int batch, int stride, int m, float *dA, int lda) +{ + int id = taskId; + int batch_id = id; + if(batch_id >= batch) + return; + float* orignA = dA; + dA = orignA + batch_id * stride; + int width = POTF_NB; + int span = width; + + for(int i = 0; i < m; i += width) + { + span = std::min(width, m - i); + smlpout_batch(m-i,dA+i*lda,dA+i*lda+i,lda,i,span); + } } -mluOpStatus_t mlu_spotf2_lpin(bool trans,bool uplo, int n, int ldda, float* dA, int gbstep, cnrtQueue_t queue) +mluOpStatus_t mlu_spotf2_lpin(int batch, int stride, bool trans,bool uplo, int n, int ldda, float* dA, int gbstep, cnrtQueue_t queue) { cnrtDim3_t dim; - cnrtFunctionType_t func_type = __CNRT_FUNC_TYPE__; - dim.x = TASK_NUM; + cnrtFunctionType_t func_type = CNRT_FUNC_TYPE_BLOCK; dim.y = 1; dim.z = 1; - - KERNEL_CHECK( - spotf2_smlpin_anywidth_kernel<<>>(trans, n, dA, ldda, 0,gbstep)); + if(batch > 1) + { + dim.x = batch; + KERNEL_CHECK( + spotf2_batch_kernel<<>>(batch, stride, n, dA, ldda)); + } + else + { + int carry_batch = batch; + if(batch == 1) + { + func_type = CNRT_FUNC_TYPE_UNION1; + } + else if(batch == 2) + { + func_type = CNRT_FUNC_TYPE_UNION2; + } + else if(batch <= 4) + { + func_type = CNRT_FUNC_TYPE_UNION4; + carry_batch = 4; + } + else + { + func_type = CNRT_FUNC_TYPE_UNION8; + carry_batch = batch < 8 ? 
8 : batch; + } + dim.x = carry_batch * 4; + KERNEL_CHECK( + spotf2_smlpin_anywidth_kernel<<>>(batch, stride, trans, n, dA, ldda, 0,gbstep)); + + } + // dim.x = TASK_NUM * 4; + + // } + // cnrtQueueSync(queue); + // float* h_i; + // h_i = (float*)malloc(n*n*sizeof(float)); + // for(int i = 0; i < n; i ++) + // { + // cnrtMemcpy(h_i+i*n, work_space+i*NB, n*sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); + // } + + // cnrtQueueSync(queue); + // //print h_i + // printf("work_space after mlu_spotf2_lpin:\n"); + // for(int i = 0; i < n; i++) + // { + // for(int l = 0; l < n; l++) + // { + // printf("%.3f ",h_i[i*n+l]); + // } + // printf("\n"); + // } + // cnrtQueueSync(queue); return MLUOP_STATUS_SUCCESS; } +__mlu_entry__ void mlu_strsm_rectile_batch_kernel( + int batch, int stride, + int m,int n, bool trans, + float *dA, int32_t lda, + float *dB, int32_t ldb) +{ + int id = taskId; + int batch_id = id; + if(batch_id >= batch) + return; + float* orignA = dA; + float* orignB = dB; + dA = orignA + batch_id * stride; + dB = orignB + batch_id * stride; + + // int remain = n - id * POTF_NB; + // bool if_execute = remain > 0; + // int span = (remain > POTF_NB || remain <= 0) ? POTF_NB : remain; + int span = n; + int start = 0; + + __nram__ float sA[8*POTF_NB]; + __nram__ float rB[4*POTF_NB * 8*POTF_NB]; + __nram__ float rC[4*POTF_NB * 8*POTF_NB]; + __nram__ float rBp[4*POTF_NB]; + __nram__ float rA[8*POTF_NB]; + int calc_length = (8 * POTF_NB) > m ? m : (8 * POTF_NB); + __memset_nram(rB,POTF_NB*calc_length,(float)ZERO); + __memset_nram(sA,calc_length*calc_length,(float)ZERO); + + + float temp_b = 0, factor = 0; + + + + __memcpy_async(sA,dA,sizeof(float),GDRAM2NRAM); + + __memcpy(rBp,OFFSET_B_ROW(dB,start,0),sizeof(float),GDRAM2NRAM,sizeof(float), ldb * sizeof(float), span - 1); + __sync(); + // if(id == 3) + // { + // printf("sA[0]:%.3f\n",sA[0]); + // printf("dA[0]:%.3f\n",dA[0]); + // } + + if(trans) + { + __memcpy_async(rA,sA,(1)*sizeof(float),NRAM2NRAM); + __memcpy_async(rB,rBp,sizeof(float),NRAM2NRAM,calc_length * sizeof(float), sizeof(float), span - 1); + __sync(); + + __memcpy_async(sA,OFFSET_ROW(dA,1,0),2*sizeof(float),GDRAM2NRAM); + __memcpy_async(rBp,OFFSET_B_ROW(dB,start,1),sizeof(float),GDRAM2NRAM,sizeof(float), ldb * sizeof(float), span - 1); + factor = 1.0 / rA[0]; + for(int i = 0; i < span; i++) + { + //void __bang_mul(float *dst, const float *src0, const float *src1, unsigned int elem_count) + //float __bang_sum(const float *src, unsigned int elem_count) + rB[i*calc_length] *= factor; + } + + __sync(); + + for(int iter = 1; iter < m - 1; iter++) + { + __memcpy_async(rA,sA,(iter+1)*sizeof(float),NRAM2NRAM); + __memcpy_async(rB+iter,rBp,sizeof(float),NRAM2NRAM,calc_length * sizeof(float), sizeof(float), span - 1); + __sync(); + + __memcpy_async(sA,OFFSET_ROW(dA,iter+1,0),(iter+2)*sizeof(float),GDRAM2NRAM); + __memcpy_async(rBp,OFFSET_B_ROW(dB,start,iter+1),sizeof(float),GDRAM2NRAM,sizeof(float), ldb * sizeof(float), span - 1); + factor = 1.0 / rA[iter]; + for(int i = 0; i < span; i++) + { + //void __bang_mul(float *dst, const float *src0, const float *src1, unsigned int elem_count) + __bang_mul(rC+i*calc_length,rA,rB+i*calc_length,iter); + //float __bang_sum(const float *src, unsigned int elem_count) + temp_b = 0; + //reduce add rC + for(int j = 0; j < iter; j++) + { + temp_b += rC[i*calc_length+j]; + } + temp_b = rB[i*calc_length+iter] - temp_b; + rB[i*calc_length+iter] = temp_b * factor; + } + + __sync(); + } + + __memcpy_async(rA,sA,(m)*sizeof(float),NRAM2NRAM); + 
__memcpy_async(rB+m-1,rBp,sizeof(float),NRAM2NRAM,calc_length * sizeof(float), sizeof(float), span - 1); + __sync(); + factor = 1.0 / rA[m-1]; + for(int i = 0; i < span; i++) + { + //void __bang_mul(float *dst, const float *src0, const float *src1, unsigned int elem_count) + __bang_mul(rC+i*calc_length,rA,rB+i*calc_length,m-1); + + temp_b = 0; + //reduce add rC + for(int j = 0; j < m-1; j++) + { + temp_b += rC[i*calc_length+j]; + } + temp_b = rB[i*calc_length+m-1] - temp_b; + + rB[i*calc_length+m-1] = temp_b * factor; + } + __sync(); + + + __memcpy(OFFSET_B_ROW(dB,start,0),rB,calc_length*sizeof(float),NRAM2GDRAM,ldb * sizeof(float), calc_length * sizeof(float), span - 1); + __sync(); + + } + +} + __mlu_entry__ void mlu_strsm_rectile_kernel( + int batch, int stride, int m,int n, bool trans, float *dA, int32_t lda, float *dB, int32_t ldb) { int id = taskId; + int batch_id = id / 4; + if(batch_id >= batch) + return; + id = id % 4; + float* orignA = dA; + float* orignB = dB; + dA = orignA + batch_id * stride; + dB = orignB + batch_id * stride; - + // int remain = n - id * POTF_NB; + // bool if_execute = remain > 0; + // int span = (remain > POTF_NB || remain <= 0) ? POTF_NB : remain; int span = n / 4; int start = id * span; if(id == 3) { span = n - 3 * span; } + bool if_execute = span > 0; __mlu_shared__ float sA[8*POTF_NB]; __nram__ float rB[4*POTF_NB * 8*POTF_NB]; @@ -371,6 +1181,9 @@ __mlu_entry__ void mlu_strsm_rectile_kernel( float temp_b = 0, factor = 0; + float sum = 0.0; + float c = 0.0; + float t = 0.0; if(id == 0) @@ -380,7 +1193,11 @@ __mlu_entry__ void mlu_strsm_rectile_kernel( if(if_execute) __memcpy(rBp,OFFSET_B_ROW(dB,start,0),sizeof(float),LDRAM2NRAM,sizeof(float), ldb * sizeof(float), span - 1); __sync_cluster(); - + // if(id == 3) + // { + // printf("sA[0]:%.3f\n",sA[0]); + // printf("dA[0]:%.3f\n",dA[0]); + // } if(trans) { @@ -397,6 +1214,8 @@ __mlu_entry__ void mlu_strsm_rectile_kernel( factor = 1.0 / rA[0]; for(int i = 0; i < span; i++) { + //void __bang_mul(float *dst, const float *src0, const float *src1, unsigned int elem_count) + //float __bang_sum(const float *src, unsigned int elem_count) rB[i*calc_length] *= factor; } @@ -417,12 +1236,26 @@ __mlu_entry__ void mlu_strsm_rectile_kernel( factor = 1.0 / rA[iter]; for(int i = 0; i < span; i++) { + //void __bang_mul(float *dst, const float *src0, const float *src1, unsigned int elem_count) __bang_mul(rC+i*calc_length,rA,rB+i*calc_length,iter); + //float __bang_sum(const float *src, unsigned int elem_count) temp_b = 0; + sum = 0.0; + c = 0.0; + t = 0.0; + //reduce add rC + // for(int j = 0; j < iter; j++) + // { + // temp_b += rC[i*calc_length+j]; + // } for(int j = 0; j < iter; j++) { - temp_b += rC[i*calc_length+j]; + temp_b = rC[i*calc_length+j] - c; //So far, so good: c is zero. + t = sum + temp_b; //Alas, sum is big, y small, so low-order digits of y are lost. 
+ c = (t - sum) - temp_b; //(t - sum) recovers the high-order part of y; subtracting y recovers -(low part of y) + sum = t; } + temp_b = sum; temp_b = rB[i*calc_length+iter] - temp_b; rB[i*calc_length+iter] = temp_b * factor; } @@ -437,15 +1270,27 @@ __mlu_entry__ void mlu_strsm_rectile_kernel( factor = 1.0 / rA[m-1]; for(int i = 0; i < span; i++) { + //void __bang_mul(float *dst, const float *src0, const float *src1, unsigned int elem_count) __bang_mul(rC+i*calc_length,rA,rB+i*calc_length,m-1); + sum = 0.0; + c = 0.0; + t = 0.0; temp_b = 0; + //reduce add rC + // for(int j = 0; j < m-1; j++) + // { + // temp_b += rC[i*calc_length+j]; + // } for(int j = 0; j < m-1; j++) { - temp_b += rC[i*calc_length+j]; + temp_b = rC[i*calc_length+j] - c; //So far, so good: c is zero. + t = sum + temp_b; //Alas, sum is big, y small, so low-order digits of y are lost. + c = (t - sum) - temp_b; //(t - sum) recovers the high-order part of y; subtracting y recovers -(low part of y) + sum = t; } + temp_b = sum; temp_b = rB[i*calc_length+m-1] - temp_b; - rB[i*calc_length+m-1] = temp_b * factor; } __sync_cluster(); @@ -461,38 +1306,448 @@ __mlu_entry__ void mlu_strsm_rectile_kernel( } - - -mluOpStatus_t strsm_rectile(bool upper, bool trans, int m, int n, float *d_a, int lda, float *d_b, int lddb, cnrtQueue_t queue) +// __mlu_entry__ void mlu_strsm_rectile_kernel( +// int m,int n, bool trans, +// float *dA, int32_t lda, +// float *dB, int32_t ldb) +// { +// int id = taskId; +// int remain = n - id * POTF_NB; +// bool if_execute = remain > 0; +// int span = (remain > POTF_NB || remain <= 0) ? POTF_NB : remain; +// __mlu_shared__ float sA[REC_NB]; +// __nram__ float rB[POTF_NB * REC_NB]; +// __nram__ float rC[POTF_NB * REC_NB]; +// __nram__ float rBp[POTF_NB]; +// __nram__ float rA[REC_NB]; +// __memset_nram(rB,POTF_NB*REC_NB,(float)ZERO); +// __sramset(sA,REC_NB*REC_NB,0); + + +// float temp_b = 0, factor = 0; + +// if(id == 0) +// { +// __memcpy_async(sA,dA,sizeof(float),LDRAM2SRAM); +// } +// __memcpy(rBp,OFFSET_B_ROW(dB,id*POTF_NB,0),sizeof(float),LDRAM2NRAM,sizeof(float), ldb * sizeof(float), span - 1); +// __sync_cluster(); +// // if(id == 3) +// // { +// // printf("sA[0]:%.3f\n",sA[0]); +// // printf("dA[0]:%.3f\n",dA[0]); +// // } + +// if(trans) +// { +// __memcpy_async(rA,sA,(1)*sizeof(float),SRAM2NRAM); + +// __memcpy_async(rB,rBp,sizeof(float),NRAM2NRAM,REC_NB * sizeof(float), sizeof(float), span - 1); +// __sync_cluster(); +// // if(id == 0) +// // printf("rA[0]:%.3f\n",rA[0]); +// //print rB +// // printf("id :%d\n",id); +// // printf("before calculation\n"); +// // for(int i = 0; i < span; i++) +// // { +// // printf("rB[%d]:%.3f\n",i,rB[i*REC_NB]); +// // } +// if(id == 0) +// { +// __memcpy_async(sA,OFFSET_ROW(dA,1,0),2*sizeof(float),LDRAM2SRAM); +// } +// __memcpy_async(rBp,OFFSET_B_ROW(dB,id*POTF_NB,1),sizeof(float),LDRAM2NRAM,sizeof(float), ldb * sizeof(float), span - 1); +// factor = 1.0 / rA[0]; +// for(int i = 0; i < span; i++) +// { +// //void __bang_mul(float *dst, const float *src0, const float *src1, unsigned int elem_count) +// //float __bang_sum(const float *src, unsigned int elem_count) +// rB[i*REC_NB] *= factor; +// } +// //print rB after first calculation +// // printf("id :%d\n",id); +// // printf("after first calculation\n"); +// // for(int i = 0; i < span; i++) +// // { +// // printf("rB[%d]:%.3f\n",i,rB[i*REC_NB]); +// // } + +// __sync_cluster(); + +// for(int iter = 1; iter < m - 1; iter++) +// { +// __memcpy_async(rA,sA,(iter+1)*sizeof(float),SRAM2NRAM); + +// 
__memcpy_async(rB+iter,rBp,sizeof(float),NRAM2NRAM,REC_NB * sizeof(float), sizeof(float), span - 1); +// __sync_cluster(); +// //printf rA +// // printf("id :%d\n",id); +// // printf("rA:\n"); +// // for(int i = 0; i < iter+1; i++) +// // { +// // printf("%.3f ",rA[i]); +// // } +// //printf rB +// // printf("\n"); +// // printf("rB:\n"); +// // for(int i = 0; i < span; i++) +// // { +// // for(int j = 0; j < iter+1; j++) +// // { +// // printf("%.3f ",rB[i*REC_NB+j]); +// // } +// // printf("\n"); +// // } +// if(id == 0) +// { +// __memcpy_async(sA,OFFSET_ROW(dA,iter+1,0),(iter+2)*sizeof(float),LDRAM2SRAM); +// } +// __memcpy_async(rBp,OFFSET_B_ROW(dB,id*POTF_NB,iter+1),sizeof(float),LDRAM2NRAM,sizeof(float), ldb * sizeof(float), span - 1); +// factor = 1.0 / rA[iter]; +// for(int i = 0; i < span; i++) +// { +// //void __bang_mul(float *dst, const float *src0, const float *src1, unsigned int elem_count) +// __bang_mul(rC+i*REC_NB,rA,rB+i*REC_NB,iter); +// //float __bang_sum(const float *src, unsigned int elem_count) +// temp_b = 0; +// //reduce add rC +// for(int j = 0; j < iter; j++) +// { +// temp_b += rC[i*REC_NB+j]; +// } +// temp_b = rB[i*REC_NB+iter] - temp_b; +// rB[i*REC_NB+iter] = temp_b * factor; +// } + +// __sync_cluster(); +// } + +// __memcpy_async(rA,sA,(m)*sizeof(float),SRAM2NRAM); + +// __memcpy_async(rB+m-1,rBp,sizeof(float),NRAM2NRAM,REC_NB * sizeof(float), sizeof(float), span - 1); +// __sync_cluster(); +// factor = 1.0 / rA[m-1]; +// for(int i = 0; i < span; i++) +// { +// //void __bang_mul(float *dst, const float *src0, const float *src1, unsigned int elem_count) +// __bang_mul(rC+i*REC_NB,rA,rB+i*REC_NB,m-1); + +// temp_b = 0; +// //reduce add rC +// for(int j = 0; j < m-1; j++) +// { +// temp_b += rC[i*REC_NB+j]; +// } +// temp_b = rB[i*REC_NB+m-1] - temp_b; + +// rB[i*REC_NB+m-1] = temp_b * factor; +// } +// __sync_cluster(); + +// // printf("id:%d\n",id); +// //print rB after complete calculation +// // printf("after complete calculation\n"); +// // for(int i = 0; i < span; i++) +// // { +// // for(int j = 0; j < REC_NB; j++) +// // { +// // printf("%.3f ",rB[i*REC_NB+j]); +// // } +// // printf("\n"); +// // } + +// if(if_execute) +// { +// __memcpy(OFFSET_B_ROW(dB,id*POTF_NB,0),rB,REC_NB*sizeof(float),NRAM2LDRAM,ldb * sizeof(float), REC_NB * sizeof(float), span - 1); +// } +// __sync_cluster(); + +// } + +// } + +mluOpStatus_t strsm_rectile(int batch, int stride, bool upper, bool trans, int m, int n, float *d_a, int lda, float *d_b, int lddb, cnrtQueue_t queue) { cnrtDim3_t dim; - dim.x = TASK_NUM; + + cnrtFunctionType_t func_type = CNRT_FUNC_TYPE_BLOCK; + dim.y = 1; dim.z = 1; - cnrtFunctionType_t func_type = __CNRT_FUNC_TYPE__; - if(!upper && trans) + + if(batch>16) { + dim.x = batch; KERNEL_CHECK( - mlu_strsm_rectile_kernel<<>>(m,n,trans,d_a,lda,d_b,lddb)); + mlu_strsm_rectile_batch_kernel<<>>(batch,stride,m,n,trans,d_a,lda,d_b,lddb)); + } + else + { + int carry_batch = batch; + if(batch == 1) + { + func_type = CNRT_FUNC_TYPE_UNION1; + } + else if(batch == 2) + { + func_type = CNRT_FUNC_TYPE_UNION2; + } + else if(batch <= 4) + { + func_type = CNRT_FUNC_TYPE_UNION4; + carry_batch = 4; + } + else + { + func_type = CNRT_FUNC_TYPE_UNION8; + carry_batch = batch < 8 ? 
8 : batch; + } + dim.x = carry_batch * 4; + + if(!upper && trans) + { + KERNEL_CHECK( + mlu_strsm_rectile_kernel<<>>(batch,stride,m,n,trans,d_a,lda,d_b,lddb)); + } } + + return MLUOP_STATUS_SUCCESS; } + + +// d_c = d_c - src +// __mlu_global__ +// void add_c(float *d_c, float* src,int ldc, int ldsrc, int m, int n) +// { +// int id = taskId; +// int span = m/4; + +// float* start_c = d_c + id * span * ldc; +// float* start_src = src + id * span * ldsrc; +// float* temp_c = start_c, *temp_src =start_src; +// int32_t align_num = NFU_ALIGN_SIZE / sizeof(float); +// int32_t data_nram_num = MAX_NRAM_SIZE / sizeof(float) / 2 / align_num * align_num; +// __nram__ uint8_t nram_buffer[MAX_NRAM_SIZE]; + +// if (id == 3) +// { +// span = m - 3 * span; +// } + // float *rC = (float *)nram_buffer; + // float *rsrc = (float *)nram_buffer + data_nram_num; +// int k = n/data_nram_num; +// int remain = n - k * data_nram_num; +// for(int i = 0; i < span; i++) +// { +// temp_c = start_c + i * ldc; +// temp_src = start_src + i * ldsrc; + // for(int i = 0; i < k; i++) + // { + // __memcpy(rC,temp_c,data_nram_num*sizeof(float),GDRAM2NRAM); + // __memcpy(rsrc,temp_src,data_nram_num*sizeof(float),GDRAM2NRAM); + // temp_c += data_nram_num; + // temp_src += data_nram_num; + // __sync(); + // __bang_add(rC, rC, rsrc, data_nram_num); + // __memcpy(temp_c - data_nram_num,rC,data_nram_num*sizeof(float),NRAM2GDRAM); + // __sync_cluster(); + // } + // if(remain > 0) + // { + // __memcpy(rC,temp_c,remain*sizeof(float),GDRAM2NRAM); + // __memcpy(rsrc,temp_src,remain*sizeof(float),GDRAM2NRAM); + // __sync(); + // __bang_add(rC, rC, rsrc, remain); + // __memcpy(temp_c,rC,remain*sizeof(float),NRAM2GDRAM); + // __sync_cluster(); + // } + + +// } + +// } + + +// __mlu_global__ +// void add_c(float beta, float *d_c, float* src,int ldc, int ldsrc, int m, int n) +// { + + +// __mlu_shared__ uint8_t sram_buffer[MAX_SRAM_SIZE]; +// if (beta == 0.0f) +// { +// if(taskId == 0) +// { +// __memcpy(sram_buffer,src,n*sizeof(float),GDRAM2SRAM,n*sizeof(float),ldsrc*sizeof(float),m-1); + + +// } +// __sync_cluster(); +// if(taskId == 0) +// { +// __memcpy(d_c,sram_buffer,n*sizeof(float),SRAM2LDRAM,ldc*sizeof(float),n*sizeof(float),m-1); +// } +// __sync_cluster(); +// return; +// } + + +// if (taskId == 0) { +// __memcpy(sram_buffer,d_c,n*sizeof(float),GDRAM2SRAM,n*sizeof(float),ldc*sizeof(float),m-1); +// } + +// __sync_cluster(); + + +// int32_t data_num = m*n; +// int32_t data_per_core = data_num / taskDim; +// int32_t data_last_core = data_per_core + data_num % taskDim; +// const float *a_offset = src + taskId * data_per_core; +// const float *b_offset = (float*)sram_buffer + taskId * data_per_core; +// float *output_offset = (float*)sram_buffer + taskId * data_per_core; + +// if (taskId == taskDim - 1) { +// data_per_core = data_last_core; +// } + +// int32_t align_num = NFU_ALIGN_SIZE / sizeof(float); +// int32_t data_nram_num = +// MAX_NRAM_SIZE / sizeof(float) / 2 / align_num * align_num; +// float *a_nram = (float *)nram_buffer; +// float *b_nram = (float *)a_nram + data_nram_num; +// int32_t loop_num = data_per_core / data_nram_num; +// int32_t rem_nram_num = data_per_core % data_nram_num; + +// for (int32_t i = 0; i < loop_num; i++) { +// __memcpy(a_nram, a_offset + i * data_nram_num, +// data_nram_num * sizeof(float), GDRAM2NRAM); +// __memcpy(b_nram, b_offset + i * data_nram_num, +// data_nram_num * sizeof(float), SRAM2NRAM); +// __bang_add(a_nram, a_nram, b_nram, data_nram_num); +// __memcpy(output_offset + i * data_nram_num, 
a_nram, +// data_nram_num * sizeof(float), NRAM2SRAM); +// } +// if (rem_nram_num != 0) { +// int32_t rem_align_num = +// (rem_nram_num + align_num - 1) / align_num * align_num; +// __memcpy(a_nram, a_offset + loop_num * data_nram_num, +// rem_nram_num * sizeof(float), GDRAM2NRAM); +// __memcpy(b_nram, b_offset + loop_num * data_nram_num, +// rem_nram_num * sizeof(float), SRAM2NRAM); +// __bang_add(a_nram, a_nram, b_nram, rem_align_num); +// __memcpy(output_offset + loop_num * data_nram_num, a_nram, +// rem_nram_num * sizeof(float), NRAM2SRAM); +// } +// __sync_cluster(); + +// if (taskId == 0) { +// __memcpy(d_c,sram_buffer,n*sizeof(float),SRAM2LDRAM,ldc*sizeof(float),n*sizeof(float),m-1); +// } + +// __sync_cluster(); + +// } + __mlu_global__ -void add_c(float beta, float *d_c, float* src,int ldc, int ldsrc, int m, int n) +void add_c_batch(int batch, int stride, float beta, float *d_c, float* src,int ldc, int ldsrc, int m, int n) { +// __nram__ uint8_t nram_buffer[MAX_NRAM_SIZE]; + int id = taskId; + int batch_id = id; + if(batch_id >= batch) + return; + float* orignC = d_c; + float* orignSrc = src; + d_c = orignC + batch_id * stride; + src = orignSrc + batch_id * m*n; + - __mlu_shared__ uint8_t sram_buffer[MAX_SRAM_SIZE]; if (beta == 0.0f) { - if(taskId == 0) + + // __memcpy(nram_buffer,src,n*sizeof(float),GDRAM2NRAM,n*sizeof(float),ldsrc*sizeof(float),m-1); + + // __memcpy(d_c,nram_buffer,n*sizeof(float),NRAM2GDRAM,ldc*sizeof(float),n*sizeof(float),m-1); + __memcpy(d_c,src,n*sizeof(float),GDRAM2GDRAM,ldc*sizeof(float),ldsrc*sizeof(float),m-1); + return; + } + + float* a_sram = (float*)nram_buffer + m * n; + + __memcpy(nram_buffer,d_c,n*sizeof(float),LDRAM2NRAM,n*sizeof(float),ldc*sizeof(float),m-1); + __memcpy(a_sram,src,n*m*sizeof(float),LDRAM2NRAM); + + __sync(); + + + int32_t data_num = m*n; + const float *a_offset = a_sram; + const float *b_offset = (float*)nram_buffer; + + float *a_nram = (float *)a_offset; + float *b_nram = (float *)b_offset; + + __bang_add(b_nram, a_nram, b_nram, data_num); + + __memcpy(d_c,b_nram,n*sizeof(float),NRAM2LDRAM,ldc*sizeof(float),n*sizeof(float),m-1); + + + __sync(); + +} + +__mlu_global__ +void add_c(int batch, int stride, float beta, float *d_c, float* src,int ldc, int ldsrc, int m, int n) +{ + +// __nram__ uint8_t nram_buffer[MAX_NRAM_SIZE]; + int id = taskId; + int ipu_per_cluster = 4; + int batch_id = id / ipu_per_cluster; + if(batch_id >= batch) + return; + id = taskId % ipu_per_cluster; + float* orignC = d_c; + float* orignSrc = src; + d_c = orignC + batch_id * stride; + src = orignSrc + batch_id * m*n; + + // if(batch_id == 1 && id== 0) + // { + // printf("add_c d_c:\n"); + // for(int i = 0; i < m; i++) + // { + // for(int j = 0; j < n; j++) + // { + // printf("%.3f ",d_c[i*ldc+j]); + // } + // printf("\n"); + // } + // printf("add_c src:\n"); + // for(int i = 0; i < m; i++) + // { + // for(int j = 0; j < n; j++) + // { + // printf("%.3f ",src[i*n+j]); + // } + // printf("\n"); + // } + // } + + __mlu_shared__ uint8_t sram_buffer[MAX_SRAM_SIZE]; + if (beta == 0.0f) + { + if(id == 0) { __memcpy(sram_buffer,src,n*sizeof(float),GDRAM2SRAM,n*sizeof(float),ldsrc*sizeof(float),m-1); } __sync_cluster(); - if(taskId == 0) + if(id == 0) { __memcpy(d_c,sram_buffer,n*sizeof(float),SRAM2LDRAM,ldc*sizeof(float),n*sizeof(float),m-1); } @@ -502,7 +1757,7 @@ void add_c(float beta, float *d_c, float* src,int ldc, int ldsrc, int m, int n) float* a_sram = (float*)sram_buffer + 3* m * n; - if (taskId == 0) { + if (id == 0) { 
__memcpy(sram_buffer,d_c,n*sizeof(float),GDRAM2SRAM,n*sizeof(float),ldc*sizeof(float),m-1); __memcpy(a_sram,src,n*m*sizeof(float),GDRAM2SRAM); } @@ -511,18 +1766,19 @@ void add_c(float beta, float *d_c, float* src,int ldc, int ldsrc, int m, int n) int32_t data_num = m*n; - int32_t data_per_core = data_num / taskDim; - int32_t data_last_core = data_per_core + data_num % taskDim; - const float *a_offset = a_sram + taskId * data_per_core; - const float *b_offset = (float*)sram_buffer + taskId * data_per_core; - float *output_offset = (float*)sram_buffer + taskId * data_per_core; + int32_t data_per_core = data_num / ipu_per_cluster; + int32_t data_last_core = data_per_core + data_num % ipu_per_cluster; + const float *a_offset = a_sram + id * data_per_core; + const float *b_offset = (float*)sram_buffer + id * data_per_core; + float *output_offset = (float*)sram_buffer + id * data_per_core; - if (taskId == taskDim - 1) { + if (id == ipu_per_cluster - 1) { data_per_core = data_last_core; } int32_t align_num = NFU_ALIGN_SIZE / sizeof(float); - +// int32_t data_nram_num = +// MAX_NRAM_SIZE / sizeof(float) / 2 / align_num * align_num; int32_t data_nram_num = MAX_NRAM_SIZE / sizeof(float) / 2 / align_num * align_num; float *a_nram = (float *)nram_buffer; @@ -552,9 +1808,17 @@ void add_c(float beta, float *d_c, float* src,int ldc, int ldsrc, int m, int n) } __sync_cluster(); - if (taskId == 0) { + if (id == 0) { __memcpy(d_c,sram_buffer,n*sizeof(float),SRAM2GDRAM,ldc*sizeof(float),n*sizeof(float),m-1); - + // printf("d_c after add:\n"); + // for(int i = 0; i < m; i++) + // { + // for(int j = 0; j < n; j++) + // { + // printf("%.3f ",d_c[i*ldc+j]); + // } + // printf("\n"); + // } } __sync_cluster(); @@ -562,19 +1826,21 @@ void add_c(float beta, float *d_c, float* src,int ldc, int ldsrc, int m, int n) } -mluOpStatus_t sgemm(bool trans_a, bool trans_b, int m, int n, int k, float alpha, float beta, float* d_a,int lda, float* d_b, int ldb, float* d_c, int ldc, mluOpHandle_t handle) +mluOpStatus_t sgemm(int batch, bool trans_a, bool trans_b, int m, int n, int k, float alpha, float beta, float* d_a,int lda, int stride_a, float* d_b, int ldb, int stride_b, float* d_c, int ldc, int stride_c, mluOpHandle_t handle) { if(k==0) return MLUOP_STATUS_SUCCESS; int matmul_is_transA = trans_a; int matmul_is_transB = trans_b; - + // float matmul_alpha = alpha; + // float matmul_beta = beta; int matmul_requested_algo = 1; int matmul_recieved_algo = 0; size_t tempSize_matmulExtra = 0; int matmul_computetype = MLUOP_DTYPE_FLOAT; float *workspace; int matmul_use_beta = beta == 0.0f ? 
0 : 1; + // lda = lda * sizeof(float); cnrtQueue_t queue; mluOpGetQueue(handle,&queue); @@ -613,9 +1879,15 @@ mluOpStatus_t sgemm(bool trans_a, bool trans_b, int m, int n, int k, float alpha CALL_CNNL(cnnlSetMatMulDescAttr(matmul_desc, CNNL_MATMUL_USE_STRIDE, &lda, sizeof(int32_t))); - int32_t matmul_a_shape[2] = {m, lda}; - int32_t matmul_b_shape[2] = {n, ldb}; - int32_t matmul_c_shape[2] = {m, n}; + // int32_t matmul_a_shape[2] = {m, k}; + // int32_t matmul_b_shape[2] = {n, k}; + // int32_t matmul_a_shape[3] = {batch, 16, lda}; + // int32_t matmul_b_shape[3] = {batch, 16, ldb}; + // int32_t matmul_c_shape[3] = {batch, 16, n}; + + int32_t matmul_a_shape[2] = {batch, stride_a}; + int32_t matmul_b_shape[2] = {batch, stride_b}; + int32_t matmul_c_shape[2] = {batch, m*n}; CHECK_RETURN(api_name, mluOpSetTensorDescriptor( matmul_a_desc, MLUOP_LAYOUT_ARRAY, @@ -627,6 +1899,24 @@ mluOpStatus_t sgemm(bool trans_a, bool trans_b, int m, int n, int k, float alpha matmul_c_desc, MLUOP_LAYOUT_ARRAY, MLUOP_DTYPE_FLOAT, 2, matmul_c_shape)); + + // matmul_a_desc->strides[0] = lda; + // matmul_a_desc->strides[1] = 1; + + + // matmul_b_desc->strides[0] = ldb; + // matmul_b_desc->strides[1] = 1; + // matmul_c_desc->strides[0] = ldc; + // matmul_c_desc->strides[1] = 1; + // matmul_a_desc->dims[0] = m; + // matmul_a_desc->dims[1] = k; + // matmul_b_desc->dims[0] = n; + // matmul_b_desc->dims[1] = k; + // matmul_c_desc->dims[0] = m; + // matmul_c_desc->dims[1] = n; + + + DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_a_desc, cnnl_a_desc); DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_b_desc, cnnl_b_desc); @@ -643,38 +1933,291 @@ mluOpStatus_t sgemm(bool trans_a, bool trans_b, int m, int n, int k, float alpha CALL_CNNL(cnnlGetMatMulHeuristicResult(heuristic_result, matmul_algo, &tempSize_matmulExtra)); + + + // printf("m = %d, n = %d, k = %d\n",m,n,k); + // printf("alpha:%.3f, beta:%.3f\n",alpha,beta); + // float* h_a = (float*)malloc(m*k*sizeof(float)); + // float* h_b = (float*)malloc(n*k*sizeof(float)); + // float* h_c = (float*)malloc(m*n*sizeof(float)); + // cnrtMemcpy(h_a, d_a, sizeof(float)*m*k, CNRT_MEM_TRANS_DIR_DEV2HOST); + // cnrtMemcpyAsync(h_b, d_b, sizeof(float)*n*k, queue, CNRT_MEM_TRANS_DIR_DEV2HOST); + // cnrtMemcpyAsync(h_c, d_c, sizeof(float)*m*n, queue, CNRT_MEM_TRANS_DIR_DEV2HOST); + // cnrtQueueSync(queue); + + + + // printf("before matmul, a:\n"); + + // for(int i = 0; i < m;i++) + // { + // for(int j = 0; j >>(beta,d_c,workspace,ldc,n,m,n)); + int nram_space = 2 * m * n * sizeof(float); + if(batch > 1 && nram_space < MAX_NRAM_SIZE) + { + dim.x = batch; + KERNEL_CHECK(add_c_batch<<>>(batch, stride_c, beta,d_c,workspace,ldc,n,m,n)); + } + else + { + int carry_batch = batch; + if(batch == 1) + { + func_type = CNRT_FUNC_TYPE_UNION1; + } + else if(batch == 2) + { + func_type = CNRT_FUNC_TYPE_UNION2; + } + else if(batch <= 4) + { + func_type = CNRT_FUNC_TYPE_UNION4; + carry_batch = 4; + } + else + { + func_type = CNRT_FUNC_TYPE_UNION8; + carry_batch = batch < 8 ? 
8 : batch; + } + dim.x = carry_batch * 4; + + KERNEL_CHECK(add_c<<>>(batch, stride_c, beta,d_c,workspace,ldc,n,m,n)); + } + + } + + + + // cnnlMatMul_v2( + // cnnl_handle, matmul_desc, matmul_algo, &matmul_alpha, cnnl_a_desc, + // d_a, cnnl_b_desc, d_b, &matmul_beta, + // cnnl_c_desc, d_c, workspace, + // tempSize_matmulExtra, cnnl_d_desc, d_c); + + // cnrtMemcpy(h_a, d_a, sizeof(float)*m*k, CNRT_MEM_TRANS_DIR_DEV2HOST); + // cnrtMemcpy(h_b, d_b, sizeof(float)*n*k, CNRT_MEM_TRANS_DIR_DEV2HOST); + // cnrtMemcpy(h_c, d_c, sizeof(float)*m*n, CNRT_MEM_TRANS_DIR_DEV2HOST); + // cnrtQueueSync(queue); + + // printf("after matmul, a:\n"); + + // for(int i = 0; i < m;i++) + // { + // for(int j = 0; j = batch) + return; + + float* orign_input = d_input; + float* orign_output = d_output; + d_input = orign_input + batch_id * stride_input; + d_output = orign_output + batch_id * stride_output; + + // __mlu_shared__ uint8_t sram_buffer[MAX_SRAM_SIZE]; + + float* nram_offset = (float*)nram_buffer; + float* nram_src0 = nram_offset; + //nram_src1存放列主序的计算完成的矩阵 + float* nram_src1 = nram_src0 + m * m; + float* nram_src2 = nram_src1 + m * m; + float* mul_result = nram_src2 + m; + float* nram_dst = nram_src2 + m * m; + float* diag_start = nram_dst; + int height = m, span = m; + + __memset_nram(nram_offset, 4 * m * m, (float)ZERO); + + //void __memcpy(void *dst, const void *src, unsigned int size, mluMemcpyDirection_t dir, int dst_stride0, unsigned int dst_segnum1, int dst_stride1, unsigned int dst_segnum2, int src_stride0, unsigned int src_segnum1, int src_stride1, unsigned int src_segnum2) + // __memcpy(sram_buffer,d_input,m*m*sizeof(float),GDRAM2SRAM); + __memcpy(nram_dst,d_input,m*sizeof(float),GDRAM2NRAM,m*sizeof(float),ld_input*sizeof(float),m-1); + float result = 0.0; + for(int i = 0; i < m; i++) + { + int off = i * m + i; + result = nram_dst[off]; + result = 1.0 / result; + nram_src1[i*height+i] = result; + nram_dst[i*span + i] = result; + diag_start[off] = result; + } + + for(int i = 1; i < height; i++) + { + __memcpy(nram_src2,diag_start+i*m,i*sizeof(float),NRAM2NRAM); + int num = std::min(i, span); + float diag_element = diag_start[i*m+i]; + for(int j = 0; j < num; j++) + { + float temp = 0.0; + __bang_mul(mul_result,nram_src2,nram_src1+j*height,i); + for(int k = 0; k< i; k++) + { + temp += mul_result[k]; + } + temp = temp * -1.0 * diag_element; + nram_dst[i*span+j] = temp; + nram_src1[j*height+i] = temp; + } + __sync(); + + } + + __memcpy(d_output,nram_dst,m*sizeof(float),NRAM2GDRAM,ld_output*sizeof(float), m*sizeof(float),m-1); + + +} + +__mlu_global__ +void inverse_kernel(int batch, float *d_input, int ld_input, int stride_input, float* d_output, int ld_output, int stride_output, int m) +{ + int id = taskId; + int batch_id = id / 4; + if(batch_id >= batch) + return; + id = taskId % 4; + float* orignInput = d_input; + float* orignOutput = d_output; + d_input = orignInput + batch_id * stride_input; + d_output = orignOutput + batch_id * stride_output; + // __nram__ uint8_t nram_buffer[MAX_NRAM_SIZE]; __mlu_shared__ uint8_t sram_buffer[MAX_SRAM_SIZE]; - if (taskId == 0) { + if (id == 0) { + // __memcpy(sram_buffer,d_input,m*m*sizeof(float),GDRAM2SRAM); __memcpy(sram_buffer,d_input,m*sizeof(float),GDRAM2SRAM,m*sizeof(float),ld_input*sizeof(float),m-1); } __sync_cluster(); - int id = taskId; + int span = m/taskDim; int start = id * span; if (id == 3) @@ -682,6 +2225,7 @@ void inverse_kernel(float *d_input, int ld_input, float* d_output, int ld_output span = m - 3 * span; } float* nram_offset = 
(float*)nram_buffer + id * 3 * m * m; + //nram_src1存放列主序的计算完成的矩阵 float* nram_src1 = nram_offset; float* nram_src2 = nram_src1 + m * m; float* mul_result = nram_src2 + m; @@ -713,6 +2257,21 @@ void inverse_kernel(float *d_input, int ld_input, float* d_output, int ld_output for(int j = 0; j < num; j++) { float temp = 0.0; + // if(id == 0 && i == 3) + // { + // printf("nram_src2:\n"); + // for(int k = 0; k < i; k++) + // { + // printf("%.3f ",nram_src2[k]); + // } + // printf("\n"); + // printf("nrma_src1:\n"); + // for(int k = 0; k < i; k++) + // { + // printf("%.3f ",nram_src1[j*height+k]); + // } + // printf("diag_element:%.3f\n",diag_element); + // } __bang_mul(mul_result,nram_src2,nram_src1+j*height,i); for(int k = 0; k< i; k++) { @@ -733,17 +2292,37 @@ void inverse_kernel(float *d_input, int ld_input, float* d_output, int ld_output __sync_cluster(); - if (taskId == 0) { + if (id == 0) { // __memcpy(d_input,sram_buffer,m*m*sizeof(float),SRAM2GDRAM); __memcpy(d_output,sram_buffer,m*sizeof(float),SRAM2GDRAM,ld_output*sizeof(float), m*sizeof(float),m-1); } + // if(id == 0) + // { + // //printf nram_dst + // printf("last diag_start:\n"); + // for(int i = 0; i < m; i++) + // { + // for(int j = 0; j < m; j++) + // { + // printf("%.3f ",diag_start[i*m+j]); + // } + // printf("\n"); + // } + // } + } -__mlu_global__ void set_zero(bool upper, int m, float* d_c, int lddc) +__mlu_global__ void set_zero(int batch, int stride, bool upper, int m, float* d_c, int lddc) { int id = taskId; + int batch_id = id / 4; + if(batch_id >= batch) + return; + float* orignC = d_c; + d_c = orignC + batch_id * stride; + id = taskId % 4; int span = m/taskDim; int pre = id * span; float* start_c = d_c + pre * lddc + pre; @@ -768,9 +2347,36 @@ __mlu_global__ void set_zero(bool upper, int m, float* d_c, int lddc) } } +// mluOpStatus_t batch_inverse(int batch, float *d_input, int ld_input, int stride_input, float* d_output, int ld_output, int stride_output, int m, mluOpHandle_t handle) +// { +// if(m==0) +// return MLUOP_STATUS_SUCCESS; +// mluOpTensorDescriptor_t input_desc, output_desc, info_desc; +// std::string api_name = "Cholesky"; + +// cnrtQueue_t queue; +// mluOpGetQueue(handle,&queue); + +// int32_t *info; +// CNRT_CHECK(cnrtMalloc((void **)&info, batch*sizeof(int32_t)); + +// CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&input_desc)); +// CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&output_desc)); +// CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&info_desc)); + +// int32_t input_shape[3] = {batch, m*m}; +// int32_t output_shape[3] = {batch, m*m}; +// int32_t info_shape[1] = {batch}; +// float* workspace; +// CNRT_CHECK(cnrtMalloc((void **)&workspace, batch*m*m*sizeof(float))); -mluOpStatus_t strsm(bool upper, bool trans, int m, int n, float* d_a, int lda, float* d_b, int ldb, mluOpHandle_t handle) +// __memcpy(sram_buffer,d_input,m*sizeof(float),GDRAM2SRAM,m*sizeof(float),m-1,stride_input * sizeof(float),batch_num-1,ld_input*sizeof(float),m-1,m*m*sizeof(float),batch_num-1); + +// } + + +mluOpStatus_t strsm(int batch, int stride, bool upper, bool trans, int m, int n, float* d_a, int lda, float* d_b, int ldb, mluOpHandle_t handle) { if(n==0) return MLUOP_STATUS_SUCCESS; @@ -781,13 +2387,13 @@ mluOpStatus_t strsm(bool upper, bool trans, int m, int n, float* d_a, int lda, f mluOpGetQueue(handle,&queue); int32_t *info; - CNRT_CHECK(cnrtMalloc((void **)&info, sizeof(int32_t))); + CNRT_CHECK(cnrtMalloc((void **)&info, batch*sizeof(int32_t))); CHECK_RETURN(api_name, 
mluOpCreateTensorDescriptor(&matmul_a_desc)); CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&matmul_b_desc)); CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&info_desc)); - int32_t matmul_a_shape[2] = {m, m}; - int32_t matmul_b_shape[2] = {n, ldb}; + int32_t matmul_a_shape[2] = {batch, m*m}; + int32_t matmul_b_shape[2] = {batch, stride}; int32_t info_shape[1] = {1}; CHECK_RETURN(api_name, mluOpSetTensorDescriptor( @@ -805,10 +2411,14 @@ mluOpStatus_t strsm(bool upper, bool trans, int m, int n, float* d_a, int lda, f DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_b_desc, cnnl_b_desc); DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(info_desc, cnnl_info_desc); + //cnnlStatus_t cnnlInverse(cnnlHandle_t handle, const cnnlTensorDescriptor_t input_desc, const void *input, const bool is_trans, const cnnlTensorDescriptor_t output_desc, void *output, const cnnlTensorDescriptor_t infos_desc, void *infos) + // cnnlInverse(cnnl_handle, cnnl_a_desc,work_space,false,cnnl_a_desc,work_space,cnnl_info_desc,info); float* workspace; - CNRT_CHECK(cnrtMalloc((void **)&workspace, m*m*sizeof(float))); - CNRT_CHECK(cnrtMemset(workspace, 0.0, m*m*sizeof(float))); + CNRT_CHECK(cnrtMalloc((void **)&workspace, batch*m*m*sizeof(float))); + CNRT_CHECK(cnrtMemset(workspace, 0.0, batch*m*m*sizeof(float))); + float* h_i; + h_i = (float*)malloc(m*m*sizeof(float)); int m1 = m/2; @@ -818,47 +2428,186 @@ mluOpStatus_t strsm(bool upper, bool trans, int m, int n, float* d_a, int lda, f float* workspace2 = workspace1 + m1*m+m1; cnrtDim3_t dim; - cnrtFunctionType_t func_type = CNRT_FUNC_TYPE_UNION1; - dim.x = 4; dim.y = 1; dim.z = 1; - KERNEL_CHECK(inverse_kernel<<>>(d_a,lda,workspace1,m,m1)); - KERNEL_CHECK(inverse_kernel<<>>(d_a+m1*lda+m1,lda,workspace2,m,m2)); + cnrtFunctionType_t func_type = CNRT_FUNC_TYPE_BLOCK; + if(batch > 1) + { + dim.x = batch; + KERNEL_CHECK(batch_inverse_kernel<<>>(batch, d_a,lda,stride, workspace1,m,m*m,m1)); + KERNEL_CHECK(batch_inverse_kernel<<>>(batch, d_a+m1*lda+m1,lda,stride, workspace2,m,m*m,m2)); + } + else + { + int carry_batch = batch; + if(batch == 1) + { + func_type = CNRT_FUNC_TYPE_UNION1; + } + else if(batch == 2) + { + func_type = CNRT_FUNC_TYPE_UNION2; + } + else if(batch <= 4) + { + func_type = CNRT_FUNC_TYPE_UNION4; + carry_batch = 4; + } + else + { + func_type = CNRT_FUNC_TYPE_UNION8; + carry_batch = batch < 8 ? 
8 : batch; + } + dim.x = carry_batch * 4; - sgemm(false,false,m2,m1,m1,1.0f,0.0f,d_a+m1*lda,lda,workspace1,m,workspace1+m1*m,m,handle); - sgemm(false,false,m2,m2,m1,-1.0f,0.0f,workspace2,m,workspace1+m1*m,m,workspace1+m1*m,m,handle); + KERNEL_CHECK(inverse_kernel<<>>(batch, d_a,lda,stride, workspace1,m,m*m,m1)); + KERNEL_CHECK(inverse_kernel<<>>(batch, d_a+m1*lda+m1,lda,stride, workspace2,m,m*m,m2)); + + } + + + sgemm(batch, false,false,m2,m1,m1,1.0f,0.0f,d_a+m1*lda,lda,stride,workspace1,m,m*m,workspace1+m1*m,m,m*m,handle); + sgemm(batch, false,false,m2,m2,m1,-1.0f,0.0f,workspace2,m,m*m,workspace1+m1*m,m,m*m,workspace1+m1*m,m,m*m,handle); + cnrtQueueSync(queue); + + // cnrtMemcpy(h_i, workspace, m*m*sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); + // print + // printf("batch 0 whole inverse:\n"); + // for(int i = 0; i < m; i++) + // { + // for(int j = 0; j < m; j++) + // { + // printf("%.3f ",h_i[i*m+j]); + // } + // printf("\n"); + // } + // cnrtMemcpy(h_i, workspace+m*m, m*m*sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); + // // print + // printf("batch 1 whole inverse:\n"); + // for(int i = 0; i < m; i++) + // { + // for(int j = 0; j < m; j++) + // { + // printf("%.3f ",h_i[i*m+j]); + // } + // printf("\n"); + // } - cnnlStrideBatchMatMul(cnnl_handle, false, true, n,m, m, 1, 1.0, cnnl_b_desc, d_b, ldb, n*ldb, cnnl_a_desc, workspace, m, m*m, 0.0f, cnnl_b_desc, d_b, ldb, n*ldb); + + // cnrtMemcpy(h_i, work_space, m*m*sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); + // //print h_i + // printf("h_i:\n"); + // for(int i = 0; i < m; i++) + // { + // for(int j = 0; j < m; j++) + // { + // printf("%.3f ",h_i[i*m+j]); + // } + // printf("\n"); + // } + // float *h_i; + // h_i = (float*)malloc(m*n*sizeof(float)); + // cnrtQueueSync(queue); + + // for(int i = 0; i < n; i++) + // { + // cnrtMemcpy(h_i+i*m, d_b+i*ldb, m*sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); + + // } + // cnrtQueueSync(queue); + // printf("before strsm, b:\n"); + // for(int i = 0; i < n; i++) + // { + // for(int l = 0; l < m; l++) + // { + // printf("%.3f ",h_i[i*m+l]); + // } + // printf("\n"); + // } + //cnnlStrideBatchMatMul(cnnlHandle_t handle, const bool is_transa, const bool is_transb, const int m, const int n, const int k, const int batch_size, const float alpha, + //const cnnlTensorDescriptor_t a_desc, const void *a, const int lda, const int64_t stride_a, const cnnlTensorDescriptor_t b_desc, const void *b, const int ldb, const int64_t stride_b, const float beta, constcnnlTensorDescriptor_t c_desc, void *c, const int ldc, const int64_t stride_c) + // cnnlStrideBatchMatMul(cnnl_handle, false, true, m, n, m, 1, 1.0, cnnl_a_desc, work_space, m, m*NB, cnnl_b_desc, d_b, ldb, ldb*n, 0.0f, cnnl_b_desc, d_b, ldb, ldb*n); + cnnlStrideBatchMatMul(cnnl_handle, false, true, n,m, m, batch, 1.0, cnnl_b_desc, d_b, ldb, stride, cnnl_a_desc, workspace, m, m*m, 0.0f, cnnl_b_desc, d_b, ldb, stride); + // cnrtQueueSync(queue); + + // for(int i = 0; i < n; i++) + // { + // cnrtMemcpy(h_i+i*m, d_b+i*ldb, m*sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); + + // } + // cnrtQueueSync(queue); + // printf("after strsm, b:\n"); + // for(int i = 0; i < n; i++) + // { + // for(int l = 0; l < m; l++) + // { + // printf("%.3f ",h_i[i*m+l]); + // } + // printf("\n"); + // } + + return MLUOP_STATUS_SUCCESS; } +mluOpStatus_t set_half_zero(int batch,int stride,float* d_a, int lda, int m, mluOpHandle_t handle) +{ + cnrtQueue_t queue; + mluOpGetQueue(handle,&queue); + cnrtDim3_t dim; + cnrtFunctionType_t func_type = CNRT_FUNC_TYPE_UNION1; + dim.x = 4; + dim.y = 1; + 
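// strsm above avoids a true triangular solve: it inverts the two diagonal
// blocks of the triangular matrix with the inverse kernels, assembles the
// remaining block of the inverse with two GEMMs, and applies the result with
// one batched matmul (B times the transposed inverse in this row-major view).
// The block identity it relies on, for the lower-triangular case:
//   A = | A11   0  |      inv(A) = |  inv(A11)                    0       |
//       | A21  A22 |               | -inv(A22) * A21 * inv(A11)  inv(A22)  |
// A dense host-side sketch of the off-diagonal assembly (row-major,
// illustrative names, reference only):
void assemble_x21_ref(const float* A21, const float* A11inv, const float* A22inv,
                      float* tmp, float* X21, int m1, int m2) {
  for (int i = 0; i < m2; ++i)             // tmp = A21 * inv(A11), m2 x m1
    for (int j = 0; j < m1; ++j) {
      float s = 0.0f;
      for (int k = 0; k < m1; ++k) s += A21[i * m1 + k] * A11inv[k * m1 + j];
      tmp[i * m1 + j] = s;
    }
  for (int i = 0; i < m2; ++i)             // X21 = -inv(A22) * tmp, m2 x m1
    for (int j = 0; j < m1; ++j) {
      float s = 0.0f;
      for (int k = 0; k < m2; ++k) s += A22inv[i * m2 + k] * tmp[k * m1 + j];
      X21[i * m1 + j] = -s;
    }
}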
dim.z = 1; + KERNEL_CHECK(set_zero<<>>(batch, stride, false, m, d_a,lda)); + return MLUOP_STATUS_SUCCESS; +} -mluOpStatus_t ssyrk(bool upper, bool trans,int n, int k, float* d_a, int ldda, float* d_c, int lddc, mluOpHandle_t handle) +mluOpStatus_t ssyrk(int batch, int stride, bool upper, bool trans,int n, int k, float* d_a, int ldda, float* d_c, int lddc, mluOpHandle_t handle) { if(k==0) return MLUOP_STATUS_SUCCESS; - sgemm(false,true,n,n,k,-1.0f,1.0f,d_a,ldda,d_a,ldda,d_c,lddc,handle); + sgemm(batch, false,true,n,n,k,-1.0f,1.0f,d_a,ldda,stride,d_a,ldda,stride,d_c,lddc,stride,handle); cnrtQueue_t queue; mluOpGetQueue(handle,&queue); cnrtDim3_t dim; cnrtFunctionType_t func_type = CNRT_FUNC_TYPE_UNION1; - dim.x = 4; + int carry_batch = batch; + if(batch == 1) + { + func_type = CNRT_FUNC_TYPE_UNION1; + } + else if(batch == 2) + { + func_type = CNRT_FUNC_TYPE_UNION2; + } + else if(batch <= 4) + { + func_type = CNRT_FUNC_TYPE_UNION4; + carry_batch = 4; + } + else + { + func_type = CNRT_FUNC_TYPE_UNION8; + carry_batch = batch < 8 ? 8 : batch; + } + dim.x = carry_batch * 4; dim.y = 1; dim.z = 1; - KERNEL_CHECK(set_zero<<>>(upper, n, d_c,lddc)); + KERNEL_CHECK(set_zero<<>>(batch, stride, upper, n, d_c,lddc)); return MLUOP_STATUS_SUCCESS; } -mluOpStatus_t mlu_spotrf_rectile(bool trans, bool uplo, int n, int recnb, float* d_A, int lda, int gbstep, mluOpHandle_t handle) +mluOpStatus_t mlu_spotrf_rectile(int batch, int stride, bool trans, bool uplo, int n, int recnb, float* d_A, int lda, int gbstep, mluOpHandle_t handle) { cnrtQueue_t queue; mluOpGetQueue(handle,&queue); @@ -868,24 +2617,50 @@ mluOpStatus_t mlu_spotrf_rectile(bool trans, bool uplo, int n, int recnb, float* if(n <=recnb) { // printf("n:%d, recnb:%d, mlu_spotf2_lpin\n",n,recnb); - mlu_spotf2_lpin(trans, uplo,n,lda,d_A,gbstep,queue); + mlu_spotf2_lpin(batch, stride, trans, uplo,n,lda,d_A,gbstep,queue); } else { int n1 = n/2; int n2 = n-n1; - mlu_spotrf_rectile(trans,uplo,n1,recnb,OFFSET_ROW(d_A,0,0),lda,gbstep, handle); - strsm_rectile(uplo,trans,n1,n2,OFFSET_ROW(d_A,0,0),lda,OFFSET_ROW(d_A,n1,0),lda,queue); - ssyrk(uplo,trans,n2,n1,d_A+n1*lda,lda,OFFSET_ROW(d_A,n1,n1),lda,handle); - mlu_spotrf_rectile(trans,uplo,n2,recnb,OFFSET_ROW(d_A,n1,n1),lda,gbstep+n1,handle); + // printf("n1:%d, n2:%d recnb:%d,mlu_spotrf_rectile1\n",n1,n2,recnb); + mlu_spotrf_rectile(batch,stride,trans,uplo,n1,recnb,OFFSET_ROW(d_A,0,0),lda,gbstep, handle); + // printf("n1:%d, n2:%d recnb:%d,strsm_rectile\n",n1,n2,recnb); + // strsm(batch, stride, uplo,trans,n1, n2, OFFSET_ROW(d_A,0,0), lda,OFFSET_ROW(d_A,n1,0), lda, handle); + strsm_rectile(batch, stride, uplo,trans,n1,n2,OFFSET_ROW(d_A,0,0),lda,OFFSET_ROW(d_A,n1,0),lda,queue); + // printf("n1:%d, n2:%d recnb:%d,ssyrk\n",n1,n2,recnb); + ssyrk(batch,stride,uplo,trans,n2,n1,d_A+n1*lda,lda,OFFSET_ROW(d_A,n1,n1),lda,handle); + // printf("n1:%d, n2:%d recnb:%d,mlu_spotrf_rectile2\n",n1,n2,recnb); + mlu_spotrf_rectile(batch,stride,trans,uplo,n2,recnb,OFFSET_ROW(d_A,n1,n1),lda,gbstep+n1,handle); - + //printf d_A+n1*lda+n1 + // printf("after calculate, dA+n1*lda+n1:\n"); + // for(int i = 0; i < n2; i++) + // { + // for(int j = 0; j < n2; j++) + // { + // printf("%.3f ",*(d_A+n1*lda+n1+i*lda+j)); + // } + // printf("\n"); + // } + // //printf work_space + n1 * NB+n1 + // printf("after calculate, work_space + n1 * NB+n1:\n"); + // for(int i = 0; i < n2; i++) + // { + // for(int j = 0; j < n2; j++) + // { + // printf("%.3f ",*(work_space + n1 * NB+n1+i*NB+j)); + // } + // printf("\n"); + // } } + // strsm(false,true,n, n, 
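// mlu_spotrf_rectile above is the usual recursive blocked Cholesky: factor
// the leading n1 x n1 block, triangular-solve the panel below it, apply a
// symmetric rank-n1 update to the trailing block, then recurse on it. The
// arithmetic it bottoms out in is plain unblocked Cholesky; a host-side
// reference, assuming row-major storage and a lower factor (not the MLU code):
#include <cmath>

void potrf_lower_ref(float* A, int lda, int n) {
  for (int j = 0; j < n; ++j) {
    float d = A[j * lda + j];
    for (int k = 0; k < j; ++k) d -= A[j * lda + k] * A[j * lda + k];
    d = std::sqrt(d);                      // assumes a positive-definite input
    A[j * lda + j] = d;
    for (int i = j + 1; i < n; ++i) {
      float s = A[i * lda + j];
      for (int k = 0; k < j; ++k) s -= A[i * lda + k] * A[j * lda + k];
      A[i * lda + j] = s / d;
    }
  }
}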
work_space, n,d_A, n, work_space, handle); return MLUOP_STATUS_SUCCESS; } -mluOpStatus_t transpose(int m, float* d_input,float* d_output, mluOpHandle_t handle) +// m * n +mluOpStatus_t transpose(int batch, int m, int n, float* d_input,float* d_output, mluOpHandle_t handle) { if(m==0) return MLUOP_STATUS_SUCCESS; @@ -894,23 +2669,23 @@ mluOpStatus_t transpose(int m, float* d_input,float* d_output, mluOpHandle_t han mluOpTensorDescriptor_t trans_input_desc, trans_output_desc; std::string api_name = "Cholesky"; - const int input_dim = 2; + const int input_dim = 3; CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&trans_input_desc)); CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&trans_output_desc)); - int32_t transpose_input_shape[2] = {m, m}; - int32_t transpose_output_shape[2] = {m, m}; + int32_t transpose_input_shape[3] = {batch, m, n}; + int32_t transpose_output_shape[3] = {batch, n, m}; CHECK_RETURN(api_name, mluOpSetTensorDescriptor( trans_input_desc, MLUOP_LAYOUT_ARRAY, - MLUOP_DTYPE_FLOAT, 2, transpose_input_shape)); + MLUOP_DTYPE_FLOAT, 3, transpose_input_shape)); CHECK_RETURN(api_name, mluOpSetTensorDescriptor( trans_output_desc, MLUOP_LAYOUT_ARRAY, - MLUOP_DTYPE_FLOAT, 2, transpose_output_shape)); + MLUOP_DTYPE_FLOAT, 3, transpose_output_shape)); - int permute[2] = {1, 0}; + int permute[3] = {0, 2, 1}; DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(trans_input_desc, cnnl_in_desc); @@ -927,9 +2702,17 @@ mluOpStatus_t transpose(int m, float* d_input,float* d_output, mluOpHandle_t han CALL_CNNL(cnnlGetTransposeWorkspaceSize(cnnl_handle, cnnl_in_desc, cnnl_trans_desc, size)); + // printf("transpose1 need size: %zu\n",*size); float *workspace = NULL; + if(*size > 0ul) + { + printf("start malloc\n"); + CNRT_CHECK(cnrtMalloc((void **)&workspace, *size)); + printf("transpose2 need size: %zu\n",*size); + } + CALL_CNNL(cnnlTranspose_v2(cnnl_handle, cnnl_trans_desc, cnnl_in_desc, d_input, cnnl_out_desc, d_output, workspace, *size)); diff --git a/kernels/cholesky/complex_cholesky_union1.mlu b/kernels/cholesky/complex_cholesky_union1.mlu new file mode 100644 index 000000000..73062f98b --- /dev/null +++ b/kernels/cholesky/complex_cholesky_union1.mlu @@ -0,0 +1,1089 @@ +#include "cholesky.h" +#define COMPLEX_OFFSET(A,off) (((float*)A) + (2 * (off))) +#define COMPLEX_TYPE_SIZE ((2) * sizeof(float)) +__nram__ uint8_t nram_buffer[MAX_NRAM_SIZE]; + +// __mlu_func__ +// void small_cgemm(int m,int k, +// float* A0, const int lda, +// int width, float* sram_buffer, float* dst) +// { +// int id = taskId % 4; +// int span = CPOTF_NB; +// int finish = id * span; +// int remain = m - finish; +// bool if_execute = remain > 0; +// span = (remain > CPOTF_NB||remain <= 0) ? 
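// Complex data appears in two layouts in this file: the small_cgemm kept in
// comments walks interleaved (re, im) pairs through COMPLEX_OFFSET, while the
// live kernels keep two separate float planes (drA/diA, rd_*/id_* pairs) that
// share one leading dimension. A small sketch of the planar layout and the
// conversion from interleaved storage (illustrative names, reference only):
struct PlanarComplex {
  float* re;   // real plane, m x n
  float* im;   // imaginary plane, m x n
  int ld;      // leading dimension shared by both planes
};

void split_interleaved(const float* inter, int ld_inter,
                       PlanarComplex out, int m, int n) {
  for (int i = 0; i < m; ++i)
    for (int j = 0; j < n; ++j) {
      out.re[i * out.ld + j] = inter[(i * ld_inter + j) * 2];
      out.im[i * out.ld + j] = inter[(i * ld_inter + j) * 2 + 1];
    }
}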
CPOTF_NB : remain; + + +// float* rC = dst + CPOTF_NB *CREC_NB*2; +// float* rA = rC + CPOTF_NB *CREC_NB*2; +// float* rp = rA + CPOTF_NB *CREC_NB*2; +// float* rB = rp + CPOTF_NB *CREC_NB*2; + +// float *sB = sram_buffer; + +// int total_length = k + width; +// int loop_width = CPOTF_NB; +// int b_height = std::min(width, CPOTF_NB); + + +// if(if_execute) +// { +// int prefetch_width = std::min(loop_width, total_length); +// __memcpy(rp,COMPLEX_OFFSET(A0,finish*lda),prefetch_width*COMPLEX_TYPE_SIZE,GDRAM2NRAM,CPOTF_NB*COMPLEX_TYPE_SIZE,lda*COMPLEX_TYPE_SIZE,span-1); +// } +// __memset_nram(rC,CPOTF_NB*CREC_NB*2,(float)ZERO); +// __sync_cluster(); +// if(id == 0) +// { +// __memcpy(sB,rp,CPOTF_NB*CPOTF_NB*COMPLEX_TYPE_SIZE,NRAM2SRAM); +// } +// __sync_cluster(); +// float a1,a2,b1,b2; +// for(int iter = 0; iter < k; iter += loop_width) +// { +// __bang_move(rA,rp,CPOTF_NB * span*COMPLEX_TYPE_SIZE); +// __memcpy(rB,sB,CPOTF_NB*b_height*COMPLEX_TYPE_SIZE,SRAM2NRAM); +// __sync_cluster(); +// if(if_execute) +// { +// int prefetch_width = std::min(loop_width, total_length-iter-loop_width); +// __memcpy_async(rp,COMPLEX_OFFSET(A0,finish*lda+iter+loop_width),prefetch_width*COMPLEX_TYPE_SIZE,GDRAM2NRAM,CPOTF_NB*COMPLEX_TYPE_SIZE,lda*COMPLEX_TYPE_SIZE,span-1); +// } +// for(int i = 0; i < span; i++) +// { +// for(int j = 0; j < b_height; j++) +// { +// for(int h = 0; h < loop_width; h++) +// { +// a1 = rA[(i*CPOTF_NB+h)*2]; +// b1 = rA[(i*CPOTF_NB+h)*2+1]; +// a2 = rB[(j*CPOTF_NB+h)*2]; +// b2 = rB[(j*CPOTF_NB+h)*2+1]; +// rC[(i*CPOTF_NB+j)*2] += (a1*a2+b1*b2); +// rC[(i*CPOTF_NB+j)*2+1] += (a2*b1-a1*b2); +// } +// } +// } +// __sync_cluster(); +// if(id == 0) +// { +// __memcpy(sB,rp,CPOTF_NB*b_height*COMPLEX_TYPE_SIZE,NRAM2SRAM); +// } +// __sync_cluster(); +// } + +// __bang_sub(rp,rp,rC,CPOTF_NB * span*2); +// if(if_execute) +// { +// __memcpy(dst,rp,span*CPOTF_NB*COMPLEX_TYPE_SIZE,NRAM2NRAM); +// } +// if(id == 0) +// { +// __memcpy(sram_buffer,rp,span*CPOTF_NB*COMPLEX_TYPE_SIZE,NRAM2SRAM); +// } +// __sync_cluster(); +// } + +__mlu_func__ +void small_cgemm(int m,int k, + float* rA0, float* iA0, const int lda, + int width, float* sram_buffer, float* dst) +{ + int id = taskId % 4; + int span = CPOTF_NB; + int finish = id * span; + int remain = m - finish; + bool if_execute = remain > 0; + span = (remain > CPOTF_NB||remain <= 0) ? 
CPOTF_NB : remain; + + + float* rC = dst + CPOTF_NB *CREC_NB; + float* iC = rC + CPOTF_NB *CREC_NB; + float* rA = iC + CPOTF_NB *CREC_NB; + float* iA = rA + CPOTF_NB *CREC_NB; + float* rp = iA + CPOTF_NB *CREC_NB; + float* ip = rp + CPOTF_NB *CREC_NB; + float* rB = ip + CPOTF_NB *CREC_NB; + float* iB = rB + CPOTF_NB *CREC_NB; + + float *srB = sram_buffer; //srB:shared_real_B + float *siB = srB + CPOTF_NB * CREC_NB; //siB:shared_imag_B + + float* rdst = dst; + float* idst = rdst + span*CPOTF_NB; + + int total_length = k + width; + int loop_width = CPOTF_NB; + int b_height = std::min(width, CPOTF_NB); + + + if(if_execute) + { + int prefetch_width = std::min(loop_width, total_length); + __memcpy(rp,(rA0+finish*lda),prefetch_width*sizeof(float),GDRAM2NRAM,CPOTF_NB*sizeof(float),lda*sizeof(float),span-1); + __memcpy(ip,(iA0+finish*lda),prefetch_width*sizeof(float),GDRAM2NRAM,CPOTF_NB*sizeof(float),lda*sizeof(float),span-1); + } + __memset_nram(rC,CPOTF_NB*CREC_NB*2,(float)ZERO); + __sync_cluster(); + if(id == 0) + { + __memcpy(srB,rp,CPOTF_NB*CPOTF_NB*sizeof(float),NRAM2SRAM); + __memcpy(siB,ip,CPOTF_NB*CPOTF_NB*sizeof(float),NRAM2SRAM); + } + __sync_cluster(); + float a1,a2,b1,b2; + for(int iter = 0; iter < k; iter += loop_width) + { + __bang_move(rA,rp,CPOTF_NB * span*sizeof(float)); + __bang_move(iA,ip,CPOTF_NB * span*sizeof(float)); + __memcpy(rB,srB,CPOTF_NB*b_height*sizeof(float),SRAM2NRAM); + __memcpy(iB,siB,CPOTF_NB*b_height*sizeof(float),SRAM2NRAM); + __sync_cluster(); + if(if_execute) + { + int prefetch_width = std::min(loop_width, total_length-iter-loop_width); + __memcpy_async(rp,(rA0+finish*lda+iter+loop_width),prefetch_width*sizeof(float),GDRAM2NRAM,CPOTF_NB*sizeof(float),lda*sizeof(float),span-1); + __memcpy_async(ip,(iA0+finish*lda+iter+loop_width),prefetch_width*sizeof(float),GDRAM2NRAM,CPOTF_NB*sizeof(float),lda*sizeof(float),span-1); + } + for(int i = 0; i < span; i++) + { + for(int j = 0; j < b_height; j++) + { + for(int h = 0; h < loop_width; h++) + { + a1 = rA[(i*CPOTF_NB+h)]; + b1 = iA[(i*CPOTF_NB+h)]; + a2 = rB[(j*CPOTF_NB+h)]; + b2 = iB[(j*CPOTF_NB+h)]; + rC[(i*CPOTF_NB+j)] += (a1*a2+b1*b2); + iC[(i*CPOTF_NB+j)] += (a2*b1-a1*b2); + } + } + } + __sync_cluster(); + if(id == 0) + { + __memcpy(srB,rp,CPOTF_NB*b_height*sizeof(float),NRAM2SRAM); + __memcpy(siB,ip,CPOTF_NB*b_height*sizeof(float),NRAM2SRAM); + } + __sync_cluster(); + } + + __bang_sub(rp,rp,rC,CPOTF_NB * span); + __bang_sub(ip,ip,iC,CPOTF_NB * span); + if(if_execute) + { + __memcpy(rdst,rp,span*CPOTF_NB*sizeof(float),NRAM2NRAM); + __memcpy(idst,ip,span*CPOTF_NB*sizeof(float),NRAM2NRAM); + } + if(id == 0) + { + __memcpy(sram_buffer,rp,span*CPOTF_NB*sizeof(float),NRAM2SRAM); + __memcpy(sram_buffer+span*CPOTF_NB,ip,span*CPOTF_NB*sizeof(float),NRAM2SRAM); + } + __sync_cluster(); +} + +__mlu_func__ +void small_cminout(int m, int width, + float *dst, float *sram_buffer, int lda) +{ + float factor; + int id = taskId; + int finish = id * CPOTF_NB; + int remain = m - finish; + bool if_execute = remain > 0; + int span = (remain > CPOTF_NB||remain <= 0) ? 
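// Per scalar, the inner loop of small_cgemm above accumulates c += a * conj(b)
// with a = a1 + i*b1 and b = a2 + i*b2, split across the real and imaginary
// planes. Written out (a sketch of the identity, not the vectorized kernel):
inline void cmac_conj(float a1, float b1, float a2, float b2,
                      float& c_re, float& c_im) {
  c_re += a1 * a2 + b1 * b2;   // Re(a * conj(b))
  c_im += b1 * a2 - a1 * b2;   // Im(a * conj(b))
}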
CPOTF_NB : remain; + float *rdst = dst; + float *idst = dst + span*CPOTF_NB; + float *rdiag = idst + CPOTF_NB *CREC_NB; + float *idiag = rdiag + CPOTF_NB*CPOTF_NB; + float a1,b1,a2,b2,a3,b3; + if(if_execute) + { + + __memcpy(rdiag,sram_buffer,width*CPOTF_NB*COMPLEX_TYPE_SIZE,SRAM2NRAM); + __memcpy(idiag,sram_buffer+CPOTF_NB*CPOTF_NB,width*CPOTF_NB*COMPLEX_TYPE_SIZE,SRAM2NRAM); + for(int iter = 0; iter < width; iter++) + { + factor = sqrt(rdiag[(iter * CPOTF_NB+iter)]); + factor = 1.0/factor; + for(int i = 0; i 0; + int span = (remain > CPOTF_NB||remain <= 0) ? CPOTF_NB : remain; + __mlu_shared__ uint8_t sram_buffer[MAX_SRAM_SIZE]; + float* dst = (float*)nram_buffer; + small_cgemm(m, localstep, rA0, iA0, lda, width, (float*)sram_buffer, dst); + + __sync_cluster(); + + // if(id == 1) + // { + // printf("id:1, after gemm, before inout,dst:\n"); + // for(int i = 0; i < width; i++) + // { + // for(int j = 0; j < width; j++) + // { + // printf("%.3f,%.3f ",dst[(i*CPOTF_NB+j)*2],dst[(i*CPOTF_NB+j)*2+1]); + // } + // printf("\n"); + // } + // } + + small_cminout(m, width, dst, (float*)sram_buffer, CPOTF_NB); + + __sync_cluster(); + + float *rdst = dst; + float *idst = dst + span*CPOTF_NB; + + if(id == 0) + { + for(int i = 0; i < width; i++) + { + __memcpy((rA+(i*lda)),(rdst+(i*CPOTF_NB)),(i+1)*sizeof(float),NRAM2LDRAM); + __memcpy((iA+(i*lda)),(idst+(i*CPOTF_NB)),(i+1)*sizeof(float),NRAM2LDRAM); + } + // printf("id:0, after gemm and inout,dst:\n"); + // for(int i = 0; i < width; i++) + // { + // for(int j = 0; j < width; j++) + // { + // printf("%8.3f ",rdst[(i*CPOTF_NB+j)]); + // } + // printf("\n"); + // } + // printf("\n"); + // for(int i = 0; i < width; i++) + // { + // for(int j = 0; j < width; j++) + // { + // printf("%8.3f ",idst[(i*CPOTF_NB+j)]); + // } + // printf("\n"); + // } + } + else if(if_execute) + { + __memcpy((rA+(finish*lda)),rdst,width*sizeof(float),NRAM2LDRAM,lda*sizeof(float),CPOTF_NB*sizeof(float),span-1); + __memcpy((iA+(finish*lda)),idst,width*sizeof(float),NRAM2LDRAM,lda*sizeof(float),CPOTF_NB*sizeof(float),span-1); + } + __sync_cluster(); + // if(id == 0) + // { + // printf("end of cmplout:\n"); + // for (int j = 0; j < lda; j++) + // { + // for(int h = 0; h < lda; h++) + // { + // printf("%8.3f ",rA[j*lda+h]); + // } + // printf("\n"); + // } + // printf("\n"); + // for (int j = 0; j < lda; j++) + // { + // for(int h = 0; h < lda; h++) + // { + // printf("%8.3f ",iA[j*lda+h]); + // } + // printf("\n"); + // } + // } + +} + +__mlu_global__ void cpotf_kernel(int m, float *drA, float *diA, int lda) +{ + int width = CPOTF_NB; + int span = width; + for(int i = 0; i < m; i += width) + { + span = std::min(width, m - i); + cmplout(m-i, (drA+i*lda), (drA+i*lda+i), (diA+i*lda), (diA+i*lda+i), lda, i, span); + } +} + +mluOpStatus_t mlu_cpotf_lpin(int batch, int stride, int n, int lda, float* drA, float* diA, cnrtQueue_t queue) +{ + cnrtDim3_t dim; + cnrtFunctionType_t func_type = CNRT_FUNC_TYPE_UNION1; + dim.y = 1; + dim.z = 1; + dim.x = 4; + KERNEL_CHECK(cpotf_kernel<<>>(n, drA,diA, lda)); + return MLUOP_STATUS_SUCCESS; +} + + + +__mlu_global__ +void complex_add_c(int batch, int stride, float beta, float *d_c, float* src,int ldc, int ldsrc, int m, int n) +{ + + int id = taskId; + int ipu_per_cluster = 4; + id = taskId; + + int span = m/4; + int finish = id * span; + if(id == 3) + { + span = m - 3 * span; + } + + + + float* sram_buffer = (float*)nram_buffer; + if (beta == 0.0f) + { + if(id == 0) + { + 
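// The cpotf panel kernels above (small_cgemm and small_cminout driven by
// cmplout) finish each column the way an unblocked Hermitian Cholesky does:
// take 1/sqrt of the real diagonal entry and scale the column under it. A
// host-side column step on split planes, assuming a lower factor, row-major
// storage and a strictly positive diagonal (reference only):
#include <cmath>

void cpotf_column_ref(float* re, float* im, int lda, int n, int j) {
  float d = std::sqrt(re[j * lda + j]);   // diagonal of a Hermitian PD matrix is real
  float inv = 1.0f / d;
  re[j * lda + j] = d;
  im[j * lda + j] = 0.0f;
  for (int i = j + 1; i < n; ++i) {       // scale the rest of column j
    re[i * lda + j] *= inv;
    im[i * lda + j] *= inv;
  }
}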
__memcpy(sram_buffer,src,n*sizeof(float),GDRAM2NRAM,n*sizeof(float),ldsrc*sizeof(float),m-1); + + } + __sync_cluster(); + if(id == 0) + { + __memcpy(d_c,sram_buffer,n*sizeof(float),NRAM2LDRAM,ldc*sizeof(float),n*sizeof(float),m-1); + } + __sync_cluster(); + return; + } + + float* a_sram = (float*)sram_buffer + 3* m * n; + + int d_c_offset = ldc*finish; + int src_offset = ldsrc*finish; + + __memcpy(sram_buffer,d_c+d_c_offset,n*sizeof(float),LDRAM2NRAM,n*sizeof(float),ldc*sizeof(float),span-1); + __memcpy(a_sram,src+src_offset,n*span*sizeof(float),LDRAM2NRAM); + +// __sync_cluster(); + + +// int32_t data_num = m*n; + int32_t data_per_core = span*n; + int32_t data_last_core = data_per_core; + const float *a_offset = a_sram; + const float *b_offset = (float*)sram_buffer; + float *output_offset = (float*)sram_buffer; + + if (id == ipu_per_cluster - 1) { + data_per_core = data_last_core; + } + + + + int32_t align_num = NFU_ALIGN_SIZE / sizeof(float); +// int32_t data_nram_num = +// MAX_NRAM_SIZE / sizeof(float) / 2 / align_num * align_num; + int32_t data_nram_num = + MAX_NRAM_SIZE / sizeof(float) / 2 / align_num * align_num; + float *a_nram = (float *)a_sram + m*n; + float *b_nram = (float *)a_nram + data_nram_num; + int32_t loop_num = data_per_core / data_nram_num; + int32_t rem_nram_num = data_per_core % data_nram_num; + + for (int32_t i = 0; i < loop_num; i++) { + __memcpy(a_nram, a_offset + i * data_nram_num, + data_nram_num * sizeof(float), NRAM2NRAM); + __memcpy(b_nram, b_offset + i * data_nram_num, + data_nram_num * sizeof(float), NRAM2NRAM); + __bang_add(a_nram, a_nram, b_nram, data_nram_num); + __memcpy(output_offset + i * data_nram_num, a_nram, + data_nram_num * sizeof(float), NRAM2NRAM); + } + if (rem_nram_num != 0) { + int32_t rem_align_num = + (rem_nram_num + align_num - 1) / align_num * align_num; + __memcpy(a_nram, a_offset + loop_num * data_nram_num, + rem_nram_num * sizeof(float), NRAM2NRAM); + __memcpy(b_nram, b_offset + loop_num * data_nram_num, + rem_nram_num * sizeof(float), NRAM2NRAM); + __bang_add(a_nram, a_nram, b_nram, rem_align_num); + __memcpy(output_offset + loop_num * data_nram_num, a_nram, + rem_nram_num * sizeof(float), NRAM2NRAM); + + } +// __sync_cluster(); +__memcpy(d_c+d_c_offset,sram_buffer,n*sizeof(float),NRAM2LDRAM,ldc*sizeof(float),n*sizeof(float),span-1); + +// if (id == 0) { +// printf("id0,d_c:\n"); +// for(int i = 0; i < m; i++) +// { +// for(int j = 0; j < n; j++) +// { +// printf("%8.3f ",((float*)sram_buffer)[i*n+j]); +// } +// printf("\n"); +// } +// printf("add_c, d_c:\n"); +// for(int i = 0; i < m; i++) +// { +// for(int j = 0; j < n; j++) +// { +// printf("%8.3f ",((float*)d_c)[i*ldc+j]); +// } +// printf("\n"); +// } +// } + +// if(id == 0) +// { +// printf("id: 1, a_sram:\n"); +// for(int i = 0; i < m; i++) +// { +// for(int j = 0; j < n; j++) +// { +// printf("%8.3f ",((float*)a_sram)[i*n+j]); +// } +// printf("\n"); +// } +// printf("\nid: 1, sram_buffer:\n"); +// for(int i = 0; i < m; i++) +// { +// for(int j = 0; j < n; j++) +// { +// printf("%8.3f ",((float*)sram_buffer)[i*n+j]); +// } +// printf("\n"); +// } +// printf("\n"); +// } + +// __sync_cluster(); + +} + + +mluOpStatus_t complex_malloc(size_t size, float** workspace) +{ + CNRT_CHECK(cnrtMalloc((void **)workspace, size)); + + return MLUOP_STATUS_SUCCESS; +} + +__mlu_global__ +void complex_inverse_kernel(int batch, float *rd_input, float *id_input, int ld_input, int stride_input, float* rd_output, float* id_output, int ld_output, int stride_output, int m) +{ + int id = taskId; + id 
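// complex_add_c above treats beta only as a switch: beta == 0 overwrites the
// destination plane with the source, anything else accumulates; it is not a
// general scale factor. Elementwise, per plane (sketch):
void add_c_ref(float beta, float* C, const float* S, int ldc, int lds,
               int m, int n) {
  for (int i = 0; i < m; ++i)
    for (int j = 0; j < n; ++j)
      C[i * ldc + j] = (beta == 0.0f) ? S[i * lds + j]
                                      : C[i * ldc + j] + S[i * lds + j];
}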
= taskId % 4; + // __nram__ uint8_t nram_buffer[MAX_NRAM_SIZE]; + + + // if (id == 0) { + // // __memcpy(sram_buffer,d_input,m*m*sizeof(float),GDRAM2SRAM); + // __memcpy(sram_buffer,d_input,m*sizeof(float),GDRAM2SRAM,m*sizeof(float),ld_input*sizeof(float),m-1); + // } + // __sync_cluster(); + + + int span = m/taskDim; + int start = id * span; + if (id == 3) + { + span = m - 3 * span; + } + float* nram_offset = (float*)nram_buffer; + //diag_start:m*m ld:m + float* rdiag_start = (float*)nram_offset; + float* idiag_start = rdiag_start + m * m; + //nram_src1存放列主序的计算完成的矩阵 m*m ld:height + float* r_nram_src1 = idiag_start + m * m; + float* i_nram_src1 = r_nram_src1 + m * m; + float* r_nram_src2 = i_nram_src1 + m * m; + float* i_nram_src2 = r_nram_src2 + m; + float* r_mul_result = i_nram_src2 + m; + float* i_mul_result = r_mul_result + m; + //nram_dst存放计算结果,占用空间m*m ld为span + float* r_nram_dst = i_mul_result + m; + float* i_nram_dst = r_nram_dst + m * m; + + // float* diag_start = ((float*)sram_buffer) + m * start + start; + int height = m - start; + + __memset_nram(nram_offset, 4 * m * m * 2+2, (float)ZERO); + + if(span > 0) + { + __memcpy(rdiag_start,rd_input + ld_input * start + start,height*sizeof(float),LDRAM2NRAM,m*sizeof(float),ld_input*sizeof(float),height-1); + __memcpy(idiag_start,id_input + ld_input * start + start,height*sizeof(float),LDRAM2NRAM,m*sizeof(float),ld_input*sizeof(float),height-1); + } + // if(id == 0) + // { + // //print rdiag_start + // printf("diag_start:\n"); + // for(int i = 0; i < m; i++) + // { + // for(int j = 0; j < m; j++) + // { + // printf("%.3f ",rdiag_start[i*m+j]); + // } + // printf("\n"); + // } + // } + + + //计算对角线元素的倒数 + float result = 0.0; + for(int i = 0; i < height; i++) + { + int off = i * m + i; + result = rdiag_start[off]; + result = 1.0 / result; + r_nram_src1[i*height+i] = result; //i_nram_src1对应位置为0 + r_nram_dst[i*span + i] = result; + rdiag_start[off] = result; + + } + + + for(int i = 1; i < height; i++) + { + __memcpy(r_nram_src2,rdiag_start+i*m,i*sizeof(float),NRAM2NRAM); + __memcpy(i_nram_src2,idiag_start+i*m,i*sizeof(float),NRAM2NRAM); + int num = std::min(i, span); + float diag_element = rdiag_start[i*m+i]; + for(int j = 0; j < num; j++) + { + float r_temp = 0.0; + float i_temp = 0.0; + // if(id == 0 && i == 3) + // { + // printf("nram_src2:\n"); + // for(int k = 0; k < i; k++) + // { + // printf("%.3f ",nram_src2[k]); + // } + // printf("\n"); + // printf("nrma_src1:\n"); + // for(int k = 0; k < i; k++) + // { + // printf("%.3f ",nram_src1[j*height+k]); + // } + // printf("diag_element:%.3f\n",diag_element); + // } + //符号可能要改变 + __bang_mul(r_mul_result,r_nram_src2,r_nram_src1+j*height,i); + __bang_mul(i_mul_result,r_nram_src2,i_nram_src1+j*height,i); + for(int k = 0; k< i; k++) + { + r_temp += r_mul_result[k]; + i_temp += i_mul_result[k]; + // i_temp -= i_mul_result[k]; + } + //符号可能要改变 + __bang_mul(r_mul_result,i_nram_src2,i_nram_src1+j*height,i); + __bang_mul(i_mul_result,i_nram_src2,r_nram_src1+j*height,i); + for(int k = 0; k< i; k++) + { + r_temp += r_mul_result[k]; + i_temp -= i_mul_result[k]; + } + r_temp = r_temp * -1.0 * diag_element; + i_temp = i_temp * -1.0 * diag_element; + r_nram_dst[i*span+j] = r_temp; + i_nram_dst[i*span+j] = i_temp; + r_nram_src1[j*height+i] = r_temp; + i_nram_src1[j*height+i] = i_temp; + } + __sync(); + + } + + __sync(); + + // if(id == 0) + // { + // printf("id:0, r_nram_dst:\n"); + // for(int i = 0; i < height; i++) + // { + // for(int j = 0; j < span; j++) + // { + // printf("%8.3f 
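// complex_inverse_kernel follows the same forward substitution as the real
// inverse kernels, with the diagonal of the Cholesky factor taken as real.
// The textbook identity, for a complex lower-triangular L with real diagonal:
//   X[i][i] = 1 / L[i][i]
//   X[i][j] = -(1 / L[i][i]) * sum_{k=j..i-1} L[i][k] * X[k][j]   for j < i
// A split-plane host reference of that identity (the kernel's own comments
// note its sign handling may still need adjustment, so this states the math,
// not the exact behaviour of the code above):
void complex_lower_inverse_ref(const float* Lre, const float* Lim, int lda,
                               float* Xre, float* Xim, int m) {
  for (int i = 0; i < m * m; ++i) { Xre[i] = 0.0f; Xim[i] = 0.0f; }
  for (int i = 0; i < m; ++i) Xre[i * m + i] = 1.0f / Lre[i * lda + i];
  for (int i = 1; i < m; ++i) {
    float inv_d = Xre[i * m + i];
    for (int j = 0; j < i; ++j) {
      float sr = 0.0f, si = 0.0f;
      for (int k = j; k < i; ++k) {        // sum of L[i][k] * X[k][j]
        sr += Lre[i * lda + k] * Xre[k * m + j] - Lim[i * lda + k] * Xim[k * m + j];
        si += Lre[i * lda + k] * Xim[k * m + j] + Lim[i * lda + k] * Xre[k * m + j];
      }
      Xre[i * m + j] = -sr * inv_d;
      Xim[i * m + j] = -si * inv_d;
    }
  }
}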
",r_nram_dst[i*span+j]); + // } + // printf("\n"); + // } + // printf("\n"); + // printf("id:0, i_nram_dst:\n"); + // for(int i = 0; i < height; i++) + // { + // for(int j = 0; j < span; j++) + // { + // printf("%8.3f ",i_nram_dst[i*span+j]); + // } + // printf("\n"); + // } + // printf("\n"); + // } + + __sync(); + + + if(span > 0) + { + __memcpy(rd_output + ld_output * start + start,r_nram_dst,span*sizeof(float),NRAM2LDRAM,ld_output*sizeof(float),span*sizeof(float),height-1); + __memcpy(id_output + ld_output * start + start,i_nram_dst,span*sizeof(float),NRAM2LDRAM,ld_output*sizeof(float),span*sizeof(float),height-1); + } + + // if(id == 0) + // { + // //printf nram_dst + // printf("last diag_start:\n"); + // for(int i = 0; i < m; i++) + // { + // for(int j = 0; j < m; j++) + // { + // printf("%.3f ",diag_start[i*m+j]); + // } + // printf("\n"); + // } + // } + + +} + + +mluOpStatus_t complex_inverse(int batch, float *rd_input, float *id_input, int ld_input, int stride_input, float* rd_output, float* id_output, int ld_output, int stride_output, int m, mluOpHandle_t handle) +{ + cnrtQueue_t queue; + mluOpGetQueue(handle,&queue); + + cnrtDim3_t dim; + cnrtFunctionType_t func_type = CNRT_FUNC_TYPE_BLOCK; + dim.y = 1; + dim.z = 1; + dim.x = 4; + + KERNEL_CHECK(complex_inverse_kernel<<>>(batch, rd_input, id_input, ld_input, stride_input, rd_output, id_output, ld_output, stride_output, m)); + return MLUOP_STATUS_SUCCESS; +} + + + +//这cgemm其实不是计算a*b,而是计算a*(b^H),即计算a乘b的共轭转置 +mluOpStatus_t cgemm(int batch, bool trans_a, bool trans_b, int m, int n, int k, float alpha, float beta, float* d_ra, float* d_ia, int lda, int stride_a, float* d_rb, float* d_ib, int ldb, int stride_b, float* d_rc, float* d_ic, int ldc, int stride_c, mluOpHandle_t handle) +{ + float *workspace = NULL; + cnrtQueue_t queue; + mluOpGetQueue(handle,&queue); + CNRT_CHECK(cnrtMalloc((void **)&workspace, sizeof(float)*2*(m*n))); + // float temp1=0, temp2=0; + //print d_a + // printf("before transpose, d_a:\n"); + // for(int i = 0; i < batch; i++) + // { + // printf("batch:%d\n",i); + // for(int j = 0; j < m; j++) + // { + // for(int h = 0; h < k; h++) + // { + // cnrtMemcpy(&temp1, d_a+i*stride_a*2+j*lda*2+h*2, sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); + // cnrtMemcpy(&temp2, d_a+i*stride_a*2+j*lda*2+h*2+1, sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); + // printf("%8.3f,%8.3f ",temp1,temp2); + // } + // printf("\n"); + // } + // } + + // printf("after transpose, d_a:\n"); + // for(int i = 0; i < 2; i++) + // { + // for(int j = 0; j < m; j++) + // { + // for(int h = 0; h < k; h++) + // { + // cnrtMemcpy(&temp1, workspace+i*m*k+j*lda+h, sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); + // printf("%8.3f",temp1); + // } + // printf("\n"); + // } + // printf("\n"); + // } + + float *r_c, *i_c; + // float *res_ra_ib; + r_c = workspace; + i_c = r_c+m*n; + // res_ra_ib = res_ia_rb+m*n; + + int s_stride_a = stride_a; + int s_stride_b = stride_a; + int s_stride_c = stride_a; + + // float temp = 0; + // printf("before sgemm:\n"); + // printf("r_a:\n"); + // for(int i = 0; i < m; i++) + // { + // for(int j = 0; j < k; j++) + // { + // cnrtMemcpy(&temp, d_ra+i*lda+j, sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); + // printf("%8.3f ",temp); + // } + // printf("\n"); + // } + // printf("i_a:\n"); + // for(int i = 0; i < m; i++) + // { + // for(int j = 0; j < k; j++) + // { + // cnrtMemcpy(&temp, d_ia+i*lda+j, sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); + // printf("%8.3f ",temp); + // } + // printf("\n"); + // } + // printf("r_b:\n"); + // 
for(int i = 0; i < m; i++) + // { + // for(int j = 0; j < k; j++) + // { + // cnrtMemcpy(&temp, d_rb+i*lda+j, sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); + // printf("%8.3f ",temp); + // } + // printf("\n"); + // } + // printf("i_b:\n"); + // for(int i = 0; i < m; i++) + // { + // for(int j = 0; j < k; j++) + // { + // cnrtMemcpy(&temp, d_ib+i*lda+j, sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); + // printf("%8.3f ",temp); + // } + // printf("\n"); + // } + sgemm(batch,trans_a,trans_b,m,n,k,alpha,0,d_ra,lda,s_stride_a,d_rb,ldb,s_stride_b,r_c,n,s_stride_c,handle); + cnrtQueueSync(queue); + + sgemm(batch,trans_a,trans_b,m,n,k,alpha,1,d_ia,lda,s_stride_a,d_ib,ldb,s_stride_b,r_c,n,s_stride_c,handle); + cnrtQueueSync(queue); + // sgemm(batch,trans_a,trans_b,m,n,k,alpha,0,d_ra,lda,s_stride_a,d_ib,ldb,s_stride_b,i_c,n,s_stride_c,handle); + // cnrtQueueSync(queue); + // sgemm(batch,trans_a,trans_b,m,n,k,-alpha,1,d_ia,lda,s_stride_a,d_rb,ldb,s_stride_b,i_c,n,s_stride_c,handle); + // cnrtQueueSync(queue); + + sgemm(batch,trans_a,trans_b,m,n,k,-alpha,0,d_ra,lda,s_stride_a,d_ib,ldb,s_stride_b,i_c,n,s_stride_c,handle); + cnrtQueueSync(queue); + sgemm(batch,trans_a,trans_b,m,n,k,alpha,1,d_ia,lda,s_stride_a,d_rb,ldb,s_stride_b,i_c,n,s_stride_c,handle); + cnrtQueueSync(queue); + + + + // printf("beta:%f\n",beta); + + // printf("r_c:\n"); + // for(int i = 0; i < m; i++) + // { + // for(int j = 0; j < n; j++) + // { + // cnrtMemcpy(&temp, r_c+i*lda+j, sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); + // cnrtQueueSync(queue); + // printf("%8.3f ",temp); + // } + // printf("\n"); + // } + cnrtDim3_t dim; + cnrtFunctionType_t func_type = CNRT_FUNC_TYPE_BLOCK; + dim.y = 1; + dim.z = 1; + dim.x = 4; + KERNEL_CHECK(complex_add_c<<>>(batch,stride_c,beta,d_rc,r_c,ldc,n,m,n)); + KERNEL_CHECK(complex_add_c<<>>(batch,stride_c,beta,d_ic,i_c,ldc,n,m,n)); + cnrtQueueSync(queue); + return MLUOP_STATUS_SUCCESS; +} + + +//这cgemm其实不是计算a*b,而是计算a*(b^H),即计算a乘b的共轭转置 +mluOpStatus_t cgemm_real(int batch, bool trans_a, bool trans_b, int m, int n, int k, float alpha, float beta, float* d_ra, float* d_ia, int lda, int stride_a, float* d_rb, float* d_ib, int ldb, int stride_b, float* d_rc, float* d_ic, int ldc, int stride_c, mluOpHandle_t handle) +{ + float *workspace = NULL; + cnrtQueue_t queue; + mluOpGetQueue(handle,&queue); + CNRT_CHECK(cnrtMalloc((void **)&workspace, sizeof(float)*2*(m*n))); + // float temp1=0, temp2=0; + //print d_a + // printf("before transpose, d_a:\n"); + // for(int i = 0; i < batch; i++) + // { + // printf("batch:%d\n",i); + // for(int j = 0; j < m; j++) + // { + // for(int h = 0; h < k; h++) + // { + // cnrtMemcpy(&temp1, d_a+i*stride_a*2+j*lda*2+h*2, sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); + // cnrtMemcpy(&temp2, d_a+i*stride_a*2+j*lda*2+h*2+1, sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); + // printf("%8.3f,%8.3f ",temp1,temp2); + // } + // printf("\n"); + // } + // } + + // printf("after transpose, d_a:\n"); + // for(int i = 0; i < 2; i++) + // { + // for(int j = 0; j < m; j++) + // { + // for(int h = 0; h < k; h++) + // { + // cnrtMemcpy(&temp1, workspace+i*m*k+j*lda+h, sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); + // printf("%8.3f",temp1); + // } + // printf("\n"); + // } + // printf("\n"); + // } + + float *r_c, *i_c; + // float *res_ra_ib; + r_c = workspace; + i_c = r_c+m*n; + // res_ra_ib = res_ia_rb+m*n; + + int s_stride_a = stride_a; + int s_stride_b = stride_a; + int s_stride_c = stride_a; + + // float temp = 0; + // printf("before sgemm:\n"); + // printf("r_a:\n"); + // for(int i = 0; i < m; 
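// cgemm above builds C = alpha * A * B^H on split planes from four real
// GEMMs: Re(C) gets Re(A)*Re(B)^T + Im(A)*Im(B)^T and Im(C) gets
// Im(A)*Re(B)^T - Re(A)*Im(B)^T, with beta deciding whether the existing C is
// kept. A naive single-matrix reference (row-major, A is m x k, B is n x k,
// C is m x n; no batching, alpha = 1, beta = 0):
void cgemm_bh_ref(const float* Ar, const float* Ai,
                  const float* Br, const float* Bi,
                  float* Cr, float* Ci, int m, int n, int k) {
  for (int i = 0; i < m; ++i)
    for (int j = 0; j < n; ++j) {
      float cr = 0.0f, ci = 0.0f;
      for (int p = 0; p < k; ++p) {        // C[i][j] = sum_p A[i][p] * conj(B[j][p])
        cr += Ar[i * k + p] * Br[j * k + p] + Ai[i * k + p] * Bi[j * k + p];
        ci += Ai[i * k + p] * Br[j * k + p] - Ar[i * k + p] * Bi[j * k + p];
      }
      Cr[i * n + j] = cr;
      Ci[i * n + j] = ci;
    }
}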
i++) + // { + // for(int j = 0; j < k; j++) + // { + // cnrtMemcpy(&temp, d_ra+i*lda+j, sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); + // printf("%8.3f ",temp); + // } + // printf("\n"); + // } + // printf("i_a:\n"); + // for(int i = 0; i < m; i++) + // { + // for(int j = 0; j < k; j++) + // { + // cnrtMemcpy(&temp, d_ia+i*lda+j, sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); + // printf("%8.3f ",temp); + // } + // printf("\n"); + // } + // printf("r_b:\n"); + // for(int i = 0; i < m; i++) + // { + // for(int j = 0; j < k; j++) + // { + // cnrtMemcpy(&temp, d_rb+i*lda+j, sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); + // printf("%8.3f ",temp); + // } + // printf("\n"); + // } + // printf("i_b:\n"); + // for(int i = 0; i < m; i++) + // { + // for(int j = 0; j < k; j++) + // { + // cnrtMemcpy(&temp, d_ib+i*lda+j, sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); + // printf("%8.3f ",temp); + // } + // printf("\n"); + // } + sgemm(batch,trans_a,trans_b,m,n,k,alpha,0,d_ra,lda,s_stride_a,d_rb,ldb,s_stride_b,r_c,n,s_stride_c,handle); + cnrtQueueSync(queue); + + + + sgemm(batch,trans_a,trans_b,m,n,k,-alpha,1,d_ia,lda,s_stride_a,d_ib,ldb,s_stride_b,r_c,n,s_stride_c,handle); + cnrtQueueSync(queue); + // sgemm(batch,trans_a,trans_b,m,n,k,alpha,0,d_ra,lda,s_stride_a,d_ib,ldb,s_stride_b,i_c,n,s_stride_c,handle); + // cnrtQueueSync(queue); + // sgemm(batch,trans_a,trans_b,m,n,k,-alpha,1,d_ia,lda,s_stride_a,d_rb,ldb,s_stride_b,i_c,n,s_stride_c,handle); + // cnrtQueueSync(queue); + + sgemm(batch,trans_a,trans_b,m,n,k,alpha,0,d_ra,lda,s_stride_a,d_ib,ldb,s_stride_b,i_c,n,s_stride_c,handle); + cnrtQueueSync(queue); + sgemm(batch,trans_a,trans_b,m,n,k,alpha,1,d_ia,lda,s_stride_a,d_rb,ldb,s_stride_b,i_c,n,s_stride_c,handle); + cnrtQueueSync(queue); + + + + // printf("beta:%f\n",beta); + + // printf("r_c:\n"); + // for(int i = 0; i < m; i++) + // { + // for(int j = 0; j < n; j++) + // { + // cnrtMemcpy(&temp, r_c+i*lda+j, sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); + // cnrtQueueSync(queue); + // printf("%8.3f ",temp); + // } + // printf("\n"); + // } + cnrtDim3_t dim; + cnrtFunctionType_t func_type = CNRT_FUNC_TYPE_BLOCK; + dim.y = 1; + dim.z = 1; + dim.x = 4; + KERNEL_CHECK(complex_add_c<<>>(batch,stride_c,beta,d_rc,r_c,ldc,n,m,n)); + KERNEL_CHECK(complex_add_c<<>>(batch,stride_c,beta,d_ic,i_c,ldc,n,m,n)); + cnrtQueueSync(queue); + return MLUOP_STATUS_SUCCESS; +} + +mluOpStatus_t ctrsm(int batch, int stride, int m, int n, float* rd_a, float* id_a, int lda, float* rd_b, float* id_b, int ldb, mluOpHandle_t handle) +{ + if(n==0) + return MLUOP_STATUS_SUCCESS; + cnrtQueue_t queue; + mluOpGetQueue(handle,&queue); + float* workspace; + CNRT_CHECK(cnrtMalloc((void **)&workspace, batch*m*m*2*sizeof(float))); + CNRT_CHECK(cnrtMemset(workspace, 0.0, batch*m*m*2*sizeof(float))); + float *r_inverse_result, *i_inverse_result; + r_inverse_result = workspace; + i_inverse_result = r_inverse_result + batch*m*m; + + float temp_h; + printf("before inverse, real:\n"); + for(int i = 0; i < m; i++) + { + for(int j = 0; j < m; j++) + { + cnrtMemcpy(&temp_h, rd_a+i*lda+j, sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); + printf("%8.3f ",temp_h); + } + printf("\n"); + } + printf("before inverse, imag:\n"); + for(int i = 0; i < m; i++) + { + for(int j = 0; j < m; j++) + { + cnrtMemcpy(&temp_h, id_a+i*lda+j, sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); + printf("%8.3f ",temp_h); + } + printf("\n"); + } + + complex_inverse(batch,rd_a,id_a,lda,stride,r_inverse_result,i_inverse_result,m,m*m,m,handle); + cnrtQueueSync(queue); + + printf("inverse 
result real:\n"); + for(int i = 0; i < m; i++) + { + for(int j = 0; j < m; j++) + { + cnrtMemcpy(&temp_h, r_inverse_result+i*m+j, sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); + printf("%8.3f ",temp_h); + } + printf("\n"); + } + printf("inverse result imag:\n"); + for(int i = 0; i < m; i++) + { + for(int j = 0; j < m; j++) + { + cnrtMemcpy(&temp_h, i_inverse_result+i*m+j, sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); + printf("%8.3f ",temp_h); + } + printf("\n"); + } + cgemm_real(batch,false,true,n,m,m,1.0,0.0f,rd_b,id_b,ldb,stride,r_inverse_result,i_inverse_result,m,m*m,rd_b,id_b,ldb,stride,handle); + printf("trsm result real:\n"); + for(int i = 0; i < n; i++) + { + for(int j = 0; j < m; j++) + { + cnrtMemcpy(&temp_h, rd_b+i*ldb+j, sizeof(float),CNRT_MEM_TRANS_DIR_DEV2HOST); + printf("%8.3f ",temp_h); + } + printf("\n"); + } + + printf("trsm result imag:\n"); + for(int i = 0; i < n; i++) + { + for(int j = 0; j < m; j++) + { + cnrtMemcpy(&temp_h, id_b+i*ldb+j, sizeof(float),CNRT_MEM_TRANS_DIR_DEV2HOST); + printf("%8.3f ",temp_h); + } + printf("\n"); + } + + return MLUOP_STATUS_SUCCESS; +} + +mluOpStatus_t cherk(int batch, int stride, int n,int k, float* rd_a, float* id_a, int lda, float* rd_c, float* id_c, int ldc, mluOpHandle_t handle) +{ + if(k==0) + return MLUOP_STATUS_SUCCESS; + cgemm(batch,false,true,n,n,k,-1.0f,1.0f,rd_a,id_a,lda,stride,rd_a,id_a,lda,stride,rd_c,id_c,ldc,stride,handle); + set_half_zero(batch,stride,rd_c,ldc,n,handle); + set_half_zero(batch,stride,id_c,ldc,n,handle); + return MLUOP_STATUS_SUCCESS; +} + + +mluOpStatus_t mlu_cpotrf_rectile(int batch, int stride, int n, int recnb, float* drA, float* diA, int lda, mluOpHandle_t handle) +{ + cnrtQueue_t queue; + mluOpGetQueue(handle,&queue); + if(n <= recnb) + { + mlu_cpotf_lpin(batch,stride, n, lda, drA, diA, queue); + } + else + { + int n1 = n/2; + int n2 = n-n1; + mlu_cpotrf_rectile(batch,stride,n1,recnb,drA,diA,lda,handle); + ctrsm(batch,stride,n1,n2,drA,diA,lda,drA+n1*lda,diA+n1*lda,lda,handle); + cherk(batch,stride,n2,n1,drA+n1*lda,diA+n1*lda,lda,drA+n1*lda+n1,diA+n1*lda+n1,lda,handle); + mlu_cpotrf_rectile(batch,stride,n2,recnb,drA+n1*lda+n1,diA+n1*lda+n1,lda,handle); + + } + return MLUOP_STATUS_SUCCESS; +} \ No newline at end of file diff --git a/mlu_op.h b/mlu_op.h index e75c6d715..372965a46 100644 --- a/mlu_op.h +++ b/mlu_op.h @@ -28,8 +28,8 @@ ******************************************************************************/ #define MLUOP_MAJOR 1 -#define MLUOP_MINOR 1 -#define MLUOP_PATCHLEVEL 1 +#define MLUOP_MINOR 0 +#define MLUOP_PATCHLEVEL 0 /********************************************************************************* * MLUOP_VERSION is deprecated and not recommended. To get the version of MLUOP, use @@ -8755,228 +8755,6 @@ mluOpActiveRotatedFilterForward(const mluOpHandle_t handle, const mluOpTensorDescriptor_t output_desc, void *output); -/*! - * @brief Enumeration variables describing the attributes of the AdamW computation. - */ -typedef enum { - MLUOP_ADAMW_WEIGHT_DECAY = 0, - /*!< Set the weight_decay attribute for the AdamW operation. */ - MLUOP_ADAMW_GRAD_SCALE = 1, - /*!< Set the grad_scale attribute for the AdamW operation. */ - MLUOP_ADAMW_USE_NESTEROV = 2, - /*!< Specifies whether to use nesterov on the AdamW operation. */ -} mluOpAdamWDescAttribute_t; - -typedef struct mluOpAdamWStruct *mluOpAdamWDescriptor_t; - -// Group: AdamW -/*! - * @brief Updates each attribute by using AdamW. 
- * - * @param[in] handle - * Handle to a Cambricon MLU-OPS context that is used to manage MLU devices - * and queues in the AdamW operation. For detailed information, - * see ::mluOpHandle_t. - * @param[in] adamw_desc - * A host pointer to the AdamW descriptor that holds information about the AdamW operation. - * @param[in] param_desc - * The descriptor of the tensor, which contains the dimension and layout of param. - * For detailed information, see ::mluOpTensorDescriptor_t. - * @param[in] param - * Pointer to the MLU memory that stores the param tensor. - * @param[in] paramh_desc - * The descriptor of the tensor, which contains the dimension and layout of param_h. - * For detailed information, see ::mluOpTensorDescriptor_t. - * @param[in] param_h - * Pointer to the MLU memory that stores the param_h tensor. - * @param[in] momentum_desc - * The descriptor of the tensor, which contains the dimension and layout of momentum. - * For detailed information, see ::mluOpTensorDescriptor_t. - * @param[in] momentum - * Pointer to the MLU memory that stores the momentum tensor. - * @param[in] velocity_desc - * The descriptor of the tensor, which contains the dimension and layout of velocity. - * For detailed information, see ::mluOpTensorDescriptor_t. - * @param[in] velocity - * Pointer to the MLU memory that stores the velocity tensor. - * @param[in] grad_desc - * The descriptor of the tensor, which contains the dimension and layout of grad. - * For detailed information, see ::mluOpTensorDescriptor_t. - * @param[in] grad - * Pointer to the MLU memory that stores the grad tensor. - * @param[in] lr - * A scalar of lr factor that is used for AdamW. - * @param[in] beta1 - * A scalar of beta1 factor that is used for AdamW. - * @param[in] beta2 - * A scalar of beta2 factor that is used for AdamW. - * @param[in] bias1 - * A scalar of bias1 factor that is used for AdamW. - * @param[in] bias2 - * A scalar of bias2 factor that is used for AdamW. - * @param[in] epsilon - * A scalar of epsilon factor that is used for AdamW. - * @par Return - * - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_BAD_PARAM, ::MLUOP_STATUS_ARCH_MISMATCH - * - * @par Data Type - * - The supported data types of input and output tensors are as follows: - * - param tensor: float - * - param_h tensor: bfloat16 - * - momentum tensor: float - * - velocity tensor: float - * - grad tensor: bfloat16 - * - * @par Data Layout - * - The supported data layouts of \b param tensor, \b param_h tensor, \b momentum tensor, \b velocity tensor, and \b - * grad tensor are as follows: - * - param tensor: \p MLUOP_LAYOUT_ARRAY - * - param_h tensor: \p MLUOP_LAYOUT_ARRAY - * - momentum tensor: \p MLUOP_LAYOUT_ARRAY - * - velocity tensor: \p MLUOP_LAYOUT_ARRAY - * - grad tensor: \p MLUOP_LAYOUT_ARRAY - * - * @par Scale Limitation - * - None. - * - * @par API Dependency - * - None. - * - * @par Note - * - None. - * - * @par Example - * - None. 
- * - * @par Reference - * - https://github.com/OpenBMB/BMTrain/blob/6abcf772aa1e120192f7656e55c4adbcde53c886/csrc/cuda/adam_cuda.cu - */ -mluOpStatus_t MLUOP_WIN_API -mluOpAdamW(mluOpHandle_t handle, - mluOpAdamWDescriptor_t adamw_desc, - const mluOpTensorDescriptor_t param_desc, - void *param, - const mluOpTensorDescriptor_t paramh_desc, - void *param_h, - const mluOpTensorDescriptor_t momentum_desc, - void *momentum, - const mluOpTensorDescriptor_t velocity_desc, - void *velocity, - const mluOpTensorDescriptor_t grad_desc, - void *grad, - const float lr, - const float beta1, - const float beta2, - const float bias1, - const float bias2, - const float epsilon); - -// Group: AdamW -/*! - * @brief Creates a descriptor pointed by \p adamw_desc for AdamW operation. - * The information is defined in ::mluOpAdamWDescriptor_t. - * For more information about the descriptor, see "Cambricon MLU-OPS User Guide". - * - * @param[out] adamw_desc - * A host pointer to the AdamW descriptor that holds information about the - * AdamW operation. - * - * @par Return - * - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_ALLOC_FAILED - * - * @par API Dependency - * - After calling this function, call ::mluOpSetAdamWDescAttr function to initialize - * and set the information to the AdamW descriptor. - * - * @par Note - * - None. - * - * @par Requirements - * - None. - * - * @par Example - * - None. - * - * @par Reference - * - None. - */ -mluOpStatus_t MLUOP_WIN_API -mluOpCreateAdamWDescriptor(mluOpAdamWDescriptor_t *adamw_desc); - -// Group: AdamW -/*! - * @brief Initializes the descriptor \b adamw_desc that was previously created with - * ::mluOpCreateAdamWDescriptor function, and sets AdamW information - * to the descriptor \b adamw_desc. The information includes \b weight_decay , \b grad_scale - * and \b use_nesterov for AdamW operation. - * - * @param[in] adamw_desc - * The descriptor of the AdamW operation. For detailed information, - * see ::mluOpAdamWDescriptor_t. - * @param[in] attr - * Attribute of AdamW descriptor to be set. For detailed information, - * see ::mluOpAdamWDescAttribute_t. - * @param[in] buf - * A host pointer to the attribute value set by this function. - * @param[in] size_in_bytes - * Buffer in bytes for verification. - * - * @par Return - * - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_BAD_PARAM - * - * @par Data Type - * - None. - * - * @par Data Layout - * - None. - * - * @par Scale Limitation - * - None. - * - * @par API Dependency - * - This function should be called after ::mluOpCreateAdamWDescriptor. - * - * @par Note - * - None. - * - * @par Example - * - None. - * - * @par Reference - * - None. - */ -mluOpStatus_t MLUOP_WIN_API -mluOpSetAdamWDescAttr(mluOpAdamWDescriptor_t adamw_desc, - mluOpAdamWDescAttribute_t attr, - const void *buf, - const size_t size_in_bytes); - -// Group: AdamW -/*! - * @brief Destroys the AdamW descriptor \p adamw_desc that was previously created by - * ::mluOpCreateAdamWDescriptor. - * - * @param[in] adamw_desc - * The AdamW descriptor to be destroyed. - * @par Return - * - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_BAD_PARAM - * - * @par Note - * - Call this function after calling ::mluOpAdamW. - * - It is necessary to call this function to destroy the AdamW descriptor to avoid memory leak. - * - * @par Requirements - * - None. - * - * @par Example - * - None. - * - * @par Reference - * - None - */ -mluOpStatus_t MLUOP_WIN_API -mluOpDestroyAdamWDescriptor(mluOpAdamWDescriptor_t adamw_desc); - // Group: DeformRoiPool /*! 
* @brief Computes deformable roi pooling over \b input tensor. This function firstly divides the obtained @@ -14362,59 +14140,18 @@ mluOpExecFFT(mluOpHandle_t handle, mluOpStatus_t MLUOP_WIN_API mluOpDestroyFFTPlan(mluOpFFTPlan_t fft_plan); -/*! - * @brief Computes the Cholesky factorization of a Hermitian positive-definite matrix. - * - * @param[in] handle - * Handle to a Cambricon MLUOP context that is used to manage MLU devices and queues in the - * deformable convolution operation. For detailed information, see ::mluOpHandle_t. - * - * @param[in] input_desc - * The descriptor of the input matrix to factorise, it is an n×n Hermitian matrix, - * only the lower or upper part is meaningful. - * - * @param[in] d_input - * Pointer to the MLU memory that stores the input matrix. - * - * @param[in] output_desc - * The descriptor of the result matrix, it is an n×n lower triangular matrix or an upper triangular matrix. - * - * @param[out] d_output - * Pointer to the MLU memory that stores the result matrix. - * - * @param[in] upper - * upper indicates which part of the matrix is used. - * - * @par Return - * - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_EXECUTION_FAILED - * - * @par Data Type - * - float32 - * - * @par Data Layout - * - None. - * - * @par Scale Limitation - * - None. - * - * @par API Dependency - * - None. - * - * @par Note - * - None. - * - * @par Example. - * - None. - * - * @par Reference. - * - None. - */ mluOpStatus_t MLUOP_WIN_API mluOpCholesky(mluOpHandle_t handle, const mluOpTensorDescriptor_t input_desc, float* d_input, const mluOpTensorDescriptor_t output_desc, - float* d_output,bool upper); + float* d_output,bool upper, float* workspace); + +mluOpStatus_t MLUOP_WIN_API +mluOpGetCholeskyWorkspace(mluOpTensorDescriptor_t input_desc, + size_t* size, float** workspace); + + #if defined(__cplusplus) } #endif diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/cholesky.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/cholesky.cpp index d99c2922d..0ace85efa 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/cholesky.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/cholesky.cpp @@ -22,6 +22,7 @@ *************************************************************************/ #include #include "cholesky.h" +// #include "kernels/kernel_wrapper/export_statement.h" namespace mluoptest { @@ -39,7 +40,7 @@ void CholeskyExecutor::paramCheck() { } -void set_matrix_zero(float*A, bool upper, bool trans_, int n_, int ldda_) +void set_matrix_zero(float*A, bool upper, bool trans_, int n_, int ldda_, mluOpDataType_t type_) { if(trans_) { @@ -47,16 +48,50 @@ void set_matrix_zero(float*A, bool upper, bool trans_, int n_, int ldda_) { for (int j = 0; j < ldda_; j++) { - if(upper) { - if(i > j) - A[j + i * ldda_] = 0.0; + if(i >= j) + { + if(i == j && type_ == MLUOP_DTYPE_COMPLEX_FLOAT) + { + A[(j + i * ldda_)*2+1] = 0.0; + } + else + { + if(type_ == MLUOP_DTYPE_FLOAT) + A[j + i * ldda_] = 0.0; + else + { + A[(j + i * ldda_)*2] = 0.0; + A[(j + i * ldda_)*2+1] = 0.0; + } + } + + } } else { - if(i < j) - A[j + i * ldda_] = 0.0; + if(i <= j) + { + if(i == j) + { + if(type_ == MLUOP_DTYPE_COMPLEX_FLOAT) + { + A[(j + i * ldda_)*2+1] = 0.0; + } + } + else + { + if(type_ == MLUOP_DTYPE_FLOAT) + A[j + i * ldda_] = 0.0; + else + { + A[(j + i * ldda_)*2] = 0.0; + A[(j + i * ldda_)*2+1] = 0.0; + } + } + + } } } } @@ -68,14 +103,22 @@ void set_matrix_zero(float*A, bool upper, bool trans_, int n_, int ldda_) for (int j = 0; j < ldda_; j++) { if((i > j && ~upper)||(i < j && upper)) - A[j + i * ldda_] 
= 0.0; + { + if(type_ == MLUOP_DTYPE_FLOAT) + A[j + i * ldda_] = 0.0; + else + { + A[(j + i * ldda_)*2] = 0.0; + A[(j + i * ldda_)*2+1] = 0.0; + } + } } } } } -void trans_mul(float*A, float*C, int lda,bool upper_, bool trans_, int n_, int ldda_) +void trans_mul(float*A, float*C, int lda,bool upper_, bool trans_, int n_, int ldda_, mluOpDataType_t type_) { if(trans_) { @@ -83,7 +126,14 @@ void trans_mul(float*A, float*C, int lda,bool upper_, bool trans_, int n_, int l { for(int j = 0;j < n_; j++) { - A[i+j*lda] = 0.0; + if(type_ == MLUOP_DTYPE_FLOAT) + A[i+j*lda] = 0.0; + // else if(type_ == MLUOP_DTYPE_COMPLEX_FLOAT && ((upper_==false && j >= i) || (upper_==true && j <= i))) + else + { + A[j*lda*2+i*2] = 0.0; + A[j*lda*2+i*2+1] = 0.0; + } for(int k = 0; k <=i; k++) { if(upper_==false) @@ -92,7 +142,18 @@ void trans_mul(float*A, float*C, int lda,bool upper_, bool trans_, int n_, int l continue; else { - A[i+j*lda] += (C[k+i*lda]*C[k+j*lda]); + if(type_ == MLUOP_DTYPE_FLOAT) + A[i+j*lda] += (C[k+i*lda]*C[k+j*lda]); + else + { + A[(i+j*lda)*2] += (C[(k+i*lda)*2]*C[(k+j*lda)*2]+C[(k+i*lda)*2+1]*C[(k+j*lda)*2+1]); + A[(i+j*lda)*2+1] += (C[(k+i*lda)*2]*C[(k+j*lda)*2+1]-C[(k+i*lda)*2+1]*C[(k+j*lda)*2]); + } + } + if(type_ != MLUOP_DTYPE_FLOAT && j != i) + { + A[(j+i*lda)*2] = A[(i+j*lda)*2]; + A[(j+i*lda)*2+1] = -A[(i+j*lda)*2+1]; } } else @@ -101,10 +162,22 @@ void trans_mul(float*A, float*C, int lda,bool upper_, bool trans_, int n_, int l continue; else { - A[i+j*lda] += (C[k*lda+i]*C[k*lda+j]); + if(type_ == MLUOP_DTYPE_FLOAT) + A[i+j*lda] += (C[k*lda+i]*C[k*lda+j]); + else + { + A[(i+j*lda)*2] += (C[(k*lda+i)*2]*C[(k*lda+j)*2]+C[(k*lda+i)*2+1]*C[(k*lda+j)*2+1]); + A[(i+j*lda)*2+1] += (-C[(k*lda+i)*2]*C[(k*lda+j)*2+1]+C[(k*lda+i)*2+1]*C[(k*lda+j)*2]); + } } + } } + if(type_ != MLUOP_DTYPE_FLOAT &&((upper_==false && j > i) || (upper_==true && j < i))) + { + A[(j+i*lda)*2] = A[(i+j*lda)*2]; + A[(j+i*lda)*2+1] = -A[(i+j*lda)*2+1]; + } } } } @@ -114,49 +187,83 @@ void trans_mul(float*A, float*C, int lda,bool upper_, bool trans_, int n_, int l { for(int j = 0;j < n_; j++) { - A[j+i*lda] = 0.0; + if(type_ == MLUOP_DTYPE_FLOAT) + A[j+i*lda] = 0.0; + else + { + A[(i+j*lda)*2] = 0.0; + A[(i+j*lda)*2+1] = 0.0; + } for(int k = 0; k <=i; k++) { if(j < i) continue; - A[j+i*lda] += (C[j+k*lda]*C[i+k*lda]); + if(type_ == MLUOP_DTYPE_FLOAT) + A[j+i*lda] += (C[j+k*lda]*C[i+k*lda]); + else + { + A[(j+i*lda)*2] += (C[(j+k*lda)*2]*C[(i+k*lda)*2]); + A[(j+i*lda)*2+1] += (C[(j+k*lda)*2+1]*C[(i+k*lda)*2+1]); + } } } } - } - + } } -void print_matrix(float*A, int lda, bool trans_, int n_, int ldda_) +void print_matrix(int batch, float*A, int lda, bool trans_, int n_, int ldda_, mluOpDataType_t type_) { - if(trans_) + for(int x = 0; x < batch; x++) { - for(int i = 0; i getProtoNode()->cholesky_param().upper(); int dim_size = input_shape.dims_size(); + type_ = input_desc_->dtype; + type_size_ = type_ == MLUOP_DTYPE_FLOAT ? 
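// trans_mul above rebuilds the test input as the product of the zeroed
// triangular factor with its own (conjugate) transpose, which is what makes
// the matrix handed to mluOpCholesky symmetric or Hermitian positive
// semi-definite by construction. The real case, as a plain host sketch
// (row-major, lower-triangular L, illustrative names):
void build_spd_ref(const float* L, float* A, int n, int lda) {
  for (int i = 0; i < n; ++i)
    for (int j = 0; j < n; ++j) {
      float s = 0.0f;
      int kmax = (i < j) ? i : j;          // L is lower-triangular
      for (int k = 0; k <= kmax; ++k) s += L[i * lda + k] * L[j * lda + k];
      A[i * lda + j] = s;                  // A = L * L^T
    }
}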
4 : 8; if(dim_size ==2) { n_ = input_shape.dims(0); int dim = input_desc_->dim; stride_ = (input_desc_->strides)[dim-1]; ldda_ = input_desc_->dims[1]; - printf("n:%d,lda:%d,stride:%d,upper:%d,trans:%d\n",n_,ldda_,stride_,upper_,trans_); int size = input_desc_->dims[0]; - printf("size:%d, dim:%d, \n",size,dim); printf("strides:\n"); for(int i = 0; i < dim; i++) @@ -206,24 +313,29 @@ void CholeskyExecutor::prepareComputeParam() printf("\n"); printf("data vector length : %ld\n",data_vector_.size()); } - - std::memcpy(dev_c,dev_a,sizeof(float)*n_*ldda_); - set_matrix_zero((float*)dev_c,upper_,trans_,n_,ldda_); - trans_mul(dev_a,dev_c,ldda_,upper_,trans_,n_,ldda_); - printf("matrix A:\n"); + +// printf("matrix random:\n"); +// print_matrix(batch_size_, dev_a,ldda_,trans_,n_,ldda_,type_); + std::memcpy(dev_c,dev_a,type_size_*n_*ldda_); + set_matrix_zero((float*)dev_c,upper_,trans_,n_,ldda_,type_); + trans_mul(dev_a,dev_c,ldda_,upper_,trans_,n_,ldda_,type_); + if(dim_size == 3) { for(int i = 1; i < batch_size_;i++) { - std::memcpy(dev_a+i*n_*ldda_,dev_a,sizeof(float)*n_*ldda_); - std::memcpy(dev_c+i*n_*ldda_,dev_c,sizeof(float)*n_*ldda_); + std::memcpy(dev_a+(i*n_*ldda_)*type_size_/4,dev_a,type_size_*n_*ldda_); + std::memcpy(dev_c+(i*n_*ldda_)*type_size_/4,dev_c,type_size_*n_*ldda_); } } - +// printf("matrix A:\n"); +// print_matrix(batch_size_,dev_a,ldda_,trans_,n_,ldda_,type_); +// printf("matrix C:\n"); +// print_matrix(batch_size_,dev_c,ldda_,trans_,n_,ldda_,type_); GTEST_CHECK(CNRT_RET_SUCCESS == - cnrtMemcpy(dev_d, dev_a, sizeof(float)*n_*ldda_*batch_size_, CNRT_MEM_TRANS_DIR_HOST2DEV)); + cnrtMemcpy(dev_d, dev_a, type_size_*n_*ldda_*batch_size_, CNRT_MEM_TRANS_DIR_HOST2DEV)); float* cpu_a = cpu_fp32_input_[0]; - std::memcpy(cpu_a,dev_a,sizeof(float)*n_*ldda_); + std::memcpy(cpu_a,dev_a,type_size_*n_*ldda_); printf("end prepare compute.\n"); } @@ -239,66 +351,139 @@ void CholeskyExecutor::compute() { auto h_output = (float*)(data_vector_[1].host_ptr); auto d_intput = (float*)(data_vector_[0].device_ptr); auto d_output = (float*)(data_vector_[1].device_ptr); + std::memcpy(h_input,h_output,type_size_*n_*ldda_*batch_size_); GTEST_CHECK(CNRT_RET_SUCCESS == - cnrtMemcpy(h_output, d_intput, sizeof(float)*n_*ldda_*batch_size_, CNRT_MEM_TRANS_DIR_DEV2HOST)); - + cnrtMemcpy(h_output, d_intput, type_size_*n_*ldda_*batch_size_, CNRT_MEM_TRANS_DIR_DEV2HOST)); +// printf("mlu before cholesky result:\n"); +// print_matrix(batch_size_,h_output,ldda_,trans_,n_,ldda_,type_); interface_timer_.start(); - MLUOP_CHECK(mluOpCholesky(handle_,input_desc_,d_intput, output_desc_, d_output, upper_)); + float* workspace = nullptr; + size_t size = 0; + mluOpGetCholeskyWorkspace(input_desc_,&size,&workspace); + MLUOP_CHECK(mluOpCholesky(handle_,input_desc_,d_intput, output_desc_, d_output, upper_,workspace)); interface_timer_.stop(); - GTEST_CHECK(CNRT_RET_SUCCESS == - cnrtMemcpy(h_output, d_output, sizeof(float)*n_*ldda_, CNRT_MEM_TRANS_DIR_DEV2HOST)); - printf("mlu after cholesky result:\n"); + GTEST_CHECK(CNRT_RET_SUCCESS == + cnrtMemcpy(h_output, d_output, batch_size_*type_size_*n_*ldda_, CNRT_MEM_TRANS_DIR_DEV2HOST)); +// printf("mlu after cholesky result:\n"); +// print_matrix(batch_size_,h_output,ldda_,trans_,n_,ldda_,type_); return; } void CholeskyExecutor::cpuCompute() { - +// auto dev_a = (float*)(data_vector_[0].host_ptr); + auto dev_c = (float*)(data_vector_[0].host_ptr); +// std::memcpy(dev_c,dev_a,sizeof(float)*n_*ldda_); float* cpu_a = cpu_fp32_input_[0]; float* cpu_c = cpu_fp32_output_[0]; if(n_ > 2000) 
{ - auto dev_c = (float*)(data_vector_[1].host_ptr); - std::memcpy(cpu_c,dev_c,sizeof(float)*n_*ldda_*batch_size_); + std::memcpy(cpu_c,dev_c,type_size_*n_*ldda_*batch_size_); return; } - std::memcpy(cpu_c,cpu_a,sizeof(float)*n_*ldda_); + std::memcpy(cpu_c,cpu_a,type_size_*n_*ldda_); if(trans_) { for(int i = 0; i < n_; i++) { - float dia = cpu_c[i+i*ldda_]; + float dia; + if(type_ == MLUOP_DTYPE_FLOAT) + { + dia = cpu_c[i+i*ldda_]; + } + else + { + dia = cpu_c[(i+i*ldda_)*2]; + } float dia_root = sqrt(dia); - cpu_c[i+i*ldda_] = sqrt(dia); + + if(type_ == MLUOP_DTYPE_FLOAT) + { + cpu_c[i+i*ldda_] = sqrt(dia); + } + else + { + cpu_c[(i+i*ldda_)*2] = sqrt(dia); + } if(upper_==false) { - for(int j = i+1;j Date: Fri, 28 Jun 2024 19:43:03 +0800 Subject: [PATCH 06/27] finish complex batch --- kernels/cholesky/cholesky.cpp | 253 ++- kernels/cholesky/cholesky.h | 14 +- kernels/cholesky/cholesky_union1.mlu | 1452 ++---------------- kernels/cholesky/complex_cholesky_union1.mlu | 1113 +++++++------- 4 files changed, 751 insertions(+), 2081 deletions(-) diff --git a/kernels/cholesky/cholesky.cpp b/kernels/cholesky/cholesky.cpp index f0e263a59..838985d40 100644 --- a/kernels/cholesky/cholesky.cpp +++ b/kernels/cholesky/cholesky.cpp @@ -1,11 +1,5 @@ #include "cholesky.h" -//dA:输入被分解方阵 -//dC:cholesky分解结果方阵 -//trans -> false: col major; true: row major -//uplo -> false: lower; true: upper -//ldda:leading dimension - mluOpStatus_t MLUOP_WIN_API mluOpGetCholeskyWorkspace(mluOpTensorDescriptor_t input_desc, size_t* size, float** workspace) { PARAM_CHECK("mluOpCholesky", input_desc != NULL); @@ -36,11 +30,9 @@ mluOpStatus_t MLUOP_WIN_API mluOpGetCholeskyWorkspace(mluOpTensorDescriptor_t in batch_size = input_desc->dims[0]; size_a = input_desc->dims[1]; } - printf("fuck you!"); if (dtype == MLUOP_DTYPE_FLOAT) { - // *size = size_a*size_a*sizeof(float); *size = 0; } else @@ -57,30 +49,10 @@ mluOpStatus_t MLUOP_WIN_API mluOpGetCholeskyWorkspace(mluOpTensorDescriptor_t in } mluOpStatus_t MLUOP_WIN_API -mluOpCholesky(mluOpHandle_t handle,const mluOpTensorDescriptor_t input_desc,float* d_input, const mluOpTensorDescriptor_t output_desc, float* d_output,bool upper, float* workspace) +calculate_body(mluOpHandle_t handle,int batch_size, const mluOpTensorDescriptor_t input_desc,float* d_input, const mluOpTensorDescriptor_t output_desc, float* d_output,bool upper, float* workspace) { - PARAM_CHECK("mluOpCholesky", handle != NULL); - PARAM_CHECK("mluOpCholesky", input_desc != NULL); - PARAM_CHECK("mluOpCholesky", d_input != NULL); - PARAM_CHECK("mluOpCholesky", output_desc != NULL); - PARAM_CHECK("mluOpCholesky", d_output != NULL); - - PARAM_CHECK("mluOpCholesky", input_desc->dim == 2||input_desc->dim == 3); - PARAM_CHECK("mluOpCholesky", output_desc->dim == input_desc->dim); - PARAM_CHECK("mluOpCholesky", input_desc->dims[0] > 0); - PARAM_CHECK("mluOpCholesky", input_desc->dims[1] > 0); - PARAM_CHECK("mluOpCholesky", output_desc->dims[0] > 0); - PARAM_CHECK("mluOpCholesky", output_desc->dims[1] > 0); - - if(input_desc->dim == 3) - { - PARAM_CHECK("mluOpCholesky", input_desc->dims[2] > 0); - PARAM_CHECK("mluOpCholesky", output_desc->dims[2] > 0); - } - mluOpDataType_t dtype = input_desc->dtype; - PARAM_CHECK("mluOpCholesky", dtype == output_desc->dtype); - PARAM_CHECK("mluOpCholesky", dtype == MLUOP_DTYPE_FLOAT || dtype == MLUOP_DTYPE_COMPLEX_FLOAT); + printf("batch_size:%d\n",batch_size); int recnb = REC_NB; @@ -91,7 +63,6 @@ mluOpCholesky(mluOpHandle_t handle,const mluOpTensorDescriptor_t input_desc,floa int type_size = 
(dtype == MLUOP_DTYPE_FLOAT) ? 4 : 8; int size_a = 0, lda = 0, size_c = 0, ldc = 0; - int batch_size = 1; if(dim == 2) { size_a = input_desc->dims[0]; @@ -101,7 +72,6 @@ mluOpCholesky(mluOpHandle_t handle,const mluOpTensorDescriptor_t input_desc,floa } else if(dim == 3) { - batch_size = input_desc->dims[0]; size_a = input_desc->dims[1]; lda = input_desc->dims[2]; size_c = output_desc->dims[1]; @@ -113,18 +83,13 @@ mluOpCholesky(mluOpHandle_t handle,const mluOpTensorDescriptor_t input_desc,floa float* work_space_h; CNRT_CHECK(cnrtMalloc((void **)&work_space, NB*NB*sizeof(float))); CNRT_CHECK(cnrtMemset(work_space, 0, NB*NB*sizeof(float))); - work_space_h = (float*)malloc(NB*NB*sizeof(float)); + work_space_h = (float*)malloc(batch_size*2*lda*lda*sizeof(float)); PARAM_CHECK("mluOpCholesky", lda >= size_a); PARAM_CHECK("mluOpCholesky", ldc >= size_c); cnrtQueue_t queue; mluOpGetQueue(handle,&queue); - // CNRT_CHECK(cnrtSetDevice(0)); - // CNRT_CHECK(cnrtQueueCreate(&queue)); - // cnrtNotifier_t start, end; - // CNRT_CHECK(cnrtNotifierCreate(&start)); - // CNRT_CHECK(cnrtNotifierCreate(&end)); int jb; const float s_one = 1.0; @@ -150,11 +115,9 @@ mluOpCholesky(mluOpHandle_t handle,const mluOpTensorDescriptor_t input_desc,floa } cnrtQueueSync(queue); - - //TODO:检查拷贝开销 - int stride = size_a*lda; - //printf original matrix + + if(dtype == MLUOP_DTYPE_FLOAT) { @@ -168,7 +131,6 @@ mluOpCholesky(mluOpHandle_t handle,const mluOpTensorDescriptor_t input_desc,floa cnrtQueueSync(queue); CHECK_RETURN("mluOpCholesky", mlu_spotrf_rectile(batch_size,stride,is_row_major,false,jb,recnb,OFFSET_ROW(d_output,j,j),lda,j, handle)); - // cnrtQueueSync(queue); if(j+jb < row) { CHECK_RETURN("mluOpCholesky", @@ -196,14 +158,16 @@ mluOpCholesky(mluOpHandle_t handle,const mluOpTensorDescriptor_t input_desc,floa else { recnb = CREC_NB; - // int nb = NB; - int nb = NB; + int nb = CNB; int row = lda; - float* r_start = d_output; //实数首地址 - float* i_start = d_output + size_a*lda;//虚数首地址 + float* r_start = d_output; + float* i_start = d_output + size_a*lda; + stride *= 2; - set_half_zero(batch_size, size_a*lda, r_start, lda, lda, handle); - set_half_zero(batch_size, size_a*lda, i_start, lda, lda, handle); + + set_half_zero(batch_size, stride, r_start, lda, lda, handle); + set_half_zero(batch_size, stride, i_start, lda, lda, handle); + cnrtQueueSync(queue); for(int j = 0; j < row; j+=nb) { @@ -213,7 +177,7 @@ mluOpCholesky(mluOpHandle_t handle,const mluOpTensorDescriptor_t input_desc,floa cnrtQueueSync(queue); CHECK_RETURN("mluOpCholesky", mlu_cpotrf_rectile(batch_size,stride,jb,recnb,r_start+j*lda+j,i_start+j*lda+j,lda, handle)); - // cnrtQueueSync(queue); + cnrtQueueSync(queue); if(j+jb < row) { CHECK_RETURN("mluOpCholesky", @@ -233,118 +197,111 @@ mluOpCholesky(mluOpHandle_t handle,const mluOpTensorDescriptor_t input_desc,floa } } - // printf("after transpose, d_output:\n"); - // for(int i = 0; i < 2; i++) - // { - // for(int j = 0; j < lda; j++) - // { - // for(int h = 0; h < lda; h++) - // { - // cnrtMemcpy(work_space_h, d_output+i*lda*lda+j*lda+h, sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); - // printf("%8.3f",*work_space_h); - // } - // printf("\n"); - // } - // printf("\n"); - // } - - - printf("before finally, transpose:\n"); - cnrtMemcpy(work_space_h, d_output, sizeof(float)*lda*lda*2, CNRT_MEM_TRANS_DIR_DEV2HOST); - printf("real result:\n"); - for(int j = 0; j < lda; j++) - { - for(int h = 0; h < lda; h++) + CHECK_RETURN("mluOpCholesky", + transpose(batch_size,2,size_a*size_a,d_output,workspace,handle)); + 
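For reference, the float branch above follows the textbook blocked Cholesky ordering: for each nb-wide panel it (1) uses ssyrk to fold the already factored columns into the diagonal tile, (2) factors the tile with mlu_spotrf_rectile, (3) uses sgemm to fold the same columns into the rows below the tile, and (4) finishes those rows with strsm. The host-side sketch below illustrates the same ordering with naive loops; it is a minimal reference only, and the name cholesky_blocked_ref is not part of the operator.

#include <algorithm>
#include <cmath>
#include <vector>

// Naive reference of the blocked Cholesky ordering used above
// (row-major, lower triangle overwritten with L). Illustrative only: the
// operator runs ssyrk / mlu_spotrf_rectile / sgemm / strsm on the device
// where this sketch uses plain loops.
static void cholesky_blocked_ref(std::vector<float> &a, int n, int lda, int nb) {
  for (int j = 0; j < n; j += nb) {
    int jb = std::min(nb, n - j);
    // (1) SYRK: fold the already factored columns 0..j-1 into the diagonal tile.
    for (int r = j; r < j + jb; ++r)
      for (int c = j; c <= r; ++c)
        for (int k = 0; k < j; ++k)
          a[r * lda + c] -= a[r * lda + k] * a[c * lda + k];
    // (2) POTRF: unblocked factorization of the jb x jb diagonal tile.
    for (int d = j; d < j + jb; ++d) {
      a[d * lda + d] = std::sqrt(a[d * lda + d]);
      for (int r = d + 1; r < j + jb; ++r)
        a[r * lda + d] /= a[d * lda + d];
      for (int r = d + 1; r < j + jb; ++r)
        for (int c = d + 1; c <= r; ++c)
          a[r * lda + c] -= a[r * lda + d] * a[c * lda + d];
    }
    // (3) GEMM: fold the same columns into the rows below the tile.
    for (int r = j + jb; r < n; ++r)
      for (int c = j; c < j + jb; ++c)
        for (int k = 0; k < j; ++k)
          a[r * lda + c] -= a[r * lda + k] * a[c * lda + k];
    // (4) TRSM: finish those rows against the factored tile (forward substitution).
    for (int r = j + jb; r < n; ++r)
      for (int c = j; c < j + jb; ++c) {
        float s = a[r * lda + c];
        for (int k = j; k < c; ++k)
          s -= a[r * lda + k] * a[c * lda + k];
        a[r * lda + c] = s / a[c * lda + c];
      }
  }
}

The complex branch below follows the same per-panel ordering, with cgemm/ctrsm operating on separate real and imaginary planes.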
cnrtQueueSync(queue); + if(batch_size > 16) { - printf("%8.3f",work_space_h[j*lda+h]); + CNRT_CHECK(cnrtMemcpy(d_output, workspace, type_size*size_a*lda*16, CNRT_MEM_TRANS_DIR_DEV2DEV)); + CNRT_CHECK(cnrtMemcpy(d_output+type_size/4*size_a*lda*16, workspace+type_size/4*size_a*lda*16, type_size*size_a*lda*(batch_size-16), CNRT_MEM_TRANS_DIR_DEV2DEV)); } - printf("\n"); - } - printf("\n"); - printf("imag result:\n"); - for(int j = 0; j < lda; j++) - { - for(int h = 0; h < lda; h++) + else { - printf("%8.3f",work_space_h[lda*lda+j*lda+h]); + CNRT_CHECK(cnrtMemcpy(d_output, workspace, type_size*size_a*lda*batch_size, CNRT_MEM_TRANS_DIR_DEV2DEV)); } - printf("\n"); + + } + - // CHECK_RETURN("mluOpCholesky", - // sgemm(batch_size, false,true,row-j-jb,jb,j,-1.0f,1.0f, - // OFFSET_ROW(d_output,j+jb,0),lda,stride, - // OFFSET_ROW(d_output,j,0),lda,stride, - // OFFSET_ROW(d_output,j+jb,j),lda,stride, handle)); - // cnrtQueueSync(queue); - - // cnrtMemcpy(work_space_h, d_output, sizeof(float)*lda*lda*2, CNRT_MEM_TRANS_DIR_DEV2HOST); - // for(int i = 0; i < 2; i++) - // { - // for(int j = 0; j < lda; j++) - // { - // for(int h = 0; h < lda; h++) - // { - // // cnrtMemcpy(work_space_h, d_output+i*lda*lda+j*lda+h, sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); - // printf("%8.3f",work_space_h[i*lda*lda+j*lda+h]); - // } - // printf("\n"); - // } - // printf("\n"); - // } - CHECK_RETURN("mluOpCholesky", - transpose(batch_size,2,size_a*size_a,d_output,workspace,handle)); - cnrtQueueSync(queue); - CNRT_CHECK(cnrtMemcpy(d_output, workspace, type_size*size_a*lda*batch_size, CNRT_MEM_TRANS_DIR_DEV2DEV)); - - // printf("after transpose, d_a:\n"); - - // for(int j = 0; j < lda; j++) - // { - // for(int h = 0; h < lda; h++) - // { - // cnrtMemcpy(work_space_h, d_output+j*lda*2+h*2, sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); - // cnrtMemcpy((work_space_h+1), d_output+j*lda*2+h*2+1, sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); - // printf("%8.3f,%8.3f ",work_space_h[0],work_space_h[1]); - // } - // printf("\n"); - // } + cnrtQueueSync(queue); + + return MLUOP_STATUS_SUCCESS; +} + + +mluOpStatus_t MLUOP_WIN_API +mluOpCholesky(mluOpHandle_t handle,const mluOpTensorDescriptor_t input_desc,float* d_input, const mluOpTensorDescriptor_t output_desc, float* d_output,bool upper, float* workspace) +{ + PARAM_CHECK("mluOpCholesky", handle != NULL); + PARAM_CHECK("mluOpCholesky", input_desc != NULL); + PARAM_CHECK("mluOpCholesky", d_input != NULL); + PARAM_CHECK("mluOpCholesky", output_desc != NULL); + PARAM_CHECK("mluOpCholesky", d_output != NULL); + + PARAM_CHECK("mluOpCholesky", input_desc->dim == 2||input_desc->dim == 3); + PARAM_CHECK("mluOpCholesky", output_desc->dim == input_desc->dim); + PARAM_CHECK("mluOpCholesky", input_desc->dims[0] > 0); + PARAM_CHECK("mluOpCholesky", input_desc->dims[1] > 0); + PARAM_CHECK("mluOpCholesky", output_desc->dims[0] > 0); + PARAM_CHECK("mluOpCholesky", output_desc->dims[1] > 0); + + cnrtQueue_t queue; + mluOpGetQueue(handle,&queue); + + if(input_desc->dim == 3) + { + PARAM_CHECK("mluOpCholesky", input_desc->dims[2] > 0); + PARAM_CHECK("mluOpCholesky", output_desc->dims[2] > 0); } - - // printf("matrix after calculate:\n"); - // for(int i = 0; i < batch_size; i++) - // { - // printf("batch %d:\n",i); - // for(int j = 0; j < size_a; j++) - // { - // for(int k = 0; k < size_a; k++) - // { - // cnrtMemcpy(work_space_h, d_output + i*stride+j*lda+k, sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); - // printf("%.2f ",work_space_h[0]); - // } - // printf("\n"); - // } - // } - + mluOpDataType_t 
dtype = input_desc->dtype; + PARAM_CHECK("mluOpCholesky", dtype == output_desc->dtype); + PARAM_CHECK("mluOpCholesky", dtype == MLUOP_DTYPE_FLOAT || dtype == MLUOP_DTYPE_COMPLEX_FLOAT); + + int dim = input_desc->dim; + int size_a = 0, lda = 0, size_c = 0, ldc = 0; + + int batch_size = 1; + if(dim == 2) + { + size_a = input_desc->dims[0]; + lda = input_desc->dims[1]; + size_c = output_desc->dims[0]; + ldc = output_desc->dims[1]; + } + else if(dim == 3) + { + batch_size = input_desc->dims[0]; + size_a = input_desc->dims[1]; + lda = input_desc->dims[2]; + size_c = output_desc->dims[1]; + ldc = output_desc->dims[2]; + } + + float* last_addr = d_input+batch_size*size_a*lda*2; + float* temp_addr = last_addr - 10; + + + float* work_space_h; + work_space_h = (float*)malloc(100*sizeof(float)); + cnrtMemcpy(work_space_h, temp_addr, 10*sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); + printf("last 10 input:\n"); + for(int i = 0; i < 10;i++) + { + printf("%8.3f ",work_space_h[i]); + } + printf("\n"); + + + int type_size = (dtype == MLUOP_DTYPE_FLOAT) ? 4 : 8; + if(type_size == 8 && batch_size > 16 && size_a > 2000) + { + int stride = 2*size_a*lda; + calculate_body(handle, 16, input_desc,d_input, output_desc, d_output, upper, workspace); + cnrtQueueSync(queue); + calculate_body(handle, batch_size-16, input_desc,d_input+16*stride, output_desc, d_output+16*stride, upper, workspace); + } + else + { + calculate_body(handle, batch_size, input_desc,d_input, output_desc, d_output, upper, workspace); + } - cnrtQueueSync(queue); - // cnrtMemcpy(work_space_h, work_space, sizeof(float)*NB*NB, CNRT_MEM_TRANS_DIR_DEV2HOST); - //print work_space_h - // printf("work_space:\n"); - // for(int i = 0; i < NB; i++) - // { - // for(int j = 0; j < NB; j++) - // { - // printf("%.2f ",work_space_h[i*NB+j]); - // } - // printf("\n"); - // } return MLUOP_STATUS_SUCCESS; } \ No newline at end of file diff --git a/kernels/cholesky/cholesky.h b/kernels/cholesky/cholesky.h index 88afe06e1..e06f66c87 100644 --- a/kernels/cholesky/cholesky.h +++ b/kernels/cholesky/cholesky.h @@ -20,16 +20,17 @@ #include "kernels/kernel.h" #include "kernels/utils/cnnl_helper.h" - +#define CNB (16) #define REC_NB (16) #define POTF_NB ((REC_NB)/4) -#define CREC_NB (4) +#define CREC_NB (8) #define CPOTF_NB ((CREC_NB)/4) #define __CNRT_FUNC_TYPE__ CNRT_FUNC_TYPE_UNION1 #define TASK_NUM (4) -#define NB (8) +#define NB (32) + #define CLUSTER_NUM 1 -#define M (TASK_NUM * POTF_NB) //POTF边长 +#define M (TASK_NUM * POTF_NB) #define ZERO 0.0 #define SHARED_MEM_SIZE (((M*POTF_NB/TASK_NUM * 4)+(POTF_NB * POTF_NB))) #define OFFSET_ROW(A, i, j) A + ((i) * (lda) + (j)) @@ -37,14 +38,12 @@ mluOpStatus_t mlu_spotrf_rectile(int batch, int stride, bool trans, bool uplo, int n, int recnb, float* dA, int ldda, int gbstep, mluOpHandle_t handle); -// void mluOpCholesky(bool trans, bool uplo, int n, float* dA, float* dC, int ldda); mluOpStatus_t ssyrk(int batch, int stride, bool upper, bool trans,int n, int k, float* d_a, int ldda, float* d_c, int lddc, mluOpHandle_t handle); mluOpStatus_t sgemm(int batch, bool trans_a, bool trans_b, int m, int n, int k, float alpha, float beta, float* d_a,int lda, int stride_a, float* d_b, int ldb, int stride_b, float* d_c, int ldc, int stride_c, mluOpHandle_t handle); -//side:true->right -// false->left + mluOpStatus_t strsm(int batch, int stride, bool upper, bool trans, int m, int n, float* d_a, int ldda, float* d_b, int lddb, mluOpHandle_t handle); mluOpStatus_t transpose(int batch, int m, int n,float* d_input,float* d_output, mluOpHandle_t 
handle); @@ -55,7 +54,6 @@ mluOpStatus_t cgemm(int batch, bool trans_a, bool trans_b, int m, int n, int k, mluOpStatus_t complex_malloc(size_t size, float** workspace); -// mluOpStatus_t complex_set_half_zero(int batch, int stride, float* d_a, int m, int ld); mluOpStatus_t set_half_zero(int batch,int stride,float* d_a, int lda, int m, mluOpHandle_t handle); diff --git a/kernels/cholesky/cholesky_union1.mlu b/kernels/cholesky/cholesky_union1.mlu index 669fceb35..0f06250e8 100644 --- a/kernels/cholesky/cholesky_union1.mlu +++ b/kernels/cholesky/cholesky_union1.mlu @@ -1,6 +1,5 @@ #include "cholesky.h" __nram__ uint8_t nram_buffer[MAX_NRAM_SIZE]; -// __mlu_shared__ uint8_t sram_buffer[MAX_SRAM_SIZE]; __mlu_func__ void sgemm_fixwidth_device(int m, int k, @@ -9,23 +8,17 @@ void sgemm_fixwidth_device(int m, int k, { int id = taskId % 4; - int span = POTF_NB;//span = remain > POTF_NB ? POTF_NB : remain; - + int span = POTF_NB; - //这个m和M不同!这个m是前面M-i的 - __nram__ float rC[M * POTF_NB/TASK_NUM ]; + __nram__ float rC[M * POTF_NB/TASK_NUM ]; __nram__ float rA[M * POTF_NB/TASK_NUM ]; __nram__ float rp[M * POTF_NB/TASK_NUM ]; __nram__ float rB[POTF_NB * POTF_NB]; - // __nram__ float rC_inter[POTF_NB * POTF_NB]; - - // __wram__ float wB[POTF_NB * POTF_NB]; - - //void __memcpy(void *dst, const void *src, unsigned int size, mluMemcpyDirection_t dir, unsigned int dst_stride, int src_stride, unsigned int segnum) - // row major - if(id*span 0; int span = (remain > POTF_NB||remain <= 0) ? POTF_NB : remain; float *rA = (float*)nram_buffer + id * NB * NB * 4; - // float *rA = (float*)nram_buffer; - float *rB = rA + NB * NB; - float *rC = rB + NB * NB; - float* rp = rC + NB * NB; - int span_b = POTF_NB > m ? m : POTF_NB; - - - __memset_nram(rC,span_b*span,(float)ZERO); - if(if_execute) { if(k>0) @@ -302,49 +143,14 @@ void sgemm_anywidth_device(int m, int k, __memcpy(rA,A0+id*POTF_NB*lda,k*sizeof(float),SRAM2NRAM,NB*sizeof(float),lda*sizeof(float),span-1); } __memcpy(rp,sC+id*POTF_NB*lda,span_b*sizeof(float),SRAM2NRAM,span_b*sizeof(float),lda*sizeof(float),span-1); - } if(k>0) { - // if(id == 0) - // { - // printf("tmdsb\n"); - // for(int i = 0; i < m; i++) - // { - // for(int j = 0; j < 7;j ++) - // { - // printf("%.3f ",A0[i*lda+j]); - // } - // printf("\n"); - // } - // printf("k:%d\n",k); - // printf("m:%d\n",m); - // printf("lda:%d\n",lda); - // printf("span_b:%d\n",span_b); - // } - __memcpy(rB,A0,k*sizeof(float),SRAM2NRAM,NB*sizeof(float),lda*sizeof(float),span_b-1); - // if(id == 0) - // { - // printf("after memcpy rB:\n"); - // for(int i = 0; i 0 && if_execute) - // { - // __memcpy(rB,A0,1*sizeof(float),SRAM2NRAM,1*sizeof(float),1*sizeof(float),0); - // } - - + } __sync_cluster(); for(int i = 0; i < span; i++) @@ -357,7 +163,6 @@ void sgemm_anywidth_device(int m, int k, } } } - __bang_sub(rp,rp,rC,span_b * span); __sync_cluster(); @@ -374,15 +179,8 @@ void sgemm_anywidth_device(int m, int k, { __memcpy(sC+(id*POTF_NB*lda),rp,span_b*sizeof(float),NRAM2SRAM,lda*sizeof(float),span_b*sizeof(float),span-1); } - - - - - - } - static __mlu_func__ void spotf2_sminout_anysize_device(int m, float *A, int lda) { float factor; @@ -399,17 +197,14 @@ static __mlu_func__ void spotf2_sminout_anysize_device(int m, float *A, int lda) __sync_cluster(); for(int i = 0; i < span; i++) { - // if(iter == 0) - // printf("before: %.3f\n",A[i*POTF_NB+iter+id*span*POTF_NB]); + + if(if_execute) - A[i*lda+iter+id*POTF_NB*lda] *= factor; - // if(iter == 0) - // printf("after: %.3f\n",A[i*POTF_NB+iter+id*span*POTF_NB]); + 
A[i*lda+iter+id*POTF_NB*lda] *= factor; } __sync_cluster(); - - //TODO:可能要重点优化 + if(if_execute) { for(int i = iter + 1; i < iter_num; i++) @@ -436,223 +231,47 @@ __mlu_func__ void spotf2_smlpout_fixwidth_device(const int m, float *A0, float * float* sdata_A = shared_data; float* sdata_B = shared_data + m *POTF_NB/TASK_NUM * 4; - // if(localstep == 8) - // { - // if(id == 0) - // { - // printf("before sgemm:\n"); - // } - // for(int i = 0; i =j) - // { - // A[i*lda+j] = sdata_A[coreId*span*POTF_NB+i*POTF_NB+j]; - // } - // } - __memcpy(A+(i*lda),sdata_A+i*POTF_NB,(i+1)*sizeof(float),SRAM2LDRAM); - // __memcpy(work_space+(i*NB),sdata_A+i*POTF_NB,(i+1)*sizeof(float),SRAM2LDRAM); + __memcpy(A+(i*lda),sdata_A+i*POTF_NB,(i+1)*sizeof(float),SRAM2LDRAM); + } } else if(id*span < m) { __memcpy(A+(id*POTF_NB*lda),sdata_A+coreId*POTF_NB*POTF_NB,POTF_NB*sizeof(float),SRAM2LDRAM,lda*sizeof(float),POTF_NB*sizeof(float),span-1); - // __memcpy(work_space+(id*POTF_NB*NB),sdata_A+coreId*POTF_NB*POTF_NB,POTF_NB*sizeof(float),SRAM2LDRAM,NB*sizeof(float),POTF_NB*sizeof(float),span-1); + } - __sync_cluster(); - // if(id==3) - // { - // printf("sdata:\n"); - // for(int i = 0; i 0; - // int span = remain > POTF_NB ? POTF_NB : remain; - // __sync_cluster(); sgemm_anywidth_device(m, localstep, A0, lda, A, nullptr); - - // __sync_cluster(); - - // if(id==3) - // { - // printf("sdata:\n"); - // for(int i = 0; i =j) - // // { - // // A[i*lda+j] = sdata_A[coreId*span*POTF_NB+i*POTF_NB+j]; - // // } - // // } - // __memcpy(A+(i*lda),sdata_A+i*POTF_NB,(i+1)*sizeof(float),SRAM2LDRAM); - // // __memcpy(work_space+(i*NB),sdata_A+i*POTF_NB,(i+1)*sizeof(float),SRAM2LDRAM); - // } - - // } - // else if(if_execute) - // { - // __memcpy(A+(id*POTF_NB*lda),sdata_A+coreId*POTF_NB*POTF_NB,POTF_NB*sizeof(float),SRAM2LDRAM,lda*sizeof(float),POTF_NB*sizeof(float),span-1); - // // __memcpy(work_space+(id*POTF_NB*NB),sdata_A+coreId*POTF_NB*POTF_NB,POTF_NB*sizeof(float),SRAM2LDRAM,NB*sizeof(float),POTF_NB*sizeof(float),span-1); - // } - - // __sync_cluster(); - - // if(id==3) - // { - // printf("sdata:\n"); - // for(int i = 0; i = batch) - return; - dA = orignA + batch_id * stride; + + float* orignA = dA; + int batch_id = id / 4; + if(batch_id >= batch) + return; + dA = orignA + batch_id * stride; __mlu_shared__ float shared_data[NB * NB]; @@ -669,17 +288,7 @@ __mlu_global__ void spotf2_smlpin_anywidth_kernel(int batch, int stride, bool tr if(id == 0) { __memcpy(shared_data,dA,m*sizeof(float),GDRAM2SRAM,NB*sizeof(float),lda*sizeof(float),m-1); - //printf shared_data - // printf("shared_data:\n"); - // for(int i = 0; i < m; i++) - // { - // for(int j = 0; j < m; j++) - // { - // printf("%.3f ",shared_data[i*NB+j]); - // } - // printf("\n"); - // } - // printf("localstep:%d\n",localstep); + } __sync_cluster(); @@ -696,10 +305,6 @@ __mlu_global__ void spotf2_smlpin_anywidth_kernel(int batch, int stride, bool tr } __sync_cluster(); } - // } - - - } @@ -709,7 +314,7 @@ void small_sgemm_batch(int m, int k, float* A0, const int lda,int width, float* dst, float* nram_remain) { - //dst和dst2形状: m*width src1形状:m*k src2形状:width*k + int ldk = k; int ldm = m; float* src1 = nram_remain; @@ -754,14 +359,14 @@ void small_sgemm_batch(int m, int k, __mlu_func__ void small_sminout_batch(int m, int width, float *dst, float *nram_remain, int lda) { float factor; - // __nram__ uint8_t nram_buffer[MAX_NRAM_SIZE]; + float* diag = dst; for(int iter = 0; iter < width; iter++) { factor=sqrt(diag[iter*width+iter]); factor = 1.0/factor; - // __sync_cluster(); + 
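// Unblocked Cholesky step on column `iter` of the tile, kept entirely in NRAM:
// multiplying the whole column by `factor` = 1/sqrt(diag) turns the diagonal
// entry into sqrt(diag) and scales every entry below it, i.e.
//   L(iter,iter) = sqrt(A(iter,iter));  L(i,iter) = A(i,iter) / L(iter,iter).
// The nested loops that follow apply the rank-1 trailing update
//   A(j,i) -= L(i,iter) * L(j,iter)   for columns i > iter.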
for(int i = 0; i < m; i ++) { dst[i*width+iter] *= factor; @@ -773,154 +378,26 @@ __mlu_func__ void small_sminout_batch(int m, int width, float *dst, float *nram_ { dst[j * width + i ] -= dst[i*width+iter] * dst[j * width + iter]; - // nram_src[j * POTF_NB + i ] -= diag[i*POTF_NB+iter] * nram_src[j * POTF_NB + iter]; + } } __sync(); - - // for(int i = 0; i < width; i++) - // { - // // if(iter == 0) - // // printf("before: %.3f\n",A[i*POTF_NB+iter+id*span*POTF_NB]); - // nram_src[i*POTF_NB+iter] *= factor; - // diag[i*POTF_NB+iter] *= factor; - // // if(iter == 0) - // // printf("after: %.3f\n",A[i*POTF_NB+iter+id*span*POTF_NB]); - - // } - - - - // for(int i = iter + 1; i < POTF_NB; i++) - // { - // for(int j = 0; j < span; j++) - // { - // diag[j * POTF_NB + i ] -= diag[i*POTF_NB+iter] * diag[j * POTF_NB + iter]; - // nram_src[j * POTF_NB + i ] -= diag[i*POTF_NB+iter] * nram_src[j * POTF_NB + iter]; - // } - // } - - + } __sync(); - - // __memcpy(nram_src,A + id *span*POTF_NB,span*span*sizeof(float),SRAM2NRAM); - // __sync(); - //print diag and nram_src - // if(id*span < m) - // { - // printf("before sminout,id:%d\n",id); - // printf("diag:\n"); - // for(int i = 0; i < span; i++) - // { - // for(int j = 0; j < span; j++) - // { - // printf("%.3f ",diag[i*span+j]); - // } - // printf("\n"); - // } - // printf("nram_src:\n"); - // for(int i = 0; i < span; i++) - // { - // for(int j = 0; j < span; j++) - // { - // printf("%.3f ",nram_src[i*span+j]); - // } - // printf("\n"); - // } - // } - - // __sync_cluster(); - - // if(id * span < m) - // { - // printf("after sminout,id:%d\n",id); - // printf("diag:\n"); - // for(int i = 0; i < span; i++) - // { - // for(int j = 0; j < span; j++) - // { - // printf("%.3f ",diag[i*span+j]); - // } - // printf("\n"); - // } - // printf("nram_src:\n"); - // for(int i = 0; i < span; i++) - // { - // for(int j = 0; j < span; j++) - // { - // printf("%.3f ",nram_src[i*span+j]); - // } - // printf("\n"); - // } - // } - - // if(id*span>>(batch, stride, trans, n, dA, ldda, 0,gbstep)); } - // dim.x = TASK_NUM * 4; - - // } - // cnrtQueueSync(queue); - // float* h_i; - // h_i = (float*)malloc(n*n*sizeof(float)); - // for(int i = 0; i < n; i ++) - // { - // cnrtMemcpy(h_i+i*n, work_space+i*NB, n*sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); - // } - - // cnrtQueueSync(queue); - // //print h_i - // printf("work_space after mlu_spotf2_lpin:\n"); - // for(int i = 0; i < n; i++) - // { - // for(int l = 0; l < n; l++) - // { - // printf("%.3f ",h_i[i*n+l]); - // } - // printf("\n"); - // } - // cnrtQueueSync(queue); return MLUOP_STATUS_SUCCESS; } @@ -1037,10 +490,7 @@ __mlu_entry__ void mlu_strsm_rectile_batch_kernel( float* orignB = dB; dA = orignA + batch_id * stride; dB = orignB + batch_id * stride; - - // int remain = n - id * POTF_NB; - // bool if_execute = remain > 0; - // int span = (remain > POTF_NB || remain <= 0) ? 
POTF_NB : remain; + int span = n; int start = 0; @@ -1055,18 +505,11 @@ __mlu_entry__ void mlu_strsm_rectile_batch_kernel( float temp_b = 0, factor = 0; - - __memcpy_async(sA,dA,sizeof(float),GDRAM2NRAM); __memcpy(rBp,OFFSET_B_ROW(dB,start,0),sizeof(float),GDRAM2NRAM,sizeof(float), ldb * sizeof(float), span - 1); - __sync(); - // if(id == 3) - // { - // printf("sA[0]:%.3f\n",sA[0]); - // printf("dA[0]:%.3f\n",dA[0]); - // } + __sync(); if(trans) { @@ -1079,8 +522,8 @@ __mlu_entry__ void mlu_strsm_rectile_batch_kernel( factor = 1.0 / rA[0]; for(int i = 0; i < span; i++) { - //void __bang_mul(float *dst, const float *src0, const float *src1, unsigned int elem_count) - //float __bang_sum(const float *src, unsigned int elem_count) + + rB[i*calc_length] *= factor; } @@ -1097,11 +540,11 @@ __mlu_entry__ void mlu_strsm_rectile_batch_kernel( factor = 1.0 / rA[iter]; for(int i = 0; i < span; i++) { - //void __bang_mul(float *dst, const float *src0, const float *src1, unsigned int elem_count) + __bang_mul(rC+i*calc_length,rA,rB+i*calc_length,iter); - //float __bang_sum(const float *src, unsigned int elem_count) + temp_b = 0; - //reduce add rC + for(int j = 0; j < iter; j++) { temp_b += rC[i*calc_length+j]; @@ -1119,11 +562,11 @@ __mlu_entry__ void mlu_strsm_rectile_batch_kernel( factor = 1.0 / rA[m-1]; for(int i = 0; i < span; i++) { - //void __bang_mul(float *dst, const float *src0, const float *src1, unsigned int elem_count) + __bang_mul(rC+i*calc_length,rA,rB+i*calc_length,m-1); temp_b = 0; - //reduce add rC + for(int j = 0; j < m-1; j++) { temp_b += rC[i*calc_length+j]; @@ -1134,7 +577,6 @@ __mlu_entry__ void mlu_strsm_rectile_batch_kernel( } __sync(); - __memcpy(OFFSET_B_ROW(dB,start,0),rB,calc_length*sizeof(float),NRAM2GDRAM,ldb * sizeof(float), calc_length * sizeof(float), span - 1); __sync(); @@ -1157,11 +599,8 @@ __mlu_entry__ void mlu_strsm_rectile_kernel( float* orignA = dA; float* orignB = dB; dA = orignA + batch_id * stride; - dB = orignB + batch_id * stride; + dB = orignB + batch_id * stride; - // int remain = n - id * POTF_NB; - // bool if_execute = remain > 0; - // int span = (remain > POTF_NB || remain <= 0) ? POTF_NB : remain; int span = n / 4; int start = id * span; if(id == 3) @@ -1192,12 +631,7 @@ __mlu_entry__ void mlu_strsm_rectile_kernel( } if(if_execute) __memcpy(rBp,OFFSET_B_ROW(dB,start,0),sizeof(float),LDRAM2NRAM,sizeof(float), ldb * sizeof(float), span - 1); - __sync_cluster(); - // if(id == 3) - // { - // printf("sA[0]:%.3f\n",sA[0]); - // printf("dA[0]:%.3f\n",dA[0]); - // } + __sync_cluster(); if(trans) { @@ -1214,8 +648,8 @@ __mlu_entry__ void mlu_strsm_rectile_kernel( factor = 1.0 / rA[0]; for(int i = 0; i < span; i++) { - //void __bang_mul(float *dst, const float *src0, const float *src1, unsigned int elem_count) - //float __bang_sum(const float *src, unsigned int elem_count) + + rB[i*calc_length] *= factor; } @@ -1236,23 +670,19 @@ __mlu_entry__ void mlu_strsm_rectile_kernel( factor = 1.0 / rA[iter]; for(int i = 0; i < span; i++) { - //void __bang_mul(float *dst, const float *src0, const float *src1, unsigned int elem_count) + __bang_mul(rC+i*calc_length,rA,rB+i*calc_length,iter); - //float __bang_sum(const float *src, unsigned int elem_count) + temp_b = 0; sum = 0.0; c = 0.0; - t = 0.0; - //reduce add rC - // for(int j = 0; j < iter; j++) - // { - // temp_b += rC[i*calc_length+j]; - // } + t = 0.0; + for(int j = 0; j < iter; j++) { - temp_b = rC[i*calc_length+j] - c; //So far, so good: c is zero. 
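// Compensated (Kahan) summation: `c` carries the rounding error of the previous
// addition, so the long single-precision dot products of this forward
// substitution drift far less than a plain running sum would:
//   y = value - c;  t = sum + y;  c = (t - sum) - y;  sum = t;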
- t = sum + temp_b; //Alas, sum is big, y small, so low-order digits of y are lost. - c = (t - sum) - temp_b; //(t - sum) recovers the high-order part of y; subtracting y recovers -(low part of y) + temp_b = rC[i*calc_length+j] - c; + t = sum + temp_b; + c = (t - sum) - temp_b; sum = t; } temp_b = sum; @@ -1270,23 +700,23 @@ __mlu_entry__ void mlu_strsm_rectile_kernel( factor = 1.0 / rA[m-1]; for(int i = 0; i < span; i++) { - //void __bang_mul(float *dst, const float *src0, const float *src1, unsigned int elem_count) + __bang_mul(rC+i*calc_length,rA,rB+i*calc_length,m-1); sum = 0.0; c = 0.0; t = 0.0; temp_b = 0; - //reduce add rC - // for(int j = 0; j < m-1; j++) - // { - // temp_b += rC[i*calc_length+j]; - // } + + + + + for(int j = 0; j < m-1; j++) { - temp_b = rC[i*calc_length+j] - c; //So far, so good: c is zero. - t = sum + temp_b; //Alas, sum is big, y small, so low-order digits of y are lost. - c = (t - sum) - temp_b; //(t - sum) recovers the high-order part of y; subtracting y recovers -(low part of y) + temp_b = rC[i*calc_length+j] - c; + t = sum + temp_b; + c = (t - sum) - temp_b; sum = t; } temp_b = sum; @@ -1306,166 +736,6 @@ __mlu_entry__ void mlu_strsm_rectile_kernel( } -// __mlu_entry__ void mlu_strsm_rectile_kernel( -// int m,int n, bool trans, -// float *dA, int32_t lda, -// float *dB, int32_t ldb) -// { -// int id = taskId; -// int remain = n - id * POTF_NB; -// bool if_execute = remain > 0; -// int span = (remain > POTF_NB || remain <= 0) ? POTF_NB : remain; -// __mlu_shared__ float sA[REC_NB]; -// __nram__ float rB[POTF_NB * REC_NB]; -// __nram__ float rC[POTF_NB * REC_NB]; -// __nram__ float rBp[POTF_NB]; -// __nram__ float rA[REC_NB]; -// __memset_nram(rB,POTF_NB*REC_NB,(float)ZERO); -// __sramset(sA,REC_NB*REC_NB,0); - - -// float temp_b = 0, factor = 0; - -// if(id == 0) -// { -// __memcpy_async(sA,dA,sizeof(float),LDRAM2SRAM); -// } -// __memcpy(rBp,OFFSET_B_ROW(dB,id*POTF_NB,0),sizeof(float),LDRAM2NRAM,sizeof(float), ldb * sizeof(float), span - 1); -// __sync_cluster(); -// // if(id == 3) -// // { -// // printf("sA[0]:%.3f\n",sA[0]); -// // printf("dA[0]:%.3f\n",dA[0]); -// // } - -// if(trans) -// { -// __memcpy_async(rA,sA,(1)*sizeof(float),SRAM2NRAM); - -// __memcpy_async(rB,rBp,sizeof(float),NRAM2NRAM,REC_NB * sizeof(float), sizeof(float), span - 1); -// __sync_cluster(); -// // if(id == 0) -// // printf("rA[0]:%.3f\n",rA[0]); -// //print rB -// // printf("id :%d\n",id); -// // printf("before calculation\n"); -// // for(int i = 0; i < span; i++) -// // { -// // printf("rB[%d]:%.3f\n",i,rB[i*REC_NB]); -// // } -// if(id == 0) -// { -// __memcpy_async(sA,OFFSET_ROW(dA,1,0),2*sizeof(float),LDRAM2SRAM); -// } -// __memcpy_async(rBp,OFFSET_B_ROW(dB,id*POTF_NB,1),sizeof(float),LDRAM2NRAM,sizeof(float), ldb * sizeof(float), span - 1); -// factor = 1.0 / rA[0]; -// for(int i = 0; i < span; i++) -// { -// //void __bang_mul(float *dst, const float *src0, const float *src1, unsigned int elem_count) -// //float __bang_sum(const float *src, unsigned int elem_count) -// rB[i*REC_NB] *= factor; -// } -// //print rB after first calculation -// // printf("id :%d\n",id); -// // printf("after first calculation\n"); -// // for(int i = 0; i < span; i++) -// // { -// // printf("rB[%d]:%.3f\n",i,rB[i*REC_NB]); -// // } - -// __sync_cluster(); - -// for(int iter = 1; iter < m - 1; iter++) -// { -// __memcpy_async(rA,sA,(iter+1)*sizeof(float),SRAM2NRAM); - -// __memcpy_async(rB+iter,rBp,sizeof(float),NRAM2NRAM,REC_NB * sizeof(float), sizeof(float), span - 1); -// __sync_cluster(); -// 
//printf rA -// // printf("id :%d\n",id); -// // printf("rA:\n"); -// // for(int i = 0; i < iter+1; i++) -// // { -// // printf("%.3f ",rA[i]); -// // } -// //printf rB -// // printf("\n"); -// // printf("rB:\n"); -// // for(int i = 0; i < span; i++) -// // { -// // for(int j = 0; j < iter+1; j++) -// // { -// // printf("%.3f ",rB[i*REC_NB+j]); -// // } -// // printf("\n"); -// // } -// if(id == 0) -// { -// __memcpy_async(sA,OFFSET_ROW(dA,iter+1,0),(iter+2)*sizeof(float),LDRAM2SRAM); -// } -// __memcpy_async(rBp,OFFSET_B_ROW(dB,id*POTF_NB,iter+1),sizeof(float),LDRAM2NRAM,sizeof(float), ldb * sizeof(float), span - 1); -// factor = 1.0 / rA[iter]; -// for(int i = 0; i < span; i++) -// { -// //void __bang_mul(float *dst, const float *src0, const float *src1, unsigned int elem_count) -// __bang_mul(rC+i*REC_NB,rA,rB+i*REC_NB,iter); -// //float __bang_sum(const float *src, unsigned int elem_count) -// temp_b = 0; -// //reduce add rC -// for(int j = 0; j < iter; j++) -// { -// temp_b += rC[i*REC_NB+j]; -// } -// temp_b = rB[i*REC_NB+iter] - temp_b; -// rB[i*REC_NB+iter] = temp_b * factor; -// } - -// __sync_cluster(); -// } - -// __memcpy_async(rA,sA,(m)*sizeof(float),SRAM2NRAM); - -// __memcpy_async(rB+m-1,rBp,sizeof(float),NRAM2NRAM,REC_NB * sizeof(float), sizeof(float), span - 1); -// __sync_cluster(); -// factor = 1.0 / rA[m-1]; -// for(int i = 0; i < span; i++) -// { -// //void __bang_mul(float *dst, const float *src0, const float *src1, unsigned int elem_count) -// __bang_mul(rC+i*REC_NB,rA,rB+i*REC_NB,m-1); - -// temp_b = 0; -// //reduce add rC -// for(int j = 0; j < m-1; j++) -// { -// temp_b += rC[i*REC_NB+j]; -// } -// temp_b = rB[i*REC_NB+m-1] - temp_b; - -// rB[i*REC_NB+m-1] = temp_b * factor; -// } -// __sync_cluster(); - -// // printf("id:%d\n",id); -// //print rB after complete calculation -// // printf("after complete calculation\n"); -// // for(int i = 0; i < span; i++) -// // { -// // for(int j = 0; j < REC_NB; j++) -// // { -// // printf("%.3f ",rB[i*REC_NB+j]); -// // } -// // printf("\n"); -// // } - -// if(if_execute) -// { -// __memcpy(OFFSET_B_ROW(dB,id*POTF_NB,0),rB,REC_NB*sizeof(float),NRAM2LDRAM,ldb * sizeof(float), REC_NB * sizeof(float), span - 1); -// } -// __sync_cluster(); - -// } - -// } mluOpStatus_t strsm_rectile(int batch, int stride, bool upper, bool trans, int m, int n, float *d_a, int lda, float *d_b, int lddb, cnrtQueue_t queue) { @@ -1516,146 +786,11 @@ mluOpStatus_t strsm_rectile(int batch, int stride, bool upper, bool trans, int m return MLUOP_STATUS_SUCCESS; } - - -// d_c = d_c - src -// __mlu_global__ -// void add_c(float *d_c, float* src,int ldc, int ldsrc, int m, int n) -// { -// int id = taskId; -// int span = m/4; - -// float* start_c = d_c + id * span * ldc; -// float* start_src = src + id * span * ldsrc; -// float* temp_c = start_c, *temp_src =start_src; -// int32_t align_num = NFU_ALIGN_SIZE / sizeof(float); -// int32_t data_nram_num = MAX_NRAM_SIZE / sizeof(float) / 2 / align_num * align_num; -// __nram__ uint8_t nram_buffer[MAX_NRAM_SIZE]; - -// if (id == 3) -// { -// span = m - 3 * span; -// } - // float *rC = (float *)nram_buffer; - // float *rsrc = (float *)nram_buffer + data_nram_num; -// int k = n/data_nram_num; -// int remain = n - k * data_nram_num; -// for(int i = 0; i < span; i++) -// { -// temp_c = start_c + i * ldc; -// temp_src = start_src + i * ldsrc; - // for(int i = 0; i < k; i++) - // { - // __memcpy(rC,temp_c,data_nram_num*sizeof(float),GDRAM2NRAM); - // __memcpy(rsrc,temp_src,data_nram_num*sizeof(float),GDRAM2NRAM); - // 
temp_c += data_nram_num; - // temp_src += data_nram_num; - // __sync(); - // __bang_add(rC, rC, rsrc, data_nram_num); - // __memcpy(temp_c - data_nram_num,rC,data_nram_num*sizeof(float),NRAM2GDRAM); - // __sync_cluster(); - // } - // if(remain > 0) - // { - // __memcpy(rC,temp_c,remain*sizeof(float),GDRAM2NRAM); - // __memcpy(rsrc,temp_src,remain*sizeof(float),GDRAM2NRAM); - // __sync(); - // __bang_add(rC, rC, rsrc, remain); - // __memcpy(temp_c,rC,remain*sizeof(float),NRAM2GDRAM); - // __sync_cluster(); - // } - - -// } - -// } - - -// __mlu_global__ -// void add_c(float beta, float *d_c, float* src,int ldc, int ldsrc, int m, int n) -// { - - -// __mlu_shared__ uint8_t sram_buffer[MAX_SRAM_SIZE]; -// if (beta == 0.0f) -// { -// if(taskId == 0) -// { -// __memcpy(sram_buffer,src,n*sizeof(float),GDRAM2SRAM,n*sizeof(float),ldsrc*sizeof(float),m-1); - - -// } -// __sync_cluster(); -// if(taskId == 0) -// { -// __memcpy(d_c,sram_buffer,n*sizeof(float),SRAM2LDRAM,ldc*sizeof(float),n*sizeof(float),m-1); -// } -// __sync_cluster(); -// return; -// } - - -// if (taskId == 0) { -// __memcpy(sram_buffer,d_c,n*sizeof(float),GDRAM2SRAM,n*sizeof(float),ldc*sizeof(float),m-1); -// } - -// __sync_cluster(); - - -// int32_t data_num = m*n; -// int32_t data_per_core = data_num / taskDim; -// int32_t data_last_core = data_per_core + data_num % taskDim; -// const float *a_offset = src + taskId * data_per_core; -// const float *b_offset = (float*)sram_buffer + taskId * data_per_core; -// float *output_offset = (float*)sram_buffer + taskId * data_per_core; - -// if (taskId == taskDim - 1) { -// data_per_core = data_last_core; -// } - -// int32_t align_num = NFU_ALIGN_SIZE / sizeof(float); -// int32_t data_nram_num = -// MAX_NRAM_SIZE / sizeof(float) / 2 / align_num * align_num; -// float *a_nram = (float *)nram_buffer; -// float *b_nram = (float *)a_nram + data_nram_num; -// int32_t loop_num = data_per_core / data_nram_num; -// int32_t rem_nram_num = data_per_core % data_nram_num; - -// for (int32_t i = 0; i < loop_num; i++) { -// __memcpy(a_nram, a_offset + i * data_nram_num, -// data_nram_num * sizeof(float), GDRAM2NRAM); -// __memcpy(b_nram, b_offset + i * data_nram_num, -// data_nram_num * sizeof(float), SRAM2NRAM); -// __bang_add(a_nram, a_nram, b_nram, data_nram_num); -// __memcpy(output_offset + i * data_nram_num, a_nram, -// data_nram_num * sizeof(float), NRAM2SRAM); -// } -// if (rem_nram_num != 0) { -// int32_t rem_align_num = -// (rem_nram_num + align_num - 1) / align_num * align_num; -// __memcpy(a_nram, a_offset + loop_num * data_nram_num, -// rem_nram_num * sizeof(float), GDRAM2NRAM); -// __memcpy(b_nram, b_offset + loop_num * data_nram_num, -// rem_nram_num * sizeof(float), SRAM2NRAM); -// __bang_add(a_nram, a_nram, b_nram, rem_align_num); -// __memcpy(output_offset + loop_num * data_nram_num, a_nram, -// rem_nram_num * sizeof(float), NRAM2SRAM); -// } -// __sync_cluster(); - -// if (taskId == 0) { -// __memcpy(d_c,sram_buffer,n*sizeof(float),SRAM2LDRAM,ldc*sizeof(float),n*sizeof(float),m-1); -// } - -// __sync_cluster(); - -// } - __mlu_global__ void add_c_batch(int batch, int stride, float beta, float *d_c, float* src,int ldc, int ldsrc, int m, int n) { -// __nram__ uint8_t nram_buffer[MAX_NRAM_SIZE]; + int id = taskId; int batch_id = id; if(batch_id >= batch) @@ -1667,11 +802,7 @@ void add_c_batch(int batch, int stride, float beta, float *d_c, float* src,int l if (beta == 0.0f) - { - - // __memcpy(nram_buffer,src,n*sizeof(float),GDRAM2NRAM,n*sizeof(float),ldsrc*sizeof(float),m-1); - - // 
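// beta == 0 fast path of the matmul epilogue: there is nothing to accumulate,
// so the product in `src` is copied straight into d_c with a pitched
// GDRAM-to-GDRAM memcpy (row length n, destination pitch ldc, source pitch
// ldsrc), one batch per task.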
__memcpy(d_c,nram_buffer,n*sizeof(float),NRAM2GDRAM,ldc*sizeof(float),n*sizeof(float),m-1); + { __memcpy(d_c,src,n*sizeof(float),GDRAM2GDRAM,ldc*sizeof(float),ldsrc*sizeof(float),m-1); return; } @@ -1704,7 +835,7 @@ __mlu_global__ void add_c(int batch, int stride, float beta, float *d_c, float* src,int ldc, int ldsrc, int m, int n) { -// __nram__ uint8_t nram_buffer[MAX_NRAM_SIZE]; + int id = taskId; int ipu_per_cluster = 4; int batch_id = id / ipu_per_cluster; @@ -1715,28 +846,6 @@ void add_c(int batch, int stride, float beta, float *d_c, float* src,int ldc, in float* orignSrc = src; d_c = orignC + batch_id * stride; src = orignSrc + batch_id * m*n; - - // if(batch_id == 1 && id== 0) - // { - // printf("add_c d_c:\n"); - // for(int i = 0; i < m; i++) - // { - // for(int j = 0; j < n; j++) - // { - // printf("%.3f ",d_c[i*ldc+j]); - // } - // printf("\n"); - // } - // printf("add_c src:\n"); - // for(int i = 0; i < m; i++) - // { - // for(int j = 0; j < n; j++) - // { - // printf("%.3f ",src[i*n+j]); - // } - // printf("\n"); - // } - // } __mlu_shared__ uint8_t sram_buffer[MAX_SRAM_SIZE]; if (beta == 0.0f) @@ -1777,8 +886,8 @@ void add_c(int batch, int stride, float beta, float *d_c, float* src,int ldc, in } int32_t align_num = NFU_ALIGN_SIZE / sizeof(float); -// int32_t data_nram_num = -// MAX_NRAM_SIZE / sizeof(float) / 2 / align_num * align_num; + + int32_t data_nram_num = MAX_NRAM_SIZE / sizeof(float) / 2 / align_num * align_num; float *a_nram = (float *)nram_buffer; @@ -1810,15 +919,7 @@ void add_c(int batch, int stride, float beta, float *d_c, float* src,int ldc, in if (id == 0) { __memcpy(d_c,sram_buffer,n*sizeof(float),SRAM2GDRAM,ldc*sizeof(float),n*sizeof(float),m-1); - // printf("d_c after add:\n"); - // for(int i = 0; i < m; i++) - // { - // for(int j = 0; j < n; j++) - // { - // printf("%.3f ",d_c[i*ldc+j]); - // } - // printf("\n"); - // } + } __sync_cluster(); @@ -1832,24 +933,19 @@ mluOpStatus_t sgemm(int batch, bool trans_a, bool trans_b, int m, int n, int k, return MLUOP_STATUS_SUCCESS; int matmul_is_transA = trans_a; int matmul_is_transB = trans_b; - // float matmul_alpha = alpha; - // float matmul_beta = beta; + + int matmul_requested_algo = 1; int matmul_recieved_algo = 0; size_t tempSize_matmulExtra = 0; int matmul_computetype = MLUOP_DTYPE_FLOAT; float *workspace; int matmul_use_beta = beta == 0.0f ? 
0 : 1; - // lda = lda * sizeof(float); + cnrtQueue_t queue; mluOpGetQueue(handle,&queue); - - - - - - + mluOpTensorDescriptor_t matmul_a_desc, matmul_b_desc, matmul_c_desc; cnnlMatMulDescriptor_t matmul_desc; @@ -1879,12 +975,6 @@ mluOpStatus_t sgemm(int batch, bool trans_a, bool trans_b, int m, int n, int k, CALL_CNNL(cnnlSetMatMulDescAttr(matmul_desc, CNNL_MATMUL_USE_STRIDE, &lda, sizeof(int32_t))); - // int32_t matmul_a_shape[2] = {m, k}; - // int32_t matmul_b_shape[2] = {n, k}; - // int32_t matmul_a_shape[3] = {batch, 16, lda}; - // int32_t matmul_b_shape[3] = {batch, 16, ldb}; - // int32_t matmul_c_shape[3] = {batch, 16, n}; - int32_t matmul_a_shape[2] = {batch, stride_a}; int32_t matmul_b_shape[2] = {batch, stride_b}; int32_t matmul_c_shape[2] = {batch, m*n}; @@ -1899,24 +989,6 @@ mluOpStatus_t sgemm(int batch, bool trans_a, bool trans_b, int m, int n, int k, matmul_c_desc, MLUOP_LAYOUT_ARRAY, MLUOP_DTYPE_FLOAT, 2, matmul_c_shape)); - - // matmul_a_desc->strides[0] = lda; - // matmul_a_desc->strides[1] = 1; - - - // matmul_b_desc->strides[0] = ldb; - // matmul_b_desc->strides[1] = 1; - // matmul_c_desc->strides[0] = ldc; - // matmul_c_desc->strides[1] = 1; - // matmul_a_desc->dims[0] = m; - // matmul_a_desc->dims[1] = k; - // matmul_b_desc->dims[0] = n; - // matmul_b_desc->dims[1] = k; - // matmul_c_desc->dims[0] = m; - // matmul_c_desc->dims[1] = n; - - - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_a_desc, cnnl_a_desc); DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_b_desc, cnnl_b_desc); @@ -1933,84 +1005,12 @@ mluOpStatus_t sgemm(int batch, bool trans_a, bool trans_b, int m, int n, int k, CALL_CNNL(cnnlGetMatMulHeuristicResult(heuristic_result, matmul_algo, &tempSize_matmulExtra)); - - - // printf("m = %d, n = %d, k = %d\n",m,n,k); - // printf("alpha:%.3f, beta:%.3f\n",alpha,beta); - // float* h_a = (float*)malloc(m*k*sizeof(float)); - // float* h_b = (float*)malloc(n*k*sizeof(float)); - // float* h_c = (float*)malloc(m*n*sizeof(float)); - // cnrtMemcpy(h_a, d_a, sizeof(float)*m*k, CNRT_MEM_TRANS_DIR_DEV2HOST); - // cnrtMemcpyAsync(h_b, d_b, sizeof(float)*n*k, queue, CNRT_MEM_TRANS_DIR_DEV2HOST); - // cnrtMemcpyAsync(h_c, d_c, sizeof(float)*m*n, queue, CNRT_MEM_TRANS_DIR_DEV2HOST); - // cnrtQueueSync(queue); - - - - // printf("before matmul, a:\n"); - - // for(int i = 0; i < m;i++) - // { - // for(int j = 0; j >>(batch, stride_c, beta,d_c,workspace,ldc,n,m,n)); } - - - } - - - - - // cnnlMatMul_v2( - // cnnl_handle, matmul_desc, matmul_algo, &matmul_alpha, cnnl_a_desc, - // d_a, cnnl_b_desc, d_b, &matmul_beta, - // cnnl_c_desc, d_c, workspace, - // tempSize_matmulExtra, cnnl_d_desc, d_c); - - // cnrtMemcpy(h_a, d_a, sizeof(float)*m*k, CNRT_MEM_TRANS_DIR_DEV2HOST); - // cnrtMemcpy(h_b, d_b, sizeof(float)*n*k, CNRT_MEM_TRANS_DIR_DEV2HOST); - // cnrtMemcpy(h_c, d_c, sizeof(float)*m*n, CNRT_MEM_TRANS_DIR_DEV2HOST); - // cnrtQueueSync(queue); - - // printf("after matmul, a:\n"); - - // for(int i = 0; i < m;i++) - // { - // for(int j = 0; j >>(batch, d_a+m1*lda+m1,lda,stride, workspace2,m,m*m,m2)); } - - sgemm(batch, false,false,m2,m1,m1,1.0f,0.0f,d_a+m1*lda,lda,stride,workspace1,m,m*m,workspace1+m1*m,m,m*m,handle); sgemm(batch, false,false,m2,m2,m1,-1.0f,0.0f,workspace2,m,m*m,workspace1+m1*m,m,m*m,workspace1+m1*m,m,m*m,handle); cnrtQueueSync(queue); - // cnrtMemcpy(h_i, workspace, m*m*sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); - // print - // printf("batch 0 whole inverse:\n"); - // for(int i = 0; i < m; i++) - // { 
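The panel solve here appears to avoid a per-element substitution kernel for large tiles: the triangular diagonal block is inverted explicitly (the two half-blocks by a small batched kernel, their coupling block by the two sgemm calls above, using inv([L11 0; L21 L22]) = [inv(L11) 0; -inv(L22)*L21*inv(L11) inv(L22)]), and the panel is then updated with one strided batched matmul. A naive host-side sketch of the same idea follows; lower_tri_inverse and trsm_right_lower_transpose are illustrative names, not functions from this patch.

#include <cstddef>
#include <vector>

// Illustrative only: solve X * L^T = B (the Cholesky panel update) by forming
// inv(L) explicitly and multiplying, which the code above does blockwise.
// Matrices are row-major; L is m x m lower triangular, B is n x m.
static std::vector<float> lower_tri_inverse(const std::vector<float> &l, int m) {
  std::vector<float> inv(static_cast<size_t>(m) * m, 0.0f);
  for (int c = 0; c < m; ++c) {
    inv[c * m + c] = 1.0f / l[c * m + c];
    for (int r = c + 1; r < m; ++r) {
      float s = 0.0f;
      for (int k = c; k < r; ++k) s += l[r * m + k] * inv[k * m + c];
      inv[r * m + c] = -s / l[r * m + r];  // column c of inv(L) by forward substitution
    }
  }
  return inv;
}

static void trsm_right_lower_transpose(std::vector<float> &b,
                                       const std::vector<float> &l,
                                       int n, int m) {
  std::vector<float> inv = lower_tri_inverse(l, m);
  std::vector<float> x(static_cast<size_t>(n) * m, 0.0f);
  for (int r = 0; r < n; ++r)
    for (int c = 0; c < m; ++c)
      for (int k = 0; k < m; ++k)
        x[r * m + c] += b[r * m + k] * inv[c * m + k];  // X = B * inv(L)^T
  b.swap(x);
}

Forming the inverse turns the panel solve into one large matmul, which tends to suit matmul-oriented hardware better than an element-by-element substitution loop.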
- // for(int j = 0; j < m; j++) - // { - // printf("%.3f ",h_i[i*m+j]); - // } - // printf("\n"); - // } - - // cnrtMemcpy(h_i, workspace+m*m, m*m*sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); - // // print - // printf("batch 1 whole inverse:\n"); - // for(int i = 0; i < m; i++) - // { - // for(int j = 0; j < m; j++) - // { - // printf("%.3f ",h_i[i*m+j]); - // } - // printf("\n"); - // } - - - - - - // cnrtMemcpy(h_i, work_space, m*m*sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); - // //print h_i - // printf("h_i:\n"); - // for(int i = 0; i < m; i++) - // { - // for(int j = 0; j < m; j++) - // { - // printf("%.3f ",h_i[i*m+j]); - // } - // printf("\n"); - // } - // float *h_i; - // h_i = (float*)malloc(m*n*sizeof(float)); - // cnrtQueueSync(queue); - - // for(int i = 0; i < n; i++) - // { - // cnrtMemcpy(h_i+i*m, d_b+i*ldb, m*sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); - - // } - // cnrtQueueSync(queue); - // printf("before strsm, b:\n"); - // for(int i = 0; i < n; i++) - // { - // for(int l = 0; l < m; l++) - // { - // printf("%.3f ",h_i[i*m+l]); - // } - // printf("\n"); - // } - //cnnlStrideBatchMatMul(cnnlHandle_t handle, const bool is_transa, const bool is_transb, const int m, const int n, const int k, const int batch_size, const float alpha, - //const cnnlTensorDescriptor_t a_desc, const void *a, const int lda, const int64_t stride_a, const cnnlTensorDescriptor_t b_desc, const void *b, const int ldb, const int64_t stride_b, const float beta, constcnnlTensorDescriptor_t c_desc, void *c, const int ldc, const int64_t stride_c) - // cnnlStrideBatchMatMul(cnnl_handle, false, true, m, n, m, 1, 1.0, cnnl_a_desc, work_space, m, m*NB, cnnl_b_desc, d_b, ldb, ldb*n, 0.0f, cnnl_b_desc, d_b, ldb, ldb*n); cnnlStrideBatchMatMul(cnnl_handle, false, true, n,m, m, batch, 1.0, cnnl_b_desc, d_b, ldb, stride, cnnl_a_desc, workspace, m, m*m, 0.0f, cnnl_b_desc, d_b, ldb, stride); - // cnrtQueueSync(queue); - - // for(int i = 0; i < n; i++) - // { - // cnrtMemcpy(h_i+i*m, d_b+i*ldb, m*sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); - - // } - // cnrtQueueSync(queue); - // printf("after strsm, b:\n"); - // for(int i = 0; i < n; i++) - // { - // for(int l = 0; l < m; l++) - // { - // printf("%.3f ",h_i[i*m+l]); - // } - // printf("\n"); - // } - return MLUOP_STATUS_SUCCESS; } @@ -2616,50 +1401,29 @@ mluOpStatus_t mlu_spotrf_rectile(int batch, int stride, bool trans, bool uplo, i if(n <=recnb) { - // printf("n:%d, recnb:%d, mlu_spotf2_lpin\n",n,recnb); + mlu_spotf2_lpin(batch, stride, trans, uplo,n,lda,d_A,gbstep,queue); } else { int n1 = n/2; int n2 = n-n1; - // printf("n1:%d, n2:%d recnb:%d,mlu_spotrf_rectile1\n",n1,n2,recnb); + mlu_spotrf_rectile(batch,stride,trans,uplo,n1,recnb,OFFSET_ROW(d_A,0,0),lda,gbstep, handle); - // printf("n1:%d, n2:%d recnb:%d,strsm_rectile\n",n1,n2,recnb); - // strsm(batch, stride, uplo,trans,n1, n2, OFFSET_ROW(d_A,0,0), lda,OFFSET_ROW(d_A,n1,0), lda, handle); + + strsm_rectile(batch, stride, uplo,trans,n1,n2,OFFSET_ROW(d_A,0,0),lda,OFFSET_ROW(d_A,n1,0),lda,queue); - // printf("n1:%d, n2:%d recnb:%d,ssyrk\n",n1,n2,recnb); + ssyrk(batch,stride,uplo,trans,n2,n1,d_A+n1*lda,lda,OFFSET_ROW(d_A,n1,n1),lda,handle); - // printf("n1:%d, n2:%d recnb:%d,mlu_spotrf_rectile2\n",n1,n2,recnb); - mlu_spotrf_rectile(batch,stride,trans,uplo,n2,recnb,OFFSET_ROW(d_A,n1,n1),lda,gbstep+n1,handle); - - //printf d_A+n1*lda+n1 - // printf("after calculate, dA+n1*lda+n1:\n"); - // for(int i = 0; i < n2; i++) - // { - // for(int j = 0; j < n2; j++) - // { - // printf("%.3f ",*(d_A+n1*lda+n1+i*lda+j)); - // } - 
// printf("\n"); - // } - // //printf work_space + n1 * NB+n1 - // printf("after calculate, work_space + n1 * NB+n1:\n"); - // for(int i = 0; i < n2; i++) - // { - // for(int j = 0; j < n2; j++) - // { - // printf("%.3f ",*(work_space + n1 * NB+n1+i*NB+j)); - // } - // printf("\n"); - // } + mlu_spotrf_rectile(batch,stride,trans,uplo,n2,recnb,OFFSET_ROW(d_A,n1,n1),lda,gbstep+n1,handle); + } - // strsm(false,true,n, n, work_space, n,d_A, n, work_space, handle); + return MLUOP_STATUS_SUCCESS; } -// m * n + mluOpStatus_t transpose(int batch, int m, int n, float* d_input,float* d_output, mluOpHandle_t handle) { if(m==0) @@ -2702,7 +1466,7 @@ mluOpStatus_t transpose(int batch, int m, int n, float* d_input,float* d_output, CALL_CNNL(cnnlGetTransposeWorkspaceSize(cnnl_handle, cnnl_in_desc, cnnl_trans_desc, size)); - // printf("transpose1 need size: %zu\n",*size); + float *workspace = NULL; diff --git a/kernels/cholesky/complex_cholesky_union1.mlu b/kernels/cholesky/complex_cholesky_union1.mlu index 73062f98b..6da17bad2 100644 --- a/kernels/cholesky/complex_cholesky_union1.mlu +++ b/kernels/cholesky/complex_cholesky_union1.mlu @@ -3,89 +3,6 @@ #define COMPLEX_TYPE_SIZE ((2) * sizeof(float)) __nram__ uint8_t nram_buffer[MAX_NRAM_SIZE]; -// __mlu_func__ -// void small_cgemm(int m,int k, -// float* A0, const int lda, -// int width, float* sram_buffer, float* dst) -// { -// int id = taskId % 4; -// int span = CPOTF_NB; -// int finish = id * span; -// int remain = m - finish; -// bool if_execute = remain > 0; -// span = (remain > CPOTF_NB||remain <= 0) ? CPOTF_NB : remain; - - -// float* rC = dst + CPOTF_NB *CREC_NB*2; -// float* rA = rC + CPOTF_NB *CREC_NB*2; -// float* rp = rA + CPOTF_NB *CREC_NB*2; -// float* rB = rp + CPOTF_NB *CREC_NB*2; - -// float *sB = sram_buffer; - -// int total_length = k + width; -// int loop_width = CPOTF_NB; -// int b_height = std::min(width, CPOTF_NB); - - -// if(if_execute) -// { -// int prefetch_width = std::min(loop_width, total_length); -// __memcpy(rp,COMPLEX_OFFSET(A0,finish*lda),prefetch_width*COMPLEX_TYPE_SIZE,GDRAM2NRAM,CPOTF_NB*COMPLEX_TYPE_SIZE,lda*COMPLEX_TYPE_SIZE,span-1); -// } -// __memset_nram(rC,CPOTF_NB*CREC_NB*2,(float)ZERO); -// __sync_cluster(); -// if(id == 0) -// { -// __memcpy(sB,rp,CPOTF_NB*CPOTF_NB*COMPLEX_TYPE_SIZE,NRAM2SRAM); -// } -// __sync_cluster(); -// float a1,a2,b1,b2; -// for(int iter = 0; iter < k; iter += loop_width) -// { -// __bang_move(rA,rp,CPOTF_NB * span*COMPLEX_TYPE_SIZE); -// __memcpy(rB,sB,CPOTF_NB*b_height*COMPLEX_TYPE_SIZE,SRAM2NRAM); -// __sync_cluster(); -// if(if_execute) -// { -// int prefetch_width = std::min(loop_width, total_length-iter-loop_width); -// __memcpy_async(rp,COMPLEX_OFFSET(A0,finish*lda+iter+loop_width),prefetch_width*COMPLEX_TYPE_SIZE,GDRAM2NRAM,CPOTF_NB*COMPLEX_TYPE_SIZE,lda*COMPLEX_TYPE_SIZE,span-1); -// } -// for(int i = 0; i < span; i++) -// { -// for(int j = 0; j < b_height; j++) -// { -// for(int h = 0; h < loop_width; h++) -// { -// a1 = rA[(i*CPOTF_NB+h)*2]; -// b1 = rA[(i*CPOTF_NB+h)*2+1]; -// a2 = rB[(j*CPOTF_NB+h)*2]; -// b2 = rB[(j*CPOTF_NB+h)*2+1]; -// rC[(i*CPOTF_NB+j)*2] += (a1*a2+b1*b2); -// rC[(i*CPOTF_NB+j)*2+1] += (a2*b1-a1*b2); -// } -// } -// } -// __sync_cluster(); -// if(id == 0) -// { -// __memcpy(sB,rp,CPOTF_NB*b_height*COMPLEX_TYPE_SIZE,NRAM2SRAM); -// } -// __sync_cluster(); -// } - -// __bang_sub(rp,rp,rC,CPOTF_NB * span*2); -// if(if_execute) -// { -// __memcpy(dst,rp,span*CPOTF_NB*COMPLEX_TYPE_SIZE,NRAM2NRAM); -// } -// if(id == 0) -// { -// 
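// Complex (Hermitian) path: the data is processed in split form, one plane of
// real parts followed by one plane of imaginary parts, and the update is
//   A(i,j) -= sum_k L(i,k) * conj(L(j,k)).
// With L(i,k) = a1 + b1*I and L(j,k) = a2 + b2*I (I the imaginary unit), a
// single term contributes
//   real: a1*a2 + b1*b2        imag: a2*b1 - a1*b2
// which is the accumulation pattern used by the small_cgemm* helpers in this file.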
__memcpy(sram_buffer,rp,span*CPOTF_NB*COMPLEX_TYPE_SIZE,NRAM2SRAM); -// } -// __sync_cluster(); -// } - __mlu_func__ void small_cgemm(int m,int k, float* rA0, float* iA0, const int lda, @@ -108,8 +25,8 @@ void small_cgemm(int m,int k, float* rB = ip + CPOTF_NB *CREC_NB; float* iB = rB + CPOTF_NB *CREC_NB; - float *srB = sram_buffer; //srB:shared_real_B - float *siB = srB + CPOTF_NB * CREC_NB; //siB:shared_imag_B + float *srB = sram_buffer; + float *siB = srB + CPOTF_NB * CREC_NB; float* rdst = dst; float* idst = rdst + span*CPOTF_NB; @@ -186,12 +103,14 @@ void small_cgemm(int m,int k, __sync_cluster(); } + + __mlu_func__ void small_cminout(int m, int width, float *dst, float *sram_buffer, int lda) { float factor; - int id = taskId; + int id = taskId % 4; int finish = id * CPOTF_NB; int remain = m - finish; bool if_execute = remain > 0; @@ -244,9 +163,9 @@ void small_cminout(int m, int width, -__mlu_func__ void cmplout(const int m, float *rA0, float *rA,float *iA0, float *iA, int lda, int localstep, int width) +__mlu_func__ void cmplout(int batch, const int m, float *rA0, float *rA,float *iA0, float *iA, int lda, int localstep, int width) { - int id = taskId; + int id = taskId%4; int finish = id * CPOTF_NB; int remain = m - finish; bool if_execute = remain > 0; @@ -257,19 +176,6 @@ __mlu_func__ void cmplout(const int m, float *rA0, float *rA,float *iA0, float * __sync_cluster(); - // if(id == 1) - // { - // printf("id:1, after gemm, before inout,dst:\n"); - // for(int i = 0; i < width; i++) - // { - // for(int j = 0; j < width; j++) - // { - // printf("%.3f,%.3f ",dst[(i*CPOTF_NB+j)*2],dst[(i*CPOTF_NB+j)*2+1]); - // } - // printf("\n"); - // } - // } - small_cminout(m, width, dst, (float*)sram_buffer, CPOTF_NB); __sync_cluster(); @@ -284,24 +190,6 @@ __mlu_func__ void cmplout(const int m, float *rA0, float *rA,float *iA0, float * __memcpy((rA+(i*lda)),(rdst+(i*CPOTF_NB)),(i+1)*sizeof(float),NRAM2LDRAM); __memcpy((iA+(i*lda)),(idst+(i*CPOTF_NB)),(i+1)*sizeof(float),NRAM2LDRAM); } - // printf("id:0, after gemm and inout,dst:\n"); - // for(int i = 0; i < width; i++) - // { - // for(int j = 0; j < width; j++) - // { - // printf("%8.3f ",rdst[(i*CPOTF_NB+j)]); - // } - // printf("\n"); - // } - // printf("\n"); - // for(int i = 0; i < width; i++) - // { - // for(int j = 0; j < width; j++) - // { - // printf("%8.3f ",idst[(i*CPOTF_NB+j)]); - // } - // printf("\n"); - // } } else if(if_execute) { @@ -309,39 +197,201 @@ __mlu_func__ void cmplout(const int m, float *rA0, float *rA,float *iA0, float * __memcpy((iA+(finish*lda)),idst,width*sizeof(float),NRAM2LDRAM,lda*sizeof(float),CPOTF_NB*sizeof(float),span-1); } __sync_cluster(); - // if(id == 0) - // { - // printf("end of cmplout:\n"); - // for (int j = 0; j < lda; j++) - // { - // for(int h = 0; h < lda; h++) - // { - // printf("%8.3f ",rA[j*lda+h]); - // } - // printf("\n"); - // } - // printf("\n"); - // for (int j = 0; j < lda; j++) - // { - // for(int h = 0; h < lda; h++) - // { - // printf("%8.3f ",iA[j*lda+h]); - // } - // printf("\n"); - // } - // } + +} + + + +__mlu_func__ +void small_cgemm_batch(int batch, int m,int k, + float* rA0, float* iA0, const int lda, + int width, float* r_dst,float* i_dst) +{ + int ldk = k; + int ldm = m; + + float* r_dst2 = i_dst + m * width; + float* i_dst2 = r_dst2 + m * width; + float* r_src1 = i_dst2 + m * width; + float* i_src1 = r_src1 + ldk * ldm; + float* r_src2 = i_src1 + ldk * ldm; + float* i_src2 = r_src2 + width * ldk; + + float* r_dA = rA0 + k; + float* i_dA = iA0 + k; + __memcpy_async(r_dst, r_dA, 
width*sizeof(float),GDRAM2NRAM,width*sizeof(float),lda*sizeof(float),m-1); + __memcpy_async(i_dst, i_dA, width*sizeof(float),GDRAM2NRAM,width*sizeof(float),lda*sizeof(float),m-1); + + if(k == 0) + { + __sync(); + + return; + } + + __memset_nram(r_src1,2*ldm*ldk,(float)ZERO); + + __memcpy_async(r_src1, rA0, k*sizeof(float),GDRAM2NRAM,ldk*sizeof(float),lda*sizeof(float),m-1); + __memcpy_async(i_src1, iA0, k*sizeof(float),GDRAM2NRAM,ldk*sizeof(float),lda*sizeof(float),m-1); + + __memset_nram(r_dst2,2*ldm*width,(float)ZERO); + + __sync(); + + __memcpy(r_src2, r_src1, ldk*width*sizeof(float),NRAM2NRAM); + + __memcpy(i_src2, i_src1, ldk*width*sizeof(float),NRAM2NRAM); + + + + + + float a1,a2,b1,b2; + for(int i = 0; i < m; i++) + { + for(int j = 0; j < width; j++) + { + for(int h = 0; h < k; h++) + { + a1 = r_src1[i*ldk+h]; + b1 = i_src1[i*ldk+h]; + a2 = r_src2[j*ldk+h]; + b2 = i_src2[j*ldk+h]; + r_dst2[i*width+j] += (a1*a2+b1*b2); + i_dst2[i*width+j] += (a2*b1-a1*b2); + } + } + } + + + __bang_sub(r_dst,r_dst,r_dst2,width * m); + __bang_sub(i_dst,i_dst,i_dst2,width * m); + + __sync(); +} + +__mlu_func__ +void small_cminout_batch(int m, int width, + float *r_dst, float* i_dst, int lda) +{ + float factor; + float* r_diag = r_dst; + float* i_diag = i_dst; + + float a1,a2,b1,b2; + + + + + for(int iter = 0; iter < width; iter++) + { + + + if (r_diag[iter*width+iter]<0) + { + printf("cccnm\n"); + printf("iter:%d,taskId:%d\n",iter,taskId); + } + factor = sqrt(r_diag[iter*width+iter]); + factor = 1.0/factor; + for(int i = 0; i < m; i++) + { + r_dst[i*width+iter] *= factor; + i_dst[i*width+iter] *= factor; + } + __sync(); + for(int i = iter+1; i < width; i++) + { + for(int j = 0; j < m; j++) + { + a1 = r_dst[(j*width+iter)]; + b1 = i_dst[(j*width+iter)]; + a2 = r_diag[(i*width+iter)]; + b2 = i_diag[(i*width+iter)]; + + r_dst[(j*width+i)] -= (a1*a2+b1*b2); + i_dst[(j*width+i)] -= (a2*b1-a1*b2); + } + } + __sync(); + } + __sync(); + + +} + +__mlu_func__ +void smlpout_batch(const int m, float *rA0, float* iA0, + float *rA, float* iA, int lda, const int localstep, int width) +{ + float* r_dst = (float*)nram_buffer; + float* i_dst = r_dst + m * width; + + small_cgemm_batch(1,m,localstep,rA0,iA0,lda,width,r_dst,i_dst); + + __sync(); + + small_cminout_batch(m,width,r_dst,i_dst,lda); + + __sync(); + + for(int i = 0;i < width; i++) + { + __memcpy((rA+(i*lda)),(r_dst+(i*width)),(i+1)*sizeof(float),NRAM2GDRAM); + __memcpy((iA+(i*lda)),(i_dst+(i*width)),(i+1)*sizeof(float),NRAM2GDRAM); + } + + if(m > width) + { + __memcpy(rA+(width*lda),r_dst+width*width,width*sizeof(float),NRAM2GDRAM,lda*sizeof(float),width*sizeof(float),m-width-1); + __memcpy(iA+(width*lda),i_dst+width*width,width*sizeof(float),NRAM2GDRAM,lda*sizeof(float),width*sizeof(float),m-width-1); + } + + __sync(); +} + +__mlu_global__ void cpotf_kernel(int batch, int stride, int m, float *drA, float *diA, int lda) +{ + int width = CPOTF_NB; + int span = width; + float* origin_rA, *origin_iA; + origin_rA = drA; + origin_iA = diA; + int id = taskId; + int batch_id = id / 4; + if(batch_id >= batch) + return; + drA = origin_rA + batch_id * stride; + diA = origin_iA + batch_id * stride; + for(int i = 0; i < m; i += width) + { + span = std::min(width, m - i); + cmplout(batch, m-i, (drA+i*lda), (drA+i*lda+i), (diA+i*lda), (diA+i*lda+i), lda, i, span); + } } -__mlu_global__ void cpotf_kernel(int m, float *drA, float *diA, int lda) +__mlu_global__ +void cpotf_batch_kernel(int batch, int stride, int m, float *r_dA, float* i_dA, int lda) { + int id = taskId; + int 
batch_id = id; + if(batch_id >= batch) + return; + float* r_orignA = r_dA; + float* i_orignA = i_dA; + r_dA = r_orignA + batch_id * stride; + i_dA = i_orignA + batch_id * stride; int width = CPOTF_NB; int span = width; + for(int i = 0; i < m; i += width) { span = std::min(width, m - i); - cmplout(m-i, (drA+i*lda), (drA+i*lda+i), (diA+i*lda), (diA+i*lda+i), lda, i, span); + smlpout_batch(m-i, r_dA+i*lda, i_dA+i*lda, r_dA+i*lda+i, i_dA+i*lda+i, lda, i, span); + } + } mluOpStatus_t mlu_cpotf_lpin(int batch, int stride, int n, int lda, float* drA, float* diA, cnrtQueue_t queue) @@ -350,11 +400,115 @@ mluOpStatus_t mlu_cpotf_lpin(int batch, int stride, int n, int lda, float* drA, cnrtFunctionType_t func_type = CNRT_FUNC_TYPE_UNION1; dim.y = 1; dim.z = 1; - dim.x = 4; - KERNEL_CHECK(cpotf_kernel<<>>(n, drA,diA, lda)); + if (batch < 8) + { + dim.x = 4*batch; + KERNEL_CHECK(cpotf_kernel<<>>(batch, stride, n, drA,diA, lda)); + } + else + { + func_type = CNRT_FUNC_TYPE_BLOCK; + dim.x = batch; + KERNEL_CHECK(cpotf_batch_kernel<<>>(batch, stride, n, drA,diA, lda)); + } + return MLUOP_STATUS_SUCCESS; } +__mlu_global__ +void add_c1(int batch, int stride, float beta, float *d_c, float* src,int ldc, int ldsrc, int m, int n) +{ + + int id = taskId; + int ipu_per_cluster = 4; + int batch_id = id / ipu_per_cluster; + if(batch_id >= batch) + return; + id = taskId % ipu_per_cluster; + float* orignC = d_c; + float* orignSrc = src; + d_c = orignC + batch_id * stride; + src = orignSrc + batch_id * m*n; + + + + __mlu_shared__ uint8_t sram_buffer[MAX_SRAM_SIZE]; + if (beta == 0.0f) + { + if(id == 0) + { + __memcpy(sram_buffer,src,n*sizeof(float),GDRAM2SRAM,n*sizeof(float),ldsrc*sizeof(float),m-1); + + } + __sync_cluster(); + if(id == 0) + { + __memcpy(d_c,sram_buffer,n*sizeof(float),SRAM2LDRAM,ldc*sizeof(float),n*sizeof(float),m-1); + } + __sync_cluster(); + return; + } + + float* a_sram = (float*)sram_buffer + 3* m * n; + + if (id == 0) { + __memcpy(sram_buffer,d_c,n*sizeof(float),GDRAM2SRAM,n*sizeof(float),ldc*sizeof(float),m-1); + __memcpy(a_sram,src,n*m*sizeof(float),GDRAM2SRAM); + } + + __sync_cluster(); + + + int32_t data_num = m*n; + int32_t data_per_core = data_num / ipu_per_cluster; + int32_t data_last_core = data_per_core + data_num % ipu_per_cluster; + const float *a_offset = a_sram + id * data_per_core; + const float *b_offset = (float*)sram_buffer + id * data_per_core; + float *output_offset = (float*)sram_buffer + id * data_per_core; + + if (id == ipu_per_cluster - 1) { + data_per_core = data_last_core; + } + + int32_t align_num = NFU_ALIGN_SIZE / sizeof(float); + + int32_t data_nram_num = + MAX_NRAM_SIZE / sizeof(float) / 2 / align_num * align_num; + float *a_nram = (float *)nram_buffer; + float *b_nram = (float *)a_nram + data_nram_num; + int32_t loop_num = data_per_core / data_nram_num; + int32_t rem_nram_num = data_per_core % data_nram_num; + + for (int32_t i = 0; i < loop_num; i++) { + __memcpy(a_nram, a_offset + i * data_nram_num, + data_nram_num * sizeof(float), SRAM2NRAM); + __memcpy(b_nram, b_offset + i * data_nram_num, + data_nram_num * sizeof(float), SRAM2NRAM); + __bang_add(a_nram, a_nram, b_nram, data_nram_num); + __memcpy(output_offset + i * data_nram_num, a_nram, + data_nram_num * sizeof(float), NRAM2SRAM); + } + if (rem_nram_num != 0) { + int32_t rem_align_num = + (rem_nram_num + align_num - 1) / align_num * align_num; + __memcpy(a_nram, a_offset + loop_num * data_nram_num, + rem_nram_num * sizeof(float), SRAM2NRAM); + __memcpy(b_nram, b_offset + loop_num * data_nram_num, + 
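
The add_c1 kernel added in this hunk stages an m x n tile through SRAM and NRAM, but the result it produces reduces to a simple elementwise rule. The sketch below is my reading of its semantics (add_c_ref is a hypothetical reference helper, not in the patch): with beta == 0 it behaves as a strided copy, and with any other beta it accumulates src into d_c; note that beta only selects the path here, it is never used as a scale factor.

    // Reference semantics of add_c1 for one m x n tile; the two operands may
    // have different leading dimensions (ldc vs ldsrc).
    void add_c_ref(float beta, float* c, const float* src,
                   int ldc, int ldsrc, int m, int n) {
      for (int i = 0; i < m; ++i) {
        for (int j = 0; j < n; ++j) {
          if (beta == 0.0f)
            c[i * ldc + j] = src[i * ldsrc + j];                   // plain strided copy
          else
            c[i * ldc + j] = c[i * ldc + j] + src[i * ldsrc + j];  // accumulate
        }
      }
    }
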
rem_nram_num * sizeof(float), SRAM2NRAM); + __bang_add(a_nram, a_nram, b_nram, rem_align_num); + __memcpy(output_offset + loop_num * data_nram_num, a_nram, + rem_nram_num * sizeof(float), NRAM2SRAM); + } + __sync_cluster(); + + if (id == 0) { + __memcpy(d_c,sram_buffer,n*sizeof(float),SRAM2GDRAM,ldc*sizeof(float),n*sizeof(float),m-1); + + } + + __sync_cluster(); + +} __mlu_global__ @@ -399,10 +553,8 @@ void complex_add_c(int batch, int stride, float beta, float *d_c, float* src,int __memcpy(sram_buffer,d_c+d_c_offset,n*sizeof(float),LDRAM2NRAM,n*sizeof(float),ldc*sizeof(float),span-1); __memcpy(a_sram,src+src_offset,n*span*sizeof(float),LDRAM2NRAM); -// __sync_cluster(); -// int32_t data_num = m*n; int32_t data_per_core = span*n; int32_t data_last_core = data_per_core; const float *a_offset = a_sram; @@ -416,8 +568,6 @@ void complex_add_c(int batch, int stride, float beta, float *d_c, float* src,int int32_t align_num = NFU_ALIGN_SIZE / sizeof(float); -// int32_t data_nram_num = -// MAX_NRAM_SIZE / sizeof(float) / 2 / align_num * align_num; int32_t data_nram_num = MAX_NRAM_SIZE / sizeof(float) / 2 / align_num * align_num; float *a_nram = (float *)a_sram + m*n; @@ -446,54 +596,9 @@ void complex_add_c(int batch, int stride, float beta, float *d_c, float* src,int rem_nram_num * sizeof(float), NRAM2NRAM); } -// __sync_cluster(); __memcpy(d_c+d_c_offset,sram_buffer,n*sizeof(float),NRAM2LDRAM,ldc*sizeof(float),n*sizeof(float),span-1); -// if (id == 0) { -// printf("id0,d_c:\n"); -// for(int i = 0; i < m; i++) -// { -// for(int j = 0; j < n; j++) -// { -// printf("%8.3f ",((float*)sram_buffer)[i*n+j]); -// } -// printf("\n"); -// } -// printf("add_c, d_c:\n"); -// for(int i = 0; i < m; i++) -// { -// for(int j = 0; j < n; j++) -// { -// printf("%8.3f ",((float*)d_c)[i*ldc+j]); -// } -// printf("\n"); -// } -// } - -// if(id == 0) -// { -// printf("id: 1, a_sram:\n"); -// for(int i = 0; i < m; i++) -// { -// for(int j = 0; j < n; j++) -// { -// printf("%8.3f ",((float*)a_sram)[i*n+j]); -// } -// printf("\n"); -// } -// printf("\nid: 1, sram_buffer:\n"); -// for(int i = 0; i < m; i++) -// { -// for(int j = 0; j < n; j++) -// { -// printf("%8.3f ",((float*)sram_buffer)[i*n+j]); -// } -// printf("\n"); -// } -// printf("\n"); -// } - -// __sync_cluster(); + } @@ -510,38 +615,37 @@ void complex_inverse_kernel(int batch, float *rd_input, float *id_input, int ld_ { int id = taskId; id = taskId % 4; - // __nram__ uint8_t nram_buffer[MAX_NRAM_SIZE]; - - - // if (id == 0) { - // // __memcpy(sram_buffer,d_input,m*m*sizeof(float),GDRAM2SRAM); - // __memcpy(sram_buffer,d_input,m*sizeof(float),GDRAM2SRAM,m*sizeof(float),ld_input*sizeof(float),m-1); - // } - // __sync_cluster(); - + int batch_id = taskId/4; + if(batch_id >= batch) + return; + float* origin_r_input = rd_input; + float* origin_i_input = id_input; + float* origin_r_output = rd_output; + float* origin_i_output = id_output; + rd_input = origin_r_input + batch_id * stride_input; + id_input = origin_i_input + batch_id * stride_input; + rd_output = origin_r_output + batch_id * stride_output; + id_output = origin_i_output + batch_id * stride_output; + - int span = m/taskDim; + int span = m/4; int start = id * span; if (id == 3) { span = m - 3 * span; } float* nram_offset = (float*)nram_buffer; - //diag_start:m*m ld:m float* rdiag_start = (float*)nram_offset; float* idiag_start = rdiag_start + m * m; - //nram_src1存放列主序的计算完成的矩阵 m*m ld:height float* r_nram_src1 = idiag_start + m * m; float* i_nram_src1 = r_nram_src1 + m * m; float* r_nram_src2 = i_nram_src1 
+ m * m; float* i_nram_src2 = r_nram_src2 + m; float* r_mul_result = i_nram_src2 + m; float* i_mul_result = r_mul_result + m; - //nram_dst存放计算结果,占用空间m*m ld为span float* r_nram_dst = i_mul_result + m; float* i_nram_dst = r_nram_dst + m * m; - // float* diag_start = ((float*)sram_buffer) + m * start + start; int height = m - start; __memset_nram(nram_offset, 4 * m * m * 2+2, (float)ZERO); @@ -551,29 +655,16 @@ void complex_inverse_kernel(int batch, float *rd_input, float *id_input, int ld_ __memcpy(rdiag_start,rd_input + ld_input * start + start,height*sizeof(float),LDRAM2NRAM,m*sizeof(float),ld_input*sizeof(float),height-1); __memcpy(idiag_start,id_input + ld_input * start + start,height*sizeof(float),LDRAM2NRAM,m*sizeof(float),ld_input*sizeof(float),height-1); } - // if(id == 0) - // { - // //print rdiag_start - // printf("diag_start:\n"); - // for(int i = 0; i < m; i++) - // { - // for(int j = 0; j < m; j++) - // { - // printf("%.3f ",rdiag_start[i*m+j]); - // } - // printf("\n"); - // } - // } + - //计算对角线元素的倒数 float result = 0.0; for(int i = 0; i < height; i++) { int off = i * m + i; result = rdiag_start[off]; result = 1.0 / result; - r_nram_src1[i*height+i] = result; //i_nram_src1对应位置为0 + r_nram_src1[i*height+i] = result; r_nram_dst[i*span + i] = result; rdiag_start[off] = result; @@ -590,31 +681,14 @@ void complex_inverse_kernel(int batch, float *rd_input, float *id_input, int ld_ { float r_temp = 0.0; float i_temp = 0.0; - // if(id == 0 && i == 3) - // { - // printf("nram_src2:\n"); - // for(int k = 0; k < i; k++) - // { - // printf("%.3f ",nram_src2[k]); - // } - // printf("\n"); - // printf("nrma_src1:\n"); - // for(int k = 0; k < i; k++) - // { - // printf("%.3f ",nram_src1[j*height+k]); - // } - // printf("diag_element:%.3f\n",diag_element); - // } - //符号可能要改变 + __bang_mul(r_mul_result,r_nram_src2,r_nram_src1+j*height,i); __bang_mul(i_mul_result,r_nram_src2,i_nram_src1+j*height,i); for(int k = 0; k< i; k++) { r_temp += r_mul_result[k]; i_temp += i_mul_result[k]; - // i_temp -= i_mul_result[k]; } - //符号可能要改变 __bang_mul(r_mul_result,i_nram_src2,i_nram_src1+j*height,i); __bang_mul(i_mul_result,i_nram_src2,r_nram_src1+j*height,i); for(int k = 0; k< i; k++) @@ -635,29 +709,7 @@ void complex_inverse_kernel(int batch, float *rd_input, float *id_input, int ld_ __sync(); - // if(id == 0) - // { - // printf("id:0, r_nram_dst:\n"); - // for(int i = 0; i < height; i++) - // { - // for(int j = 0; j < span; j++) - // { - // printf("%8.3f ",r_nram_dst[i*span+j]); - // } - // printf("\n"); - // } - // printf("\n"); - // printf("id:0, i_nram_dst:\n"); - // for(int i = 0; i < height; i++) - // { - // for(int j = 0; j < span; j++) - // { - // printf("%8.3f ",i_nram_dst[i*span+j]); - // } - // printf("\n"); - // } - // printf("\n"); - // } + __sync(); @@ -668,310 +720,269 @@ void complex_inverse_kernel(int batch, float *rd_input, float *id_input, int ld_ __memcpy(id_output + ld_output * start + start,i_nram_dst,span*sizeof(float),NRAM2LDRAM,ld_output*sizeof(float),span*sizeof(float),height-1); } - // if(id == 0) - // { - // //printf nram_dst - // printf("last diag_start:\n"); - // for(int i = 0; i < m; i++) - // { - // for(int j = 0; j < m; j++) - // { - // printf("%.3f ",diag_start[i*m+j]); - // } - // printf("\n"); - // } - // } - + } - -mluOpStatus_t complex_inverse(int batch, float *rd_input, float *id_input, int ld_input, int stride_input, float* rd_output, float* id_output, int ld_output, int stride_output, int m, mluOpHandle_t handle) +__mlu_global__ +void complex_batch_inverse_kernel(int 
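
complex_inverse_kernel above (and its single-core batch variant) inverts a lower-triangular complex matrix column by column: each diagonal entry becomes a reciprocal, and each entry below the diagonal is a short dot product against already-computed entries of the same column, scaled by minus the reciprocal diagonal. A compact host-side version of that recurrence, using std::complex purely for readability (the kernels keep real and imaginary parts in separate NRAM buffers and fold any conjugation into their storage convention); trtri_lower_ref is a hypothetical name:

    #include <complex>
    #include <vector>

    // Invert the lower triangle of a (n x n, row major); only the lower
    // triangle of x is written. Assumes nonzero diagonal entries.
    void trtri_lower_ref(const std::vector<std::complex<float>>& a,
                         std::vector<std::complex<float>>& x, int n) {
      for (int j = 0; j < n; ++j) {
        x[j * n + j] = 1.0f / a[j * n + j];
        for (int i = j + 1; i < n; ++i) {
          std::complex<float> s = 0.0f;
          for (int k = j; k < i; ++k)
            s += a[i * n + k] * x[k * n + j];   // column j entries computed earlier
          x[i * n + j] = -s / a[i * n + i];
        }
      }
    }
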
batch, float *rd_input, float* id_input, int ld_input, int stride_input, float* rd_output, float* id_output, int ld_output, int stride_output, int m) { - cnrtQueue_t queue; - mluOpGetQueue(handle,&queue); + int id = taskId; + int batch_id = id; + if(batch_id >= batch) + return; - cnrtDim3_t dim; - cnrtFunctionType_t func_type = CNRT_FUNC_TYPE_BLOCK; - dim.y = 1; - dim.z = 1; - dim.x = 4; + float* r_orign_input = rd_input; + float* i_orign_input = id_input; + float* r_orign_output = rd_output; + float* i_orign_output = id_output; + rd_input = r_orign_input + batch_id * stride_input; + id_input = i_orign_input + batch_id * stride_input; + rd_output = r_orign_output + batch_id * stride_output; + id_output = i_orign_output + batch_id * stride_output; + + + float* nram_offset = (float*)nram_buffer; + float* r_nram_src0 = nram_offset; + float* i_nram_src0 = r_nram_src0 + m * m; + float* r_nram_src1 = i_nram_src0 + m * m; + float* i_nram_src1 = r_nram_src1 + m * m; + float* r_nram_src2 = i_nram_src1 + m * m; + float* i_nram_src2 = r_nram_src2 + m ; + float* r_mul_result = i_nram_src2 + m; + float* i_mul_result = r_mul_result + m; + float* r_nram_dst = i_mul_result + m; + float* i_nram_dst = r_nram_dst + m * m; + float* r_diag_start = r_nram_dst; + float* i_diag_start = i_nram_dst; + int height = m, span = m; + + __memset_nram(nram_offset, 10 * m * m, (float)ZERO); + + __memcpy(r_nram_dst,rd_input,m*sizeof(float),GDRAM2NRAM,m*sizeof(float),ld_input*sizeof(float),m-1); + __memcpy(i_nram_dst,id_input,m*sizeof(float),GDRAM2NRAM,m*sizeof(float),ld_input*sizeof(float),m-1); + float result = 0.0; + for(int i = 0; i < m; i++) + { + int off = i * m + i; + result = r_nram_dst[off]; + result = 1.0 / result; + r_nram_src1[i*height+i] = result; + r_nram_dst[i*span + i] = result; + r_diag_start[off] = result; + } + + for(int i = 1; i < height; i++) + { + __memcpy(r_nram_src2,r_diag_start+i*m,i*sizeof(float),NRAM2NRAM); + __memcpy(i_nram_src2,i_diag_start+i*m,i*sizeof(float),NRAM2NRAM); + int num = std::min(i, span); + float diag_element = r_diag_start[i*m+i]; + for(int j = 0; j < num; j++) + { + float r_temp = 0.0; + float i_temp = 0.0; + __bang_mul(r_mul_result,r_nram_src2,r_nram_src1+j*height,i); + __bang_mul(i_mul_result,r_nram_src2,i_nram_src1+j*height,i); + for(int k = 0; k< i; k++) + { + r_temp += r_mul_result[k]; + i_temp += i_mul_result[k]; + } + __bang_mul(r_mul_result,i_nram_src2,i_nram_src1+j*height,i); + __bang_mul(i_mul_result,i_nram_src2,r_nram_src1+j*height,i); + for(int k = 0; k< i; k++) + { + r_temp += r_mul_result[k]; + i_temp -= i_mul_result[k]; + } + r_temp = r_temp * -1.0 * diag_element; + i_temp = i_temp * -1.0 * diag_element; + r_nram_dst[i*span+j] = r_temp; + i_nram_dst[i*span+j] = i_temp; + r_nram_src1[j*height+i] = r_temp; + i_nram_src1[j*height+i] = i_temp; + } + __sync(); + + } + __sync(); + + __memcpy(rd_output,r_nram_dst,m*sizeof(float),NRAM2GDRAM,ld_output*sizeof(float), m*sizeof(float),m-1); + __memcpy(id_output,i_nram_dst,m*sizeof(float),NRAM2GDRAM,ld_output*sizeof(float), m*sizeof(float),m-1); - KERNEL_CHECK(complex_inverse_kernel<<>>(batch, rd_input, id_input, ld_input, stride_input, rd_output, id_output, ld_output, stride_output, m)); - return MLUOP_STATUS_SUCCESS; } -//这cgemm其实不是计算a*b,而是计算a*(b^H),即计算a乘b的共轭转置 + + + + + mluOpStatus_t cgemm(int batch, bool trans_a, bool trans_b, int m, int n, int k, float alpha, float beta, float* d_ra, float* d_ia, int lda, int stride_a, float* d_rb, float* d_ib, int ldb, int stride_b, float* d_rc, float* d_ic, int ldc, int stride_c, 
mluOpHandle_t handle) { - float *workspace = NULL; + if(k==0) + return MLUOP_STATUS_SUCCESS; + cnrtQueue_t queue; mluOpGetQueue(handle,&queue); - CNRT_CHECK(cnrtMalloc((void **)&workspace, sizeof(float)*2*(m*n))); - // float temp1=0, temp2=0; - //print d_a - // printf("before transpose, d_a:\n"); - // for(int i = 0; i < batch; i++) - // { - // printf("batch:%d\n",i); - // for(int j = 0; j < m; j++) - // { - // for(int h = 0; h < k; h++) - // { - // cnrtMemcpy(&temp1, d_a+i*stride_a*2+j*lda*2+h*2, sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); - // cnrtMemcpy(&temp2, d_a+i*stride_a*2+j*lda*2+h*2+1, sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); - // printf("%8.3f,%8.3f ",temp1,temp2); - // } - // printf("\n"); - // } - // } - - // printf("after transpose, d_a:\n"); - // for(int i = 0; i < 2; i++) - // { - // for(int j = 0; j < m; j++) - // { - // for(int h = 0; h < k; h++) - // { - // cnrtMemcpy(&temp1, workspace+i*m*k+j*lda+h, sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); - // printf("%8.3f",temp1); - // } - // printf("\n"); - // } - // printf("\n"); - // } + float *r_c, *i_c; - // float *res_ra_ib; - r_c = workspace; - i_c = r_c+m*n; - // res_ra_ib = res_ia_rb+m*n; + r_c = d_rc; + i_c = d_ic; int s_stride_a = stride_a; - int s_stride_b = stride_a; - int s_stride_c = stride_a; - - // float temp = 0; - // printf("before sgemm:\n"); - // printf("r_a:\n"); - // for(int i = 0; i < m; i++) - // { - // for(int j = 0; j < k; j++) - // { - // cnrtMemcpy(&temp, d_ra+i*lda+j, sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); - // printf("%8.3f ",temp); - // } - // printf("\n"); - // } - // printf("i_a:\n"); - // for(int i = 0; i < m; i++) - // { - // for(int j = 0; j < k; j++) - // { - // cnrtMemcpy(&temp, d_ia+i*lda+j, sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); - // printf("%8.3f ",temp); - // } - // printf("\n"); - // } - // printf("r_b:\n"); - // for(int i = 0; i < m; i++) - // { - // for(int j = 0; j < k; j++) - // { - // cnrtMemcpy(&temp, d_rb+i*lda+j, sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); - // printf("%8.3f ",temp); - // } - // printf("\n"); - // } - // printf("i_b:\n"); - // for(int i = 0; i < m; i++) - // { - // for(int j = 0; j < k; j++) - // { - // cnrtMemcpy(&temp, d_ib+i*lda+j, sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); - // printf("%8.3f ",temp); - // } - // printf("\n"); - // } - sgemm(batch,trans_a,trans_b,m,n,k,alpha,0,d_ra,lda,s_stride_a,d_rb,ldb,s_stride_b,r_c,n,s_stride_c,handle); + int s_stride_b = stride_b; + int s_stride_c = stride_c; + + + sgemm(batch,trans_a,trans_b,m,n,k,alpha,beta,d_ra,lda,s_stride_a,d_rb,ldb,s_stride_b,r_c,ldc,s_stride_c,handle); cnrtQueueSync(queue); - sgemm(batch,trans_a,trans_b,m,n,k,alpha,1,d_ia,lda,s_stride_a,d_ib,ldb,s_stride_b,r_c,n,s_stride_c,handle); + sgemm(batch,trans_a,trans_b,m,n,k,alpha,1,d_ia,lda,s_stride_a,d_ib,ldb,s_stride_b,r_c,ldc,s_stride_c,handle); cnrtQueueSync(queue); - // sgemm(batch,trans_a,trans_b,m,n,k,alpha,0,d_ra,lda,s_stride_a,d_ib,ldb,s_stride_b,i_c,n,s_stride_c,handle); - // cnrtQueueSync(queue); - // sgemm(batch,trans_a,trans_b,m,n,k,-alpha,1,d_ia,lda,s_stride_a,d_rb,ldb,s_stride_b,i_c,n,s_stride_c,handle); - // cnrtQueueSync(queue); - sgemm(batch,trans_a,trans_b,m,n,k,-alpha,0,d_ra,lda,s_stride_a,d_ib,ldb,s_stride_b,i_c,n,s_stride_c,handle); + sgemm(batch,trans_a,trans_b,m,n,k,-alpha,beta,d_ra,lda,s_stride_a,d_ib,ldb,s_stride_b,i_c,ldc,s_stride_c,handle); cnrtQueueSync(queue); - sgemm(batch,trans_a,trans_b,m,n,k,alpha,1,d_ia,lda,s_stride_a,d_rb,ldb,s_stride_b,i_c,n,s_stride_c,handle); + 
sgemm(batch,trans_a,trans_b,m,n,k,alpha,1,d_ia,lda,s_stride_a,d_rb,ldb,s_stride_b,i_c,ldc,s_stride_c,handle); cnrtQueueSync(queue); - // printf("beta:%f\n",beta); - - // printf("r_c:\n"); - // for(int i = 0; i < m; i++) - // { - // for(int j = 0; j < n; j++) - // { - // cnrtMemcpy(&temp, r_c+i*lda+j, sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); - // cnrtQueueSync(queue); - // printf("%8.3f ",temp); - // } - // printf("\n"); - // } - cnrtDim3_t dim; - cnrtFunctionType_t func_type = CNRT_FUNC_TYPE_BLOCK; - dim.y = 1; - dim.z = 1; - dim.x = 4; - KERNEL_CHECK(complex_add_c<<>>(batch,stride_c,beta,d_rc,r_c,ldc,n,m,n)); - KERNEL_CHECK(complex_add_c<<>>(batch,stride_c,beta,d_ic,i_c,ldc,n,m,n)); - cnrtQueueSync(queue); + return MLUOP_STATUS_SUCCESS; } -//这cgemm其实不是计算a*b,而是计算a*(b^H),即计算a乘b的共轭转置 mluOpStatus_t cgemm_real(int batch, bool trans_a, bool trans_b, int m, int n, int k, float alpha, float beta, float* d_ra, float* d_ia, int lda, int stride_a, float* d_rb, float* d_ib, int ldb, int stride_b, float* d_rc, float* d_ic, int ldc, int stride_c, mluOpHandle_t handle) { - float *workspace = NULL; + if(k==0) + return MLUOP_STATUS_SUCCESS; + cnrtQueue_t queue; mluOpGetQueue(handle,&queue); - CNRT_CHECK(cnrtMalloc((void **)&workspace, sizeof(float)*2*(m*n))); - // float temp1=0, temp2=0; - //print d_a - // printf("before transpose, d_a:\n"); - // for(int i = 0; i < batch; i++) - // { - // printf("batch:%d\n",i); - // for(int j = 0; j < m; j++) - // { - // for(int h = 0; h < k; h++) - // { - // cnrtMemcpy(&temp1, d_a+i*stride_a*2+j*lda*2+h*2, sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); - // cnrtMemcpy(&temp2, d_a+i*stride_a*2+j*lda*2+h*2+1, sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); - // printf("%8.3f,%8.3f ",temp1,temp2); - // } - // printf("\n"); - // } - // } - - // printf("after transpose, d_a:\n"); - // for(int i = 0; i < 2; i++) - // { - // for(int j = 0; j < m; j++) - // { - // for(int h = 0; h < k; h++) - // { - // cnrtMemcpy(&temp1, workspace+i*m*k+j*lda+h, sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); - // printf("%8.3f",temp1); - // } - // printf("\n"); - // } - // printf("\n"); - // } - float *r_c, *i_c; - // float *res_ra_ib; - r_c = workspace; - i_c = r_c+m*n; - // res_ra_ib = res_ia_rb+m*n; + float *workspace = NULL; + CNRT_CHECK(cnrtMalloc((void **)&workspace, batch*sizeof(float)*2*(m*k))); + float* copy_ra = workspace; + float* copy_ia = copy_ra + batch*m*k; + int copy_lda = k; + int copy_stride_a = m*k; - int s_stride_a = stride_a; - int s_stride_b = stride_a; - int s_stride_c = stride_a; - - // float temp = 0; - // printf("before sgemm:\n"); - // printf("r_a:\n"); - // for(int i = 0; i < m; i++) - // { - // for(int j = 0; j < k; j++) - // { - // cnrtMemcpy(&temp, d_ra+i*lda+j, sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); - // printf("%8.3f ",temp); - // } - // printf("\n"); - // } - // printf("i_a:\n"); - // for(int i = 0; i < m; i++) - // { - // for(int j = 0; j < k; j++) - // { - // cnrtMemcpy(&temp, d_ia+i*lda+j, sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); - // printf("%8.3f ",temp); - // } - // printf("\n"); - // } - // printf("r_b:\n"); - // for(int i = 0; i < m; i++) - // { - // for(int j = 0; j < k; j++) - // { - // cnrtMemcpy(&temp, d_rb+i*lda+j, sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); - // printf("%8.3f ",temp); - // } - // printf("\n"); - // } - // printf("i_b:\n"); - // for(int i = 0; i < m; i++) - // { - // for(int j = 0; j < k; j++) - // { - // cnrtMemcpy(&temp, d_ib+i*lda+j, sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); - // printf("%8.3f ",temp); - // } - // 
printf("\n"); - // } - sgemm(batch,trans_a,trans_b,m,n,k,alpha,0,d_ra,lda,s_stride_a,d_rb,ldb,s_stride_b,r_c,n,s_stride_c,handle); - cnrtQueueSync(queue); + for(int i = 0; i < batch; i++) + { + CNRT_CHECK(cnrtMemcpy2D(copy_ra+i*m*k, k*sizeof(float), d_ra+i*stride_a, lda*sizeof(float), + k*sizeof(float), m, CNRT_MEM_TRANS_DIR_DEV2DEV)); + CNRT_CHECK(cnrtMemcpy2D(copy_ia+i*m*k, k*sizeof(float), d_ia+i*stride_a, lda*sizeof(float), + k*sizeof(float), m, CNRT_MEM_TRANS_DIR_DEV2DEV)); + } + + + float *r_c, *i_c; + r_c = d_rc; + i_c = d_ic; + int s_stride_b = stride_b; + int s_stride_c = stride_c; + + + sgemm(batch,trans_a,trans_b,m,n,k,alpha,beta,copy_ra,copy_lda,copy_stride_a,d_rb,ldb,s_stride_b,r_c,ldc,s_stride_c,handle); + cnrtQueueSync(queue); - sgemm(batch,trans_a,trans_b,m,n,k,-alpha,1,d_ia,lda,s_stride_a,d_ib,ldb,s_stride_b,r_c,n,s_stride_c,handle); + sgemm(batch,trans_a,trans_b,m,n,k,-alpha,1,copy_ia,copy_lda,copy_stride_a,d_ib,ldb,s_stride_b,r_c,ldc,s_stride_c,handle); cnrtQueueSync(queue); - // sgemm(batch,trans_a,trans_b,m,n,k,alpha,0,d_ra,lda,s_stride_a,d_ib,ldb,s_stride_b,i_c,n,s_stride_c,handle); - // cnrtQueueSync(queue); - // sgemm(batch,trans_a,trans_b,m,n,k,-alpha,1,d_ia,lda,s_stride_a,d_rb,ldb,s_stride_b,i_c,n,s_stride_c,handle); - // cnrtQueueSync(queue); - sgemm(batch,trans_a,trans_b,m,n,k,alpha,0,d_ra,lda,s_stride_a,d_ib,ldb,s_stride_b,i_c,n,s_stride_c,handle); + + sgemm(batch,trans_a,trans_b,m,n,k,alpha,beta,copy_ra,copy_lda,copy_stride_a,d_ib,ldb,s_stride_b,i_c,ldc,s_stride_c,handle); cnrtQueueSync(queue); - sgemm(batch,trans_a,trans_b,m,n,k,alpha,1,d_ia,lda,s_stride_a,d_rb,ldb,s_stride_b,i_c,n,s_stride_c,handle); + sgemm(batch,trans_a,trans_b,m,n,k,alpha,1,copy_ia,copy_lda,copy_stride_a,d_rb,ldb,s_stride_b,i_c,ldc,s_stride_c,handle); cnrtQueueSync(queue); - // printf("beta:%f\n",beta); - // printf("r_c:\n"); - // for(int i = 0; i < m; i++) - // { - // for(int j = 0; j < n; j++) - // { - // cnrtMemcpy(&temp, r_c+i*lda+j, sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); - // cnrtQueueSync(queue); - // printf("%8.3f ",temp); - // } - // printf("\n"); - // } - cnrtDim3_t dim; - cnrtFunctionType_t func_type = CNRT_FUNC_TYPE_BLOCK; - dim.y = 1; - dim.z = 1; - dim.x = 4; - KERNEL_CHECK(complex_add_c<<>>(batch,stride_c,beta,d_rc,r_c,ldc,n,m,n)); - KERNEL_CHECK(complex_add_c<<>>(batch,stride_c,beta,d_ic,i_c,ldc,n,m,n)); - cnrtQueueSync(queue); + + + + return MLUOP_STATUS_SUCCESS; +} + + +mluOpStatus_t complex_inverse(int batch, float *rd_input, float *id_input, int ld_input, int stride_input, float* rd_output, float* id_output, int ld_output, int stride_output, int m, mluOpHandle_t handle) +{ + int inverse_rec = 16; + cnrtQueue_t queue; + mluOpGetQueue(handle,&queue); + if(m <= inverse_rec) + { + + + cnrtDim3_t dim; + cnrtFunctionType_t func_type = CNRT_FUNC_TYPE_BLOCK; + dim.y = 1; + dim.z = 1; + if(batch < 8) + { + dim.x = 4 * batch; + KERNEL_CHECK(complex_inverse_kernel<<>>(batch, rd_input, id_input, ld_input, stride_input, rd_output, id_output, ld_output, stride_output, m)); + } + else + { + dim.x = batch; + KERNEL_CHECK(complex_batch_inverse_kernel<<>>(batch, rd_input, id_input, ld_input, stride_input, rd_output, id_output, ld_output, stride_output, m)); + } + + } + else + { + int m1 = m/2; + int m2 = m - m1; + + float* output1_r = rd_output; + float* output2_r = rd_output + m1*m+m1; + float* output1_i = id_output; + float* output2_i = id_output + m1*m+m1; + + + complex_inverse(batch, rd_input, id_input, ld_input, stride_input, rd_output, id_output, ld_output, stride_output, m1, 
handle); + complex_inverse(batch, rd_input+m1*ld_input+m1, id_input+m1*ld_input+m1, ld_input, stride_input, output2_r, output2_i, ld_output, stride_output, m2, handle); + cnrtQueueSync(queue); + + + + + float *workspace = NULL; + CNRT_CHECK(cnrtMalloc((void **)&workspace, batch*sizeof(float)*2*(m2*m1))); + float* temp_r = workspace; + float* temp_i = temp_r + batch*m2*m1; + int temp_ld = m1; + int temp_stride = m2*m1; + + cgemm(batch, false,false,m2,m1,m1,1.0f,0.0f,rd_input+m1*ld_input,id_input+m1*ld_input,ld_input,stride_input,output1_r,output1_i,ld_output,stride_output,temp_r,temp_i,temp_ld,temp_stride,handle); + cnrtQueueSync(queue); + cgemm(batch, false,false,m2,m2,m1,-1.0f,0.0f,output2_r,output2_i,ld_output,stride_output,temp_r,temp_i,temp_ld,temp_stride,rd_output+m1*ld_output,id_output + m1*ld_output,ld_output,stride_output,handle); + cnrtQueueSync(queue); + + + + + + + + + + } + + + + return MLUOP_STATUS_SUCCESS; } + mluOpStatus_t ctrsm(int batch, int stride, int m, int n, float* rd_a, float* id_a, int lda, float* rd_b, float* id_b, int ldb, mluOpHandle_t handle) { if(n==0) @@ -985,74 +996,11 @@ mluOpStatus_t ctrsm(int batch, int stride, int m, int n, float* rd_a, float* id_ r_inverse_result = workspace; i_inverse_result = r_inverse_result + batch*m*m; - float temp_h; - printf("before inverse, real:\n"); - for(int i = 0; i < m; i++) - { - for(int j = 0; j < m; j++) - { - cnrtMemcpy(&temp_h, rd_a+i*lda+j, sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); - printf("%8.3f ",temp_h); - } - printf("\n"); - } - printf("before inverse, imag:\n"); - for(int i = 0; i < m; i++) - { - for(int j = 0; j < m; j++) - { - cnrtMemcpy(&temp_h, id_a+i*lda+j, sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); - printf("%8.3f ",temp_h); - } - printf("\n"); - } - + complex_inverse(batch,rd_a,id_a,lda,stride,r_inverse_result,i_inverse_result,m,m*m,m,handle); cnrtQueueSync(queue); - printf("inverse result real:\n"); - for(int i = 0; i < m; i++) - { - for(int j = 0; j < m; j++) - { - cnrtMemcpy(&temp_h, r_inverse_result+i*m+j, sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); - printf("%8.3f ",temp_h); - } - printf("\n"); - } - printf("inverse result imag:\n"); - for(int i = 0; i < m; i++) - { - for(int j = 0; j < m; j++) - { - cnrtMemcpy(&temp_h, i_inverse_result+i*m+j, sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); - printf("%8.3f ",temp_h); - } - printf("\n"); - } cgemm_real(batch,false,true,n,m,m,1.0,0.0f,rd_b,id_b,ldb,stride,r_inverse_result,i_inverse_result,m,m*m,rd_b,id_b,ldb,stride,handle); - printf("trsm result real:\n"); - for(int i = 0; i < n; i++) - { - for(int j = 0; j < m; j++) - { - cnrtMemcpy(&temp_h, rd_b+i*ldb+j, sizeof(float),CNRT_MEM_TRANS_DIR_DEV2HOST); - printf("%8.3f ",temp_h); - } - printf("\n"); - } - - printf("trsm result imag:\n"); - for(int i = 0; i < n; i++) - { - for(int j = 0; j < m; j++) - { - cnrtMemcpy(&temp_h, id_b+i*ldb+j, sizeof(float),CNRT_MEM_TRANS_DIR_DEV2HOST); - printf("%8.3f ",temp_h); - } - printf("\n"); - } - return MLUOP_STATUS_SUCCESS; } @@ -1061,6 +1009,9 @@ mluOpStatus_t cherk(int batch, int stride, int n,int k, float* rd_a, float* id_a if(k==0) return MLUOP_STATUS_SUCCESS; cgemm(batch,false,true,n,n,k,-1.0f,1.0f,rd_a,id_a,lda,stride,rd_a,id_a,lda,stride,rd_c,id_c,ldc,stride,handle); + cnrtQueue_t queue; + mluOpGetQueue(handle,&queue); + cnrtQueueSync(queue); set_half_zero(batch,stride,rd_c,ldc,n,handle); set_half_zero(batch,stride,id_c,ldc,n,handle); return MLUOP_STATUS_SUCCESS; From c86edf7d9d2112f73cb3ece4c79cae8f02ede59b Mon Sep 17 00:00:00 2001 From: dglr Date: Fri, 19 
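
The recursive branch of complex_inverse above follows the standard block identity for a lower block-triangular matrix: the inverse of [A11 0; A21 A22] is [X11 0; -X22*A21*X11 X22] with X11 = A11^-1 and X22 = A22^-1, which is why the two diagonal blocks are inverted first and the off-diagonal block is then produced by two multiplies. A host-side sketch of the same recursion with std::complex; it mirrors the mathematics rather than the exact kernel call sequence, and the names and the cutoff value are illustrative only:

    #include <complex>
    #include <vector>
    using cfloat = std::complex<float>;

    // Invert the n x n lower-triangular block of `a` whose top-left corner is
    // (r0, c0); results go to the same positions in `x`. Row major, leading
    // dimension ld; only lower-triangle entries are read or written.
    void trtri_lower_blocked(const std::vector<cfloat>& a, std::vector<cfloat>& x,
                             int n, int ld, int r0, int c0, int cutoff = 16) {
      if (n <= cutoff) {                    // base case: column-by-column recurrence
        for (int j = 0; j < n; ++j) {
          x[(r0 + j) * ld + c0 + j] = 1.0f / a[(r0 + j) * ld + c0 + j];
          for (int i = j + 1; i < n; ++i) {
            cfloat s = 0.0f;
            for (int k = j; k < i; ++k)
              s += a[(r0 + i) * ld + c0 + k] * x[(r0 + k) * ld + c0 + j];
            x[(r0 + i) * ld + c0 + j] = -s / a[(r0 + i) * ld + c0 + i];
          }
        }
        return;
      }
      int n1 = n / 2, n2 = n - n1;
      trtri_lower_blocked(a, x, n1, ld, r0, c0, cutoff);            // X11 = A11^-1
      trtri_lower_blocked(a, x, n2, ld, r0 + n1, c0 + n1, cutoff);  // X22 = A22^-1
      std::vector<cfloat> t(n2 * n1, cfloat(0.0f));                 // t = A21 * X11
      for (int i = 0; i < n2; ++i)
        for (int j = 0; j < n1; ++j)
          for (int k = j; k < n1; ++k)                              // X11 is lower triangular
            t[i * n1 + j] += a[(r0 + n1 + i) * ld + c0 + k] * x[(r0 + k) * ld + c0 + j];
      for (int i = 0; i < n2; ++i)                                  // X21 = -X22 * t
        for (int j = 0; j < n1; ++j) {
          cfloat s = 0.0f;
          for (int k = 0; k <= i; ++k)                              // X22 is lower triangular
            s += x[(r0 + n1 + i) * ld + c0 + n1 + k] * t[k * n1 + j];
          x[(r0 + n1 + i) * ld + c0 + j] = -s;
        }
    }
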
Jul 2024 22:05:00 +0800 Subject: [PATCH 07/27] fix ang bugs --- kernels/cholesky/cholesky.cpp | 74 +-- kernels/cholesky/cholesky.h | 16 +- kernels/cholesky/cholesky_union1.mlu | 507 ++++++++++-------- kernels/cholesky/complex_cholesky_union1.mlu | 140 +++-- .../pb_gtest/src/zoo/cholesky/cholesky.cpp | 399 ++++++++++---- .../pb_gtest/src/zoo/cholesky/cholesky.h | 6 +- 6 files changed, 745 insertions(+), 397 deletions(-) diff --git a/kernels/cholesky/cholesky.cpp b/kernels/cholesky/cholesky.cpp index 838985d40..7ab0f3d81 100644 --- a/kernels/cholesky/cholesky.cpp +++ b/kernels/cholesky/cholesky.cpp @@ -1,5 +1,7 @@ #include "cholesky.h" + + mluOpStatus_t MLUOP_WIN_API mluOpGetCholeskyWorkspace(mluOpTensorDescriptor_t input_desc, size_t* size, float** workspace) { PARAM_CHECK("mluOpCholesky", input_desc != NULL); @@ -18,8 +20,8 @@ mluOpStatus_t MLUOP_WIN_API mluOpGetCholeskyWorkspace(mluOpTensorDescriptor_t in PARAM_CHECK("mluOpCholesky", dtype == MLUOP_DTYPE_FLOAT || dtype == MLUOP_DTYPE_COMPLEX_FLOAT); int type_size = (dtype == MLUOP_DTYPE_FLOAT) ? 4 : 8; - int size_a = 0, lda = 0, size_c = 0, ldc = 0; - int batch_size = 1; + long int size_a = 0, lda = 0, size_c = 0, ldc = 0; + long int batch_size = 1; int dim = input_desc->dim; if(dim == 2) { @@ -33,17 +35,18 @@ mluOpStatus_t MLUOP_WIN_API mluOpGetCholeskyWorkspace(mluOpTensorDescriptor_t in if (dtype == MLUOP_DTYPE_FLOAT) { - *size = 0; + *size = size_a*size_a*sizeof(float)*2*batch_size; } else { *size = size_a*size_a*sizeof(float)*2*batch_size; - printf("size:%ul\n",(int)(*size)); + } + printf("workspace size:%ul\n",(int)(*size)); if(*size>0) { CHECK_RETURN("mluOpCholesky", - complex_malloc(*size, workspace)); + workspace_malloc(*size, workspace)); } return MLUOP_STATUS_SUCCESS; } @@ -83,14 +86,13 @@ calculate_body(mluOpHandle_t handle,int batch_size, const mluOpTensorDescriptor_ float* work_space_h; CNRT_CHECK(cnrtMalloc((void **)&work_space, NB*NB*sizeof(float))); CNRT_CHECK(cnrtMemset(work_space, 0, NB*NB*sizeof(float))); - work_space_h = (float*)malloc(batch_size*2*lda*lda*sizeof(float)); + work_space_h = (float*)malloc(((unsigned long)batch_size)*2*lda*lda*sizeof(float)); PARAM_CHECK("mluOpCholesky", lda >= size_a); PARAM_CHECK("mluOpCholesky", ldc >= size_c); cnrtQueue_t queue; mluOpGetQueue(handle,&queue); - int jb; const float s_one = 1.0; const float s_neg_one = -1.0; @@ -100,29 +102,30 @@ calculate_body(mluOpHandle_t handle,int batch_size, const mluOpTensorDescriptor_ if(upper == true) { CHECK_RETURN("mluOpCholesky", - transpose(batch_size,size_a,size_a,d_input,d_output,handle)); + transpose(batch_size,size_a,size_a,d_input,d_output,handle,dtype,workspace)); } else { - CNRT_CHECK(cnrtMemcpy(d_output, d_input, type_size*size_a*lda*batch_size, CNRT_MEM_TRANS_DIR_DEV2DEV)); + CNRT_CHECK(cnrtMemcpy(d_output, d_input, type_size*size_a*lda*((unsigned long)batch_size), CNRT_MEM_TRANS_DIR_DEV2DEV)); } } else { CHECK_RETURN("mluOpCholesky", - transpose(batch_size,size_a*size_a,2,d_input,d_output,handle)); + transpose(batch_size,size_a*size_a,2,d_input,d_output,handle,MLUOP_DTYPE_FLOAT,workspace)); } cnrtQueueSync(queue); int stride = size_a*lda; - - + if(dtype == MLUOP_DTYPE_FLOAT) { int row = is_row_major ? 
lda : size_a; int nb = NB; + set_half_zero(batch_size, stride, d_output, lda, lda, handle); + cnrtQueueSync(queue); for(int j = 0; j < row; j+=nb) { jb = std::min(nb, row-j); @@ -152,7 +155,9 @@ calculate_body(mluOpHandle_t handle,int batch_size, const mluOpTensorDescriptor_ { cnrtQueueSync(queue); CHECK_RETURN("mluOpCholesky", - transpose(batch_size, size_a,size_a,d_output,d_output,handle)); + transpose(batch_size, size_a,size_a,d_output,workspace,handle,dtype,workspace)); + cnrtQueueSync(queue); + CNRT_CHECK(cnrtMemcpy(d_output, workspace, type_size*size_a*lda*((unsigned long)batch_size), CNRT_MEM_TRANS_DIR_DEV2DEV)); } } else @@ -195,20 +200,35 @@ calculate_body(mluOpHandle_t handle,int batch_size, const mluOpTensorDescriptor_ OFFSET_ROW(r_start,j+jb,j),OFFSET_ROW(i_start,j+jb,j),lda, handle)); cnrtQueueSync(queue); } - } - + } CHECK_RETURN("mluOpCholesky", - transpose(batch_size,2,size_a*size_a,d_output,workspace,handle)); + transpose(batch_size,2,size_a*size_a,d_output,workspace,handle,MLUOP_DTYPE_FLOAT,workspace)); cnrtQueueSync(queue); - if(batch_size > 16) + + + + if(upper) { - CNRT_CHECK(cnrtMemcpy(d_output, workspace, type_size*size_a*lda*16, CNRT_MEM_TRANS_DIR_DEV2DEV)); - CNRT_CHECK(cnrtMemcpy(d_output+type_size/4*size_a*lda*16, workspace+type_size/4*size_a*lda*16, type_size*size_a*lda*(batch_size-16), CNRT_MEM_TRANS_DIR_DEV2DEV)); + cnrtQueueSync(queue); + CHECK_RETURN("mluOpCholesky", + transpose(batch_size, size_a,size_a,workspace,d_output,handle,dtype,workspace)); + cnrtQueueSync(queue); + CHECK_RETURN("mluOpCholesky", + conj_complex(batch_size, size_a,size_a,d_output,d_output,handle)); + cnrtQueueSync(queue); } else { - CNRT_CHECK(cnrtMemcpy(d_output, workspace, type_size*size_a*lda*batch_size, CNRT_MEM_TRANS_DIR_DEV2DEV)); + if(batch_size > 16) + { + CNRT_CHECK(cnrtMemcpy(d_output, workspace, type_size*size_a*lda*16, CNRT_MEM_TRANS_DIR_DEV2DEV)); + CNRT_CHECK(cnrtMemcpy(d_output+type_size/4*size_a*lda*16, workspace+type_size/4*size_a*lda*16, type_size*size_a*lda*((unsigned long)batch_size-16), CNRT_MEM_TRANS_DIR_DEV2DEV)); + } + else + { + CNRT_CHECK(cnrtMemcpy(d_output, workspace, type_size*size_a*lda*((unsigned long)batch_size), CNRT_MEM_TRANS_DIR_DEV2DEV)); + } } @@ -271,19 +291,11 @@ mluOpCholesky(mluOpHandle_t handle,const mluOpTensorDescriptor_t input_desc,floa ldc = output_desc->dims[2]; } - float* last_addr = d_input+batch_size*size_a*lda*2; + float* last_addr = d_input+((unsigned long)batch_size)*size_a*lda*2; float* temp_addr = last_addr - 10; - float* work_space_h; - work_space_h = (float*)malloc(100*sizeof(float)); - cnrtMemcpy(work_space_h, temp_addr, 10*sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST); - printf("last 10 input:\n"); - for(int i = 0; i < 10;i++) - { - printf("%8.3f ",work_space_h[i]); - } - printf("\n"); + int type_size = (dtype == MLUOP_DTYPE_FLOAT) ? 
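
For the float path, calculate_body now zeroes the strictly upper triangle up front and then drives a right-looking blocked loop: a SYRK update of the diagonal block, the recursive panel factorization, and a GEMM plus TRSM update of the block column below it. As a compact reminder of what one pass of that loop computes on a single matrix, here is a plain host reference; spotrf_blocked_ref is a hypothetical name, it assumes a symmetric positive-definite input, does no error checking, and only touches the lower triangle:

    #include <cmath>

    void spotrf_blocked_ref(float* a, int n, int lda, int nb) {
      for (int j = 0; j < n; j += nb) {
        int jb = (nb < n - j) ? nb : n - j;
        // SYRK: A(j:j+jb, j:j+jb) -= A(j:j+jb, 0:j) * A(j:j+jb, 0:j)^T
        for (int r = j; r < j + jb; ++r)
          for (int c = j; c <= r; ++c)
            for (int k = 0; k < j; ++k)
              a[r * lda + c] -= a[r * lda + k] * a[c * lda + k];
        // POTF2: factor the jb x jb diagonal block in place
        for (int p = j; p < j + jb; ++p) {
          a[p * lda + p] = std::sqrt(a[p * lda + p]);
          for (int r = p + 1; r < j + jb; ++r)
            a[r * lda + p] /= a[p * lda + p];
          for (int c = p + 1; c < j + jb; ++c)
            for (int r = c; r < j + jb; ++r)
              a[r * lda + c] -= a[r * lda + p] * a[c * lda + p];
        }
        // GEMM: A(j+jb:n, j:j+jb) -= A(j+jb:n, 0:j) * A(j:j+jb, 0:j)^T
        for (int r = j + jb; r < n; ++r)
          for (int c = j; c < j + jb; ++c)
            for (int k = 0; k < j; ++k)
              a[r * lda + c] -= a[r * lda + k] * a[c * lda + k];
        // TRSM: A(j+jb:n, j:j+jb) <- A(j+jb:n, j:j+jb) * L(j:j+jb, j:j+jb)^-T
        for (int r = j + jb; r < n; ++r)
          for (int c = j; c < j + jb; ++c) {
            for (int k = j; k < c; ++k)
              a[r * lda + c] -= a[r * lda + k] * a[c * lda + k];
            a[r * lda + c] /= a[c * lda + c];
          }
      }
    }
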
4 : 8; @@ -292,7 +304,7 @@ mluOpCholesky(mluOpHandle_t handle,const mluOpTensorDescriptor_t input_desc,floa int stride = 2*size_a*lda; calculate_body(handle, 16, input_desc,d_input, output_desc, d_output, upper, workspace); cnrtQueueSync(queue); - calculate_body(handle, batch_size-16, input_desc,d_input+16*stride, output_desc, d_output+16*stride, upper, workspace); + calculate_body(handle, ((unsigned long)batch_size)-16, input_desc,d_input+16*stride, output_desc, d_output+16*stride, upper, workspace); } else { diff --git a/kernels/cholesky/cholesky.h b/kernels/cholesky/cholesky.h index e06f66c87..cf5f0a7b5 100644 --- a/kernels/cholesky/cholesky.h +++ b/kernels/cholesky/cholesky.h @@ -23,14 +23,15 @@ #define CNB (16) #define REC_NB (16) #define POTF_NB ((REC_NB)/4) -#define CREC_NB (8) +#define CREC_NB (16) #define CPOTF_NB ((CREC_NB)/4) +// #define CPOTF_NB ((CREC_NB)) #define __CNRT_FUNC_TYPE__ CNRT_FUNC_TYPE_UNION1 #define TASK_NUM (4) #define NB (32) #define CLUSTER_NUM 1 -#define M (TASK_NUM * POTF_NB) +#define M (TASK_NUM * POTF_NB) //POTF边长 #define ZERO 0.0 #define SHARED_MEM_SIZE (((M*POTF_NB/TASK_NUM * 4)+(POTF_NB * POTF_NB))) #define OFFSET_ROW(A, i, j) A + ((i) * (lda) + (j)) @@ -38,22 +39,27 @@ mluOpStatus_t mlu_spotrf_rectile(int batch, int stride, bool trans, bool uplo, int n, int recnb, float* dA, int ldda, int gbstep, mluOpHandle_t handle); +// void mluOpCholesky(bool trans, bool uplo, int n, float* dA, float* dC, int ldda); mluOpStatus_t ssyrk(int batch, int stride, bool upper, bool trans,int n, int k, float* d_a, int ldda, float* d_c, int lddc, mluOpHandle_t handle); mluOpStatus_t sgemm(int batch, bool trans_a, bool trans_b, int m, int n, int k, float alpha, float beta, float* d_a,int lda, int stride_a, float* d_b, int ldb, int stride_b, float* d_c, int ldc, int stride_c, mluOpHandle_t handle); - +//side:true->right +// false->left mluOpStatus_t strsm(int batch, int stride, bool upper, bool trans, int m, int n, float* d_a, int ldda, float* d_b, int lddb, mluOpHandle_t handle); -mluOpStatus_t transpose(int batch, int m, int n,float* d_input,float* d_output, mluOpHandle_t handle); +mluOpStatus_t transpose(int batch, int m, int n,float* d_input,float* d_output, mluOpHandle_t handle,mluOpDataType_t type, float* workspace); + +mluOpStatus_t conj_complex(int batch, int m, int n,float* d_input,float* d_output, mluOpHandle_t handle); mluOpStatus_t mlu_cpotrf_rectile(int batch, int stride, int n, int recnb, float* drA, float* diA, int lda, mluOpHandle_t handle); mluOpStatus_t cgemm(int batch, bool trans_a, bool trans_b, int m, int n, int k, float alpha, float beta, float* d_ra, float* d_ia, int lda, int stride_a, float* d_rb, float* d_ib, int ldb, int stride_b, float* d_rc, float* d_ic, int ldc, int stride_c, mluOpHandle_t handle); -mluOpStatus_t complex_malloc(size_t size, float** workspace); +mluOpStatus_t workspace_malloc(size_t size, float** workspace); +// mluOpStatus_t complex_set_half_zero(int batch, int stride, float* d_a, int m, int ld); mluOpStatus_t set_half_zero(int batch,int stride,float* d_a, int lda, int m, mluOpHandle_t handle); diff --git a/kernels/cholesky/cholesky_union1.mlu b/kernels/cholesky/cholesky_union1.mlu index 0f06250e8..39807bb4d 100644 --- a/kernels/cholesky/cholesky_union1.mlu +++ b/kernels/cholesky/cholesky_union1.mlu @@ -1,5 +1,43 @@ #include "cholesky.h" +#include + + __nram__ uint8_t nram_buffer[MAX_NRAM_SIZE]; + +__mlu_func__ +float recur_add(float* input, int length) +{ + if(length == 1) + { + return input[0]; + } + else + { + int half_length; + 
half_length = length / 2; + float sum1 = recur_add(input, half_length); + float sum2 = recur_add(input + half_length, length - half_length); + input[0] = sum1+sum2; + return sum1 + sum2; + } + +} + +__mlu_func__ +float kahansum(float* input, int length) +{ + float sum = 0.0; + float c = 0.0; + for(int i = 0; i < length; i++) + { + float y = input[i] - c; + float t = sum + y; + c = (t - sum) - y; + sum = t; + } + input[0] = sum; + return sum; +} __mlu_func__ void sgemm_fixwidth_device(int m, int k, @@ -8,17 +46,20 @@ void sgemm_fixwidth_device(int m, int k, { int id = taskId % 4; - int span = POTF_NB; + int span = POTF_NB; + - __nram__ float rC[M * POTF_NB/TASK_NUM ]; + __nram__ float rC[M * POTF_NB/TASK_NUM ]; __nram__ float rA[M * POTF_NB/TASK_NUM ]; __nram__ float rp[M * POTF_NB/TASK_NUM ]; __nram__ float rB[POTF_NB * POTF_NB]; - - - if(id*span0) { + float* temp_result = nram_src+span*span; + float* temp_result2 = temp_result + span * span; + float* temp_a = nram_src; + float* temp_b = diag+iter*span; + float* local_result = temp_result; + float* local_diag = temp_result2; + + for(int i = 0; i < span; i++) + { + __bang_mul(local_result, temp_a, temp_b, iter); + __bang_mul(local_diag, diag+i*span, temp_b,iter); + local_result = local_result + span; + temp_a = temp_a + span; + local_diag = local_diag + span; + } + + if(iter>1) + { + local_result = temp_result; + local_diag = temp_result2; + for(int i = 0; i < span; i++) + { + kahansum(local_result,iter); + kahansum(local_diag,iter); + local_result = local_result + span; + local_diag = local_diag + span; + } + } + for(int i = 0; i < span; i++) + { + nram_src[i*span+iter] -= temp_result[i*span]; + diag[i*span+iter] -= temp_result2[i*span]; + } - + } + + if(factor<0) + { + if(id == 0) + { + printf("factor:%.3f\n",factor); + printf("iter:%d\n",iter); + } + + } + factor = diag[iter*POTF_NB+iter]; + __nram__ float temp[1]; + __bang_rsqrt(temp,diag+iter*POTF_NB+iter,1); + factor = temp[0]; + + + for(int i = 0; i < span; i++) + { + nram_src[i*POTF_NB+iter] *= factor; diag[i*POTF_NB+iter] *= factor; - } __sync(); - - - for(int i = iter + 1; i < POTF_NB; i++) - { - for(int j = 0; j < span; j++) - { - diag[j * POTF_NB + i ] -= diag[i*POTF_NB+iter] * diag[j * POTF_NB + iter]; - nram_src[j * POTF_NB + i ] -= diag[i*POTF_NB+iter] * nram_src[j * POTF_NB + iter]; - } - } + } __sync_cluster(); - + + if(id*span 0; int span = (remain > POTF_NB||remain <= 0) ? POTF_NB : remain; float *rA = (float*)nram_buffer + id * NB * NB * 4; + float *rB = rA + NB * NB; + float *rC = rB + NB * NB; + float* rp = rC + NB * NB; + int span_b = POTF_NB > m ? 
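
recur_add and kahansum above are there to keep the long dot-product reductions in the panel factorization from drifting: pairwise (recursive) splitting and Kahan's compensated summation both bound the rounding error far more tightly than a naive left-to-right float loop. A small host-side demonstration of the difference, using an assumed toy input of ten million copies of 0.1f:

    #include <cstdio>

    // In real arithmetic the sum is 1e6. The naive float loop drifts well away
    // from that because later additions lose low-order bits, while the Kahan
    // loop (same recurrence as kahansum() above) stays at 1e6 to float precision.
    int main() {
      const int n = 10000000;
      float naive = 0.0f;
      for (int i = 0; i < n; ++i) naive += 0.1f;

      float sum = 0.0f, c = 0.0f;          // Kahan: carry the lost low-order bits
      for (int i = 0; i < n; ++i) {
        float y = 0.1f - c;
        float t = sum + y;
        c = (t - sum) - y;
        sum = t;
      }
      std::printf("naive = %.1f, kahan = %.1f\n", naive, sum);
      return 0;
    }
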
m : POTF_NB; + + + __memset_nram(rC,span_b*span,(float)ZERO); + if(if_execute) { if(k>0) @@ -143,14 +242,20 @@ void sgemm_anywidth_device(int m, int k, __memcpy(rA,A0+id*POTF_NB*lda,k*sizeof(float),SRAM2NRAM,NB*sizeof(float),lda*sizeof(float),span-1); } __memcpy(rp,sC+id*POTF_NB*lda,span_b*sizeof(float),SRAM2NRAM,span_b*sizeof(float),lda*sizeof(float),span-1); + } if(k>0) { - __memcpy(rB,A0,k*sizeof(float),SRAM2NRAM,NB*sizeof(float),lda*sizeof(float),span_b-1); - + + __memcpy(rB,A0,k*sizeof(float),SRAM2NRAM,NB*sizeof(float),lda*sizeof(float),span_b-1); - } + + } + + + + __sync_cluster(); for(int i = 0; i < span; i++) @@ -163,6 +268,7 @@ void sgemm_anywidth_device(int m, int k, } } } + __bang_sub(rp,rp,rC,span_b * span); __sync_cluster(); @@ -179,8 +285,15 @@ void sgemm_anywidth_device(int m, int k, { __memcpy(sC+(id*POTF_NB*lda),rp,span_b*sizeof(float),NRAM2SRAM,lda*sizeof(float),span_b*sizeof(float),span-1); } + + + + + + } + static __mlu_func__ void spotf2_sminout_anysize_device(int m, float *A, int lda) { float factor; @@ -197,14 +310,12 @@ static __mlu_func__ void spotf2_sminout_anysize_device(int m, float *A, int lda) __sync_cluster(); for(int i = 0; i < span; i++) { - - if(if_execute) - A[i*lda+iter+id*POTF_NB*lda] *= factor; + A[i*lda+iter+id*POTF_NB*lda] *= factor; } __sync_cluster(); - + if(if_execute) { for(int i = iter + 1; i < iter_num; i++) @@ -231,35 +342,56 @@ __mlu_func__ void spotf2_smlpout_fixwidth_device(const int m, float *A0, float * float* sdata_A = shared_data; float* sdata_B = shared_data + m *POTF_NB/TASK_NUM * 4; + + sgemm_fixwidth_device(m, localstep, A0, lda, sdata_A, sdata_B); + + __sync_cluster(); + + + spotf2_sminout_fixsize_device(m, sdata_A, POTF_NB); + __sync_cluster(); + int span = POTF_NB; + if(id==0) - { + { for(int i = 0; i < span; i++) { - __memcpy(A+(i*lda),sdata_A+i*POTF_NB,(i+1)*sizeof(float),SRAM2LDRAM); - + + __memcpy(A+(i*lda),sdata_A+i*POTF_NB,(i+1)*sizeof(float),SRAM2LDRAM); } } else if(id*span < m) { __memcpy(A+(id*POTF_NB*lda),sdata_A+coreId*POTF_NB*POTF_NB,POTF_NB*sizeof(float),SRAM2LDRAM,lda*sizeof(float),POTF_NB*sizeof(float),span-1); - } + __sync_cluster(); + + + } __mlu_func__ void spotf2_smlpout_anywidth_device(const int m, float *A0, float *A, int lda, const int localstep, const int gbstep) { + sgemm_anywidth_device(m, localstep, A0, lda, A, nullptr); + + + spotf2_sminout_anysize_device(m, A, lda); __sync_cluster(); + + + + } @@ -267,11 +399,12 @@ __mlu_global__ void spotf2_smlpin_anywidth_kernel(int batch, int stride, bool tr { int id = taskId; - float* orignA = dA; - int batch_id = id / 4; - if(batch_id >= batch) - return; - dA = orignA + batch_id * stride; + float* orignA = dA; + + int batch_id = id / 4; + if(batch_id >= batch) + return; + dA = orignA + batch_id * stride; __mlu_shared__ float shared_data[NB * NB]; @@ -288,7 +421,7 @@ __mlu_global__ void spotf2_smlpin_anywidth_kernel(int batch, int stride, bool tr if(id == 0) { __memcpy(shared_data,dA,m*sizeof(float),GDRAM2SRAM,NB*sizeof(float),lda*sizeof(float),m-1); - + } __sync_cluster(); @@ -304,7 +437,7 @@ __mlu_global__ void spotf2_smlpin_anywidth_kernel(int batch, int stride, bool tr __memcpy(dA,shared_data,m*sizeof(float),SRAM2GDRAM,lda*sizeof(float),NB*sizeof(float),m-1); } __sync_cluster(); - } + } } @@ -314,7 +447,6 @@ void small_sgemm_batch(int m, int k, float* A0, const int lda,int width, float* dst, float* nram_remain) { - int ldk = k; int ldm = m; float* src1 = nram_remain; @@ -359,14 +491,12 @@ void small_sgemm_batch(int m, int k, __mlu_func__ void 
small_sminout_batch(int m, int width, float *dst, float *nram_remain, int lda) { float factor; - float* diag = dst; for(int iter = 0; iter < width; iter++) { factor=sqrt(diag[iter*width+iter]); factor = 1.0/factor; - for(int i = 0; i < m; i ++) { dst[i*width+iter] *= factor; @@ -378,26 +508,31 @@ __mlu_func__ void small_sminout_batch(int m, int width, float *dst, float *nram_ { dst[j * width + i ] -= dst[i*width+iter] * dst[j * width + iter]; - } } __sync(); - + + + + } __sync(); - + + } -__mlu_func__ void smlpout_batch(const int m, float *A0, float *A, int lda, const int localstep, int width) +__mlu_func__ +void smlpout_batch(const int m, float *A0, float *A, int lda, const int localstep, int width) { float* dst = (float*)nram_buffer; float* nram_remain = dst + m * m; + small_sgemm_batch(m, localstep, A0, lda, width, dst, nram_remain); __sync(); - + small_sminout_batch(m, width, dst, nram_remain, width); __sync(); @@ -433,6 +568,7 @@ __mlu_global__ void spotf2_batch_kernel(int batch, int stride, int m, float *dA, span = std::min(width, m - i); smlpout_batch(m-i,dA+i*lda,dA+i*lda+i,lda,i,span); } + } mluOpStatus_t mlu_spotf2_lpin(int batch, int stride, bool trans,bool uplo, int n, int ldda, float* dA, int gbstep, cnrtQueue_t queue) @@ -473,6 +609,7 @@ mluOpStatus_t mlu_spotf2_lpin(int batch, int stride, bool trans,bool uplo, int n spotf2_smlpin_anywidth_kernel<<>>(batch, stride, trans, n, dA, ldda, 0,gbstep)); } + return MLUOP_STATUS_SUCCESS; } @@ -490,7 +627,7 @@ __mlu_entry__ void mlu_strsm_rectile_batch_kernel( float* orignB = dB; dA = orignA + batch_id * stride; dB = orignB + batch_id * stride; - + int span = n; int start = 0; @@ -505,12 +642,14 @@ __mlu_entry__ void mlu_strsm_rectile_batch_kernel( float temp_b = 0, factor = 0; + + __memcpy_async(sA,dA,sizeof(float),GDRAM2NRAM); __memcpy(rBp,OFFSET_B_ROW(dB,start,0),sizeof(float),GDRAM2NRAM,sizeof(float), ldb * sizeof(float), span - 1); - __sync(); - + __sync(); + if(trans) { __memcpy_async(rA,sA,(1)*sizeof(float),NRAM2NRAM); @@ -522,8 +661,6 @@ __mlu_entry__ void mlu_strsm_rectile_batch_kernel( factor = 1.0 / rA[0]; for(int i = 0; i < span; i++) { - - rB[i*calc_length] *= factor; } @@ -540,11 +677,8 @@ __mlu_entry__ void mlu_strsm_rectile_batch_kernel( factor = 1.0 / rA[iter]; for(int i = 0; i < span; i++) { - __bang_mul(rC+i*calc_length,rA,rB+i*calc_length,iter); - temp_b = 0; - for(int j = 0; j < iter; j++) { temp_b += rC[i*calc_length+j]; @@ -562,11 +696,9 @@ __mlu_entry__ void mlu_strsm_rectile_batch_kernel( factor = 1.0 / rA[m-1]; for(int i = 0; i < span; i++) { - __bang_mul(rC+i*calc_length,rA,rB+i*calc_length,m-1); temp_b = 0; - for(int j = 0; j < m-1; j++) { temp_b += rC[i*calc_length+j]; @@ -577,6 +709,7 @@ __mlu_entry__ void mlu_strsm_rectile_batch_kernel( } __sync(); + __memcpy(OFFSET_B_ROW(dB,start,0),rB,calc_length*sizeof(float),NRAM2GDRAM,ldb * sizeof(float), calc_length * sizeof(float), span - 1); __sync(); @@ -599,8 +732,9 @@ __mlu_entry__ void mlu_strsm_rectile_kernel( float* orignA = dA; float* orignB = dB; dA = orignA + batch_id * stride; - dB = orignB + batch_id * stride; + dB = orignB + batch_id * stride; + int span = n / 4; int start = id * span; if(id == 3) @@ -616,22 +750,27 @@ __mlu_entry__ void mlu_strsm_rectile_kernel( __nram__ float rA[8*POTF_NB]; int calc_length = (8 * POTF_NB) > m ? 
m : (8 * POTF_NB); __memset_nram(rB,POTF_NB*calc_length,(float)ZERO); - __sramset(sA,calc_length*calc_length,0); float temp_b = 0, factor = 0; float sum = 0.0; float c = 0.0; float t = 0.0; + sum = 0.0; + c = 0.0; + t =0.0; + temp_b = 0; + factor = 0; if(id == 0) { - __memcpy_async(sA,dA,sizeof(float),LDRAM2SRAM); + __memcpy(sA,dA,sizeof(float),LDRAM2SRAM); } if(if_execute) - __memcpy(rBp,OFFSET_B_ROW(dB,start,0),sizeof(float),LDRAM2NRAM,sizeof(float), ldb * sizeof(float), span - 1); - __sync_cluster(); + __memcpy(rBp,OFFSET_B_ROW(dB,start,0),sizeof(float),LDRAM2NRAM,sizeof(float), ldb * sizeof(float), span - 1); + __sync_cluster(); + if(trans) { @@ -648,8 +787,6 @@ __mlu_entry__ void mlu_strsm_rectile_kernel( factor = 1.0 / rA[0]; for(int i = 0; i < span; i++) { - - rB[i*calc_length] *= factor; } @@ -670,19 +807,17 @@ __mlu_entry__ void mlu_strsm_rectile_kernel( factor = 1.0 / rA[iter]; for(int i = 0; i < span; i++) { - __bang_mul(rC+i*calc_length,rA,rB+i*calc_length,iter); - temp_b = 0; sum = 0.0; c = 0.0; - t = 0.0; - + t = 0.0; + for(int j = 0; j < iter; j++) { - temp_b = rC[i*calc_length+j] - c; - t = sum + temp_b; - c = (t - sum) - temp_b; + temp_b = rC[i*calc_length+j] - c; + t = sum + temp_b; + c = (t - sum) - temp_b; sum = t; } temp_b = sum; @@ -700,18 +835,12 @@ __mlu_entry__ void mlu_strsm_rectile_kernel( factor = 1.0 / rA[m-1]; for(int i = 0; i < span; i++) { - __bang_mul(rC+i*calc_length,rA,rB+i*calc_length,m-1); sum = 0.0; c = 0.0; t = 0.0; temp_b = 0; - - - - - for(int j = 0; j < m-1; j++) { temp_b = rC[i*calc_length+j] - c; @@ -736,7 +865,6 @@ __mlu_entry__ void mlu_strsm_rectile_kernel( } - mluOpStatus_t strsm_rectile(int batch, int stride, bool upper, bool trans, int m, int n, float *d_a, int lda, float *d_b, int lddb, cnrtQueue_t queue) { cnrtDim3_t dim; @@ -790,7 +918,6 @@ __mlu_global__ void add_c_batch(int batch, int stride, float beta, float *d_c, float* src,int ldc, int ldsrc, int m, int n) { - int id = taskId; int batch_id = id; if(batch_id >= batch) @@ -802,7 +929,9 @@ void add_c_batch(int batch, int stride, float beta, float *d_c, float* src,int l if (beta == 0.0f) - { + { + + __memcpy(d_c,src,n*sizeof(float),GDRAM2GDRAM,ldc*sizeof(float),ldsrc*sizeof(float),m-1); return; } @@ -835,7 +964,6 @@ __mlu_global__ void add_c(int batch, int stride, float beta, float *d_c, float* src,int ldc, int ldsrc, int m, int n) { - int id = taskId; int ipu_per_cluster = 4; int batch_id = id / ipu_per_cluster; @@ -846,6 +974,8 @@ void add_c(int batch, int stride, float beta, float *d_c, float* src,int ldc, in float* orignSrc = src; d_c = orignC + batch_id * stride; src = orignSrc + batch_id * m*n; + + __mlu_shared__ uint8_t sram_buffer[MAX_SRAM_SIZE]; if (beta == 0.0f) @@ -887,7 +1017,6 @@ void add_c(int batch, int stride, float beta, float *d_c, float* src,int ldc, in int32_t align_num = NFU_ALIGN_SIZE / sizeof(float); - int32_t data_nram_num = MAX_NRAM_SIZE / sizeof(float) / 2 / align_num * align_num; float *a_nram = (float *)nram_buffer; @@ -919,7 +1048,7 @@ void add_c(int batch, int stride, float beta, float *d_c, float* src,int ldc, in if (id == 0) { __memcpy(d_c,sram_buffer,n*sizeof(float),SRAM2GDRAM,ldc*sizeof(float),n*sizeof(float),m-1); - + } __sync_cluster(); @@ -931,53 +1060,46 @@ mluOpStatus_t sgemm(int batch, bool trans_a, bool trans_b, int m, int n, int k, { if(k==0) return MLUOP_STATUS_SUCCESS; - int matmul_is_transA = trans_a; - int matmul_is_transB = trans_b; - - - int matmul_requested_algo = 1; - int matmul_recieved_algo = 0; - size_t tempSize_matmulExtra = 0; - 
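
My reading of the mlu_strsm_rectile_batch_kernel / mlu_strsm_rectile_kernel pair above: each row of B is solved independently against the small lower-triangular block by forward substitution, i.e. the kernels overwrite B with B * L^-T, which is the A21 <- A21 * L11^-T step of the panel update. A plain single-matrix reference of that operation (trsm_right_lt_ref is a hypothetical name, row major):

    // Given lower-triangular L (m x m, leading dimension ldl) and B
    // (rows x m, leading dimension ldb), overwrite B with B * L^-T by
    // solving X * L^T = B one row at a time.
    void trsm_right_lt_ref(const float* l, int ldl, float* b, int ldb,
                           int rows, int m) {
      for (int i = 0; i < rows; ++i) {
        for (int j = 0; j < m; ++j) {
          float s = b[i * ldb + j];
          for (int k = 0; k < j; ++k)
            s -= b[i * ldb + k] * l[j * ldl + k];  // already-solved entries of row i
          b[i * ldb + j] = s / l[j * ldl + j];
        }
      }
    }
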
int matmul_computetype = MLUOP_DTYPE_FLOAT; - float *workspace; - int matmul_use_beta = beta == 0.0f ? 0 : 1; - + + int32_t batch_size_arr[1] = {batch}; + int64_t stride_a_arr[1] = {stride_a}; + int64_t stride_b_arr[1] = {stride_b}; + int64_t stride_c_arr[1] = {stride_c}; + + std::string api_name = "Cholesky"; + cnrtQueue_t queue; mluOpGetQueue(handle,&queue); - - mluOpTensorDescriptor_t matmul_a_desc, matmul_b_desc, matmul_c_desc; - cnnlMatMulDescriptor_t matmul_desc; - cnnlMatMulHeuristicResult_t heuristic_result; - cnnlMatMulAlgo_t matmul_algo; + cnnlStrideBatchMatMulAlgo_t algo; + CALL_CNNL(cnnlStrideBatchMatMulAlgoCreate(&algo)); - std::string api_name = "Cholesky"; + cnnlStrideBatchMatMulHeuristicResult_t heuristic_result; + CALL_CNNL(cnnlCreateStrideBatchMatMulHeuristicResult(&heuristic_result)); - CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&matmul_a_desc)); - CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&matmul_b_desc));; - CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&matmul_c_desc)); + - CALL_CNNL(cnnlMatMulDescCreate(&matmul_desc)); - CALL_CNNL(cnnlMatMulAlgoCreate(&matmul_algo)); - CALL_CNNL(cnnlCreateMatMulHeuristicResult(&heuristic_result)); + cnnlStrideBatchMatMulDescriptor_t stride_bmm_desc; + CALL_CNNL(cnnlStrideBatchMatMulDescCreate(&stride_bmm_desc)); + int32_t allow_tf32 = 0, max_batch_dim = 1; + CALL_CNNL(cnnlSetStrideBatchMatMulDescAttr(stride_bmm_desc, CNNL_STRIDE_BMM_ALLOW_TF32, + &(allow_tf32), sizeof(int32_t))); + CALL_CNNL(cnnlSetStrideBatchMatMulDescAttr(stride_bmm_desc, CNNL_STRIDE_BMM_MAX_BATCH_DIM, + &(max_batch_dim), sizeof(int32_t))); - CALL_CNNL(cnnlSetMatMulDescAttr(matmul_desc, CNNL_MATMUL_DESC_TRANSA, - &matmul_is_transA, sizeof(int32_t))); - CALL_CNNL(cnnlSetMatMulDescAttr(matmul_desc, CNNL_MATMUL_DESC_TRANSB, - &matmul_is_transB, sizeof(int32_t))); - CALL_CNNL(cnnlSetMatMulDescAttr(matmul_desc, CNNL_MATMUL_DESC_COMPUTE_TYPE, - &matmul_computetype, sizeof(int32_t))); + + mluOpTensorDescriptor_t matmul_a_desc, matmul_b_desc, matmul_c_desc; - CALL_CNNL(cnnlSetMatMulDescAttr(matmul_desc, CNNL_MATMUL_USE_BETA, - &matmul_use_beta, sizeof(int32_t))); + - CALL_CNNL(cnnlSetMatMulDescAttr(matmul_desc, CNNL_MATMUL_USE_STRIDE, - &lda, sizeof(int32_t))); + CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&matmul_a_desc)); + CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&matmul_b_desc));; + CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&matmul_c_desc)); int32_t matmul_a_shape[2] = {batch, stride_a}; int32_t matmul_b_shape[2] = {batch, stride_b}; - int32_t matmul_c_shape[2] = {batch, m*n}; + int32_t matmul_c_shape[2] = {batch, stride_c}; CHECK_RETURN(api_name, mluOpSetTensorDescriptor( matmul_a_desc, MLUOP_LAYOUT_ARRAY, @@ -989,6 +1111,8 @@ mluOpStatus_t sgemm(int batch, bool trans_a, bool trans_b, int m, int n, int k, matmul_c_desc, MLUOP_LAYOUT_ARRAY, MLUOP_DTYPE_FLOAT, 2, matmul_c_shape)); + + DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_a_desc, cnnl_a_desc); DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_b_desc, cnnl_b_desc); @@ -996,59 +1120,37 @@ mluOpStatus_t sgemm(int batch, bool trans_a, bool trans_b, int m, int n, int k, DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_c_desc, cnnl_d_desc); + int requested_algo_count = 1, return_algo_count = 0; + float *workspace; + size_t workspace_size; - CALL_CNNL(cnnlGetMatMulAlgoHeuristic( - cnnl_handle, matmul_desc, cnnl_a_desc, cnnl_b_desc, cnnl_c_desc, - cnnl_d_desc, nullptr, matmul_requested_algo, 
&heuristic_result, - &matmul_recieved_algo)); - CALL_CNNL(cnnlGetMatMulHeuristicResult(heuristic_result, matmul_algo, - &tempSize_matmulExtra)); + cnnlGetStrideBatchMatMulAlgoHeuristic( + cnnl_handle, stride_bmm_desc, cnnl_a_desc, cnnl_b_desc, cnnl_c_desc, cnnl_d_desc, trans_a, trans_b, false, + &(alpha), &(beta), m, n, k, lda, ldb, ldc, batch_size_arr, stride_a_arr, stride_b_arr, + stride_c_arr, nullptr, requested_algo_count, &heuristic_result, &return_algo_count); + + cnnlGetStrideBatchMatMulHeuristicResult(heuristic_result, &algo, &workspace_size); - CNRT_CHECK(cnrtMalloc((void **)&workspace, batch*m*n*sizeof(float))); + if(workspace_size > 0) + { + CNRT_CHECK(cnrtMalloc((void **)&workspace, workspace_size)); + } + else + { + CNRT_CHECK(cnrtMalloc((void **)&workspace, m*n*sizeof(float))); + } - cnnlStrideBatchMatMul(cnnl_handle, trans_a, trans_b, m, n, k, batch, alpha, cnnl_a_desc, d_a, lda, stride_a, cnnl_b_desc, d_b, ldb, stride_b, 0.0f, cnnl_c_desc, workspace, n, m*n); - - if ( beta == 1.0f || beta == 0.0f) - { - cnrtDim3_t dim; - cnrtFunctionType_t func_type = CNRT_FUNC_TYPE_BLOCK; - dim.y = 1; - dim.z = 1; - int nram_space = 2 * m * n * sizeof(float); - if(batch > 1 && nram_space < MAX_NRAM_SIZE) - { - dim.x = batch; - KERNEL_CHECK(add_c_batch<<>>(batch, stride_c, beta,d_c,workspace,ldc,n,m,n)); + - } - else - { - int carry_batch = batch; - if(batch == 1) - { - func_type = CNRT_FUNC_TYPE_UNION1; - } - else if(batch == 2) - { - func_type = CNRT_FUNC_TYPE_UNION2; - } - else if(batch <= 4) - { - func_type = CNRT_FUNC_TYPE_UNION4; - carry_batch = 4; - } - else - { - func_type = CNRT_FUNC_TYPE_UNION8; - carry_batch = batch < 8 ? 8 : batch; - } - dim.x = carry_batch * 4; - KERNEL_CHECK(add_c<<>>(batch, stride_c, beta,d_c,workspace,ldc,n,m,n)); - } + CALL_CNNL(cnnlStrideBatchMatMul_v2( + cnnl_handle, stride_bmm_desc, algo, trans_a, trans_b, false, m, n, k, batch_size_arr, &(alpha), + cnnl_a_desc, d_a, lda, stride_a_arr, + cnnl_b_desc, d_b, ldb, stride_b_arr, &(beta), cnnl_c_desc, d_c, ldc, + stride_c_arr, cnnl_d_desc, d_c, workspace, workspace_size)); return MLUOP_STATUS_SUCCESS; } @@ -1066,11 +1168,9 @@ void batch_inverse_kernel(int batch, float *d_input, int ld_input, int stride_in d_input = orign_input + batch_id * stride_input; d_output = orign_output + batch_id * stride_output; - float* nram_offset = (float*)nram_buffer; float* nram_src0 = nram_offset; - float* nram_src1 = nram_src0 + m * m; float* nram_src2 = nram_src1 + m * m; float* mul_result = nram_src2 + m; @@ -1080,8 +1180,6 @@ void batch_inverse_kernel(int batch, float *d_input, int ld_input, int stride_in __memset_nram(nram_offset, 4 * m * m, (float)ZERO); - - __memcpy(nram_dst,d_input,m*sizeof(float),GDRAM2NRAM,m*sizeof(float),ld_input*sizeof(float),m-1); float result = 0.0; @@ -1133,12 +1231,10 @@ void inverse_kernel(int batch, float *d_input, int ld_input, int stride_input, f float* orignOutput = d_output; d_input = orignInput + batch_id * stride_input; d_output = orignOutput + batch_id * stride_output; - __mlu_shared__ uint8_t sram_buffer[MAX_SRAM_SIZE]; if (id == 0) { - __memcpy(sram_buffer,d_input,m*sizeof(float),GDRAM2SRAM,m*sizeof(float),ld_input*sizeof(float),m-1); } __sync_cluster(); @@ -1151,7 +1247,6 @@ void inverse_kernel(int batch, float *d_input, int ld_input, int stride_input, f span = m - 3 * span; } float* nram_offset = (float*)nram_buffer + id * 3 * m * m; - float* nram_src1 = nram_offset; float* nram_src2 = nram_src1 + m * m; float* mul_result = nram_src2 + m; @@ -1183,7 +1278,7 @@ void inverse_kernel(int 
batch, float *d_input, int ld_input, int stride_input, f for(int j = 0; j < num; j++) { float temp = 0.0; - + __bang_mul(mul_result,nram_src2,nram_src1+j*height,i); for(int k = 0; k< i; k++) { @@ -1205,10 +1300,10 @@ void inverse_kernel(int batch, float *d_input, int ld_input, int stride_input, f __sync_cluster(); if (id == 0) { - __memcpy(d_output,sram_buffer,m*sizeof(float),SRAM2GDRAM,ld_output*sizeof(float), m*sizeof(float),m-1); } + } @@ -1221,7 +1316,7 @@ __mlu_global__ void set_zero(int batch, int stride, bool upper, int m, float* d_ float* orignC = d_c; d_c = orignC + batch_id * stride; id = taskId % 4; - int span = m/taskDim; + int span = m/4; int pre = id * span; float* start_c = d_c + pre * lddc + pre; float* temp_c = start_c; @@ -1236,7 +1331,7 @@ __mlu_global__ void set_zero(int batch, int stride, bool upper, int m, float* d_ int num = m - pre - i; __ldramset(temp_c+1, num - 1, 0); } - if (id != 3) + if (id != 3&&span > 0) { temp_c = start_c + (span - 1) * lddc + span - 1; int num = m - pre - span + 1; @@ -1244,6 +1339,7 @@ __mlu_global__ void set_zero(int batch, int stride, bool upper, int m, float* d_ } } + mluOpStatus_t strsm(int batch, int stride, bool upper, bool trans, int m, int n, float* d_a, int lda, float* d_b, int ldb, mluOpHandle_t handle) { if(n==0) @@ -1278,7 +1374,7 @@ mluOpStatus_t strsm(int batch, int stride, bool upper, bool trans, int m, int n, DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_a_desc, cnnl_a_desc); DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_b_desc, cnnl_b_desc); DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(info_desc, cnnl_info_desc); - + float* workspace; CNRT_CHECK(cnrtMalloc((void **)&workspace, batch*m*m*sizeof(float))); CNRT_CHECK(cnrtMemset(workspace, 0.0, batch*m*m*sizeof(float))); @@ -1334,7 +1430,6 @@ mluOpStatus_t strsm(int batch, int stride, bool upper, bool trans, int m, int n, sgemm(batch, false,false,m2,m1,m1,1.0f,0.0f,d_a+m1*lda,lda,stride,workspace1,m,m*m,workspace1+m1*m,m,m*m,handle); sgemm(batch, false,false,m2,m2,m1,-1.0f,0.0f,workspace2,m,m*m,workspace1+m1*m,m,m*m,workspace1+m1*m,m,m*m,handle); cnrtQueueSync(queue); - cnnlStrideBatchMatMul(cnnl_handle, false, true, n,m, m, batch, 1.0, cnnl_b_desc, d_b, ldb, stride, cnnl_a_desc, workspace, m, m*m, 0.0f, cnnl_b_desc, d_b, ldb, stride); return MLUOP_STATUS_SUCCESS; @@ -1346,7 +1441,7 @@ mluOpStatus_t set_half_zero(int batch,int stride,float* d_a, int lda, int m, mlu mluOpGetQueue(handle,&queue); cnrtDim3_t dim; cnrtFunctionType_t func_type = CNRT_FUNC_TYPE_UNION1; - dim.x = 4; + dim.x = 4 * batch; dim.y = 1; dim.z = 1; KERNEL_CHECK(set_zero<<>>(batch, stride, false, m, d_a,lda)); @@ -1401,30 +1496,23 @@ mluOpStatus_t mlu_spotrf_rectile(int batch, int stride, bool trans, bool uplo, i if(n <=recnb) { - mlu_spotf2_lpin(batch, stride, trans, uplo,n,lda,d_A,gbstep,queue); } else { int n1 = n/2; int n2 = n-n1; - mlu_spotrf_rectile(batch,stride,trans,uplo,n1,recnb,OFFSET_ROW(d_A,0,0),lda,gbstep, handle); - - strsm_rectile(batch, stride, uplo,trans,n1,n2,OFFSET_ROW(d_A,0,0),lda,OFFSET_ROW(d_A,n1,0),lda,queue); - ssyrk(batch,stride,uplo,trans,n2,n1,d_A+n1*lda,lda,OFFSET_ROW(d_A,n1,n1),lda,handle); - mlu_spotrf_rectile(batch,stride,trans,uplo,n2,recnb,OFFSET_ROW(d_A,n1,n1),lda,gbstep+n1,handle); - - } - + + return MLUOP_STATUS_SUCCESS; } - -mluOpStatus_t transpose(int batch, int m, int n, float* d_input,float* d_output, mluOpHandle_t handle) +// m * n +mluOpStatus_t transpose(int batch, int m, int n, float* d_input,float* d_output, mluOpHandle_t handle,mluOpDataType_t type, 
float* workspace) { if(m==0) return MLUOP_STATUS_SUCCESS; @@ -1443,11 +1531,11 @@ mluOpStatus_t transpose(int batch, int m, int n, float* d_input,float* d_output, CHECK_RETURN(api_name, mluOpSetTensorDescriptor( trans_input_desc, MLUOP_LAYOUT_ARRAY, - MLUOP_DTYPE_FLOAT, 3, transpose_input_shape)); + type, 3, transpose_input_shape)); CHECK_RETURN(api_name, mluOpSetTensorDescriptor( trans_output_desc, MLUOP_LAYOUT_ARRAY, - MLUOP_DTYPE_FLOAT, 3, transpose_output_shape)); + type, 3, transpose_output_shape)); int permute[3] = {0, 2, 1}; @@ -1461,26 +1549,21 @@ mluOpStatus_t transpose(int batch, int m, int n, float* d_input,float* d_output, CALL_CNNL(cnnlSetTransposeDescriptor(cnnl_trans_desc, input_dim, permute)); - size_t *size = NULL; - size = (size_t*)malloc(sizeof(size_t)); + size_t size=0; - CALL_CNNL(cnnlGetTransposeWorkspaceSize(cnnl_handle, cnnl_in_desc, cnnl_trans_desc, size)); - + CALL_CNNL(cnnlGetTransposeWorkspaceSize(cnnl_handle, cnnl_in_desc, cnnl_trans_desc, &size)); - float *workspace = NULL; - if(*size > 0ul) + if(size > 0ul) { - printf("start malloc\n"); - CNRT_CHECK(cnrtMalloc((void **)&workspace, *size)); - printf("transpose2 need size: %zu\n",*size); + + printf("transpose2 need size: %zu\n",size); } CALL_CNNL(cnnlTranspose_v2(cnnl_handle, cnnl_trans_desc, cnnl_in_desc, d_input, cnnl_out_desc, d_output, - workspace, *size)); - + workspace, size)); return MLUOP_STATUS_SUCCESS; } diff --git a/kernels/cholesky/complex_cholesky_union1.mlu b/kernels/cholesky/complex_cholesky_union1.mlu index 6da17bad2..9bb8f448b 100644 --- a/kernels/cholesky/complex_cholesky_union1.mlu +++ b/kernels/cholesky/complex_cholesky_union1.mlu @@ -25,11 +25,11 @@ void small_cgemm(int m,int k, float* rB = ip + CPOTF_NB *CREC_NB; float* iB = rB + CPOTF_NB *CREC_NB; - float *srB = sram_buffer; - float *siB = srB + CPOTF_NB * CREC_NB; + float *srB = sram_buffer; //srB:shared_real_B + float *siB = srB + CPOTF_NB * CREC_NB; //siB:shared_imag_B float* rdst = dst; - float* idst = rdst + span*CPOTF_NB; + float* idst = rdst + CPOTF_NB*CPOTF_NB; int total_length = k + width; int loop_width = CPOTF_NB; @@ -90,6 +90,7 @@ void small_cgemm(int m,int k, __bang_sub(rp,rp,rC,CPOTF_NB * span); __bang_sub(ip,ip,iC,CPOTF_NB * span); + if(if_execute) { __memcpy(rdst,rp,span*CPOTF_NB*sizeof(float),NRAM2NRAM); @@ -98,7 +99,7 @@ void small_cgemm(int m,int k, if(id == 0) { __memcpy(sram_buffer,rp,span*CPOTF_NB*sizeof(float),NRAM2SRAM); - __memcpy(sram_buffer+span*CPOTF_NB,ip,span*CPOTF_NB*sizeof(float),NRAM2SRAM); + __memcpy(sram_buffer+CPOTF_NB*CPOTF_NB,ip,span*CPOTF_NB*sizeof(float),NRAM2SRAM); } __sync_cluster(); } @@ -114,45 +115,57 @@ void small_cminout(int m, int width, int finish = id * CPOTF_NB; int remain = m - finish; bool if_execute = remain > 0; - int span = (remain > CPOTF_NB||remain <= 0) ? CPOTF_NB : remain; + int span = 2; + span = (remain > CPOTF_NB||remain <= 0) ? 
CPOTF_NB : remain; float *rdst = dst; - float *idst = dst + span*CPOTF_NB; + float *idst = dst + CPOTF_NB*CPOTF_NB; float *rdiag = idst + CPOTF_NB *CREC_NB; float *idiag = rdiag + CPOTF_NB*CPOTF_NB; - float a1,b1,a2,b2,a3,b3; + if(if_execute) { - __memcpy(rdiag,sram_buffer,width*CPOTF_NB*COMPLEX_TYPE_SIZE,SRAM2NRAM); - __memcpy(idiag,sram_buffer+CPOTF_NB*CPOTF_NB,width*CPOTF_NB*COMPLEX_TYPE_SIZE,SRAM2NRAM); + __memcpy(rdiag,sram_buffer,width*CPOTF_NB*sizeof(float),SRAM2NRAM); + __memcpy(idiag,sram_buffer+CPOTF_NB*CPOTF_NB,width*CPOTF_NB*sizeof(float),SRAM2NRAM); + for(int iter = 0; iter < width; iter++) { factor = sqrt(rdiag[(iter * CPOTF_NB+iter)]); factor = 1.0/factor; - for(int i = 0; i #include "cholesky.h" -// #include "kernels/kernel_wrapper/export_statement.h" namespace mluoptest { @@ -44,9 +43,9 @@ void set_matrix_zero(float*A, bool upper, bool trans_, int n_, int ldda_, mluOpD { if(trans_) { - for (int i = 0; i < n_; i++) + for (long int i = 0; i < n_; i++) { - for (int j = 0; j < ldda_; j++) + for (long int j = 0; j < ldda_; j++) { if(upper) { @@ -118,23 +117,30 @@ void set_matrix_zero(float*A, bool upper, bool trans_, int n_, int ldda_, mluOpD } -void trans_mul(float*A, float*C, int lda,bool upper_, bool trans_, int n_, int ldda_, mluOpDataType_t type_) +void trans_mul(float*A, float*C, int lda,bool upper_, bool trans_, int n_, int ldda_, mluOpDataType_t type_, bool diag_add) { if(trans_) { - for(int i = 0; i = i) || (upper_==true && j <= i))) - else + if(j == i && diag_add) + A[j+i*lda] = 1.0; + + } + + else if(type_ == MLUOP_DTYPE_COMPLEX_FLOAT && ((upper_==false && j >= i) || (upper_==true && j >= i))) { A[j*lda*2+i*2] = 0.0; A[j*lda*2+i*2+1] = 0.0; + if(j == i&& diag_add) + A[j*lda*2+i*2] = 1.0; } - for(int k = 0; k <=i; k++) + for(long int k = 0; k <=i; k++) { if(upper_==false) { @@ -158,22 +164,36 @@ void trans_mul(float*A, float*C, int lda,bool upper_, bool trans_, int n_, int l } else { - if(j > i) - continue; - else + if(type_ == MLUOP_DTYPE_FLOAT) { - if(type_ == MLUOP_DTYPE_FLOAT) + if(j > i) + continue; + else + { A[i+j*lda] += (C[k*lda+i]*C[k*lda+j]); + } + } + else + { + if(j < i) + continue; else { - A[(i+j*lda)*2] += (C[(k*lda+i)*2]*C[(k*lda+j)*2]+C[(k*lda+i)*2+1]*C[(k*lda+j)*2+1]); - A[(i+j*lda)*2+1] += (-C[(k*lda+i)*2]*C[(k*lda+j)*2+1]+C[(k*lda+i)*2+1]*C[(k*lda+j)*2]); + A[(i+j*lda)*2] += (C[(k+i*lda)*2]*C[(k+j*lda)*2]+C[(k+i*lda)*2+1]*C[(k+j*lda)*2+1]); + A[(i+j*lda)*2+1] += (C[(k+i*lda)*2]*C[(k+j*lda)*2+1]-C[(k+i*lda)*2+1]*C[(k+j*lda)*2]); + } + if(type_ != MLUOP_DTYPE_FLOAT && j != i) + { + A[(j+i*lda)*2] = A[(i+j*lda)*2]; + A[(j+i*lda)*2+1] = -A[(i+j*lda)*2+1]; } } + + } } - if(type_ != MLUOP_DTYPE_FLOAT &&((upper_==false && j > i) || (upper_==true && j < i))) + if(type_ != MLUOP_DTYPE_FLOAT &&((upper_==false && j > i) || (upper_==true && j > i))) { A[(j+i*lda)*2] = A[(i+j*lda)*2]; A[(j+i*lda)*2+1] = -A[(i+j*lda)*2+1]; @@ -211,6 +231,67 @@ void trans_mul(float*A, float*C, int lda,bool upper_, bool trans_, int n_, int l } } +void fill_zero(float*A, bool upper_, int batch_, int n_, int ldda_, mluOpDataType_t type_,bool if_conj) +{ + int stride = n_ * ldda_; + if(type_ == MLUOP_DTYPE_FLOAT) + { + } + else + { + stride *= 2; + } + for(long int i = 0; i < batch_;i++) + { + for(long int j = 0; j < n_; j++) + { + for(long int h = 0; h < ldda_; h++) + { + if(j==h) + { + continue; + } + else if(jgetProtoNode()->input(0); auto input_shape = input_tensor.shape(); + auto base_line_out = cpu_fp32_output_[0]; upper_ = parser_->getProtoNode()->cholesky_param().upper(); 
int dim_size = input_shape.dims_size(); type_ = input_desc_->dtype; @@ -300,7 +383,7 @@ void CholeskyExecutor::prepareComputeParam() int dim = input_desc_->dim; stride_ = (input_desc_->strides)[dim-1]; ldda_ = input_desc_->dims[2]; - printf("batch_size:%d,n:%d,lda:%d,stride:%d,upper:%d,trans:%d\n",batch_size_,n_,ldda_,stride_,upper_,trans_); + printf("batch_size:%ld,n:%d,lda:%d,stride:%d,upper:%d,trans:%d\n",batch_size_,n_,ldda_,stride_,upper_,trans_); int size = input_desc_->dims[1]; @@ -314,35 +397,76 @@ void CholeskyExecutor::prepareComputeParam() printf("data vector length : %ld\n",data_vector_.size()); } -// printf("matrix random:\n"); -// print_matrix(batch_size_, dev_a,ldda_,trans_,n_,ldda_,type_); - std::memcpy(dev_c,dev_a,type_size_*n_*ldda_); - set_matrix_zero((float*)dev_c,upper_,trans_,n_,ldda_,type_); - trans_mul(dev_a,dev_c,ldda_,upper_,trans_,n_,ldda_,type_); - - if(dim_size == 3) + if(batch_size_ > 16 && n_ > 2000) + { + std::memcpy(dev_c,dev_a,16*type_size_*n_*ldda_); + std::memcpy(dev_c+16*type_size_/4*n_*ldda_,dev_a+16*type_size_/4*n_*ldda_,(batch_size_-16)*type_size_*n_*ldda_); + } + else + { + std::memcpy(dev_c,dev_a,batch_size_*type_size_*n_*ldda_); + } + if(parser_->device() == CPU) { - for(int i = 1; i < batch_size_;i++) + for(long int i = 0; i < batch_size_;i++) + { + if(type_ == MLUOP_DTYPE_FLOAT) + set_matrix_zero(dev_c+i*n_*ldda_,false,trans_,n_,ldda_,type_); + else + set_matrix_zero(dev_c+i*n_*ldda_*2,false,trans_,n_,ldda_,type_); + } + for(long int i = 0; i < batch_size_;i++) { - std::memcpy(dev_a+(i*n_*ldda_)*type_size_/4,dev_a,type_size_*n_*ldda_); - std::memcpy(dev_c+(i*n_*ldda_)*type_size_/4,dev_c,type_size_*n_*ldda_); + if(type_ == MLUOP_DTYPE_FLOAT) + { + trans_mul(dev_a+i*n_*ldda_,dev_c+i*n_*ldda_,ldda_,false,trans_,n_,ldda_,type_,true); + fill_zero(dev_a,false,batch_size_,n_,ldda_,type_,false); + } + else + { + trans_mul(dev_a+i*n_*ldda_*2,dev_c+i*n_*ldda_*2,ldda_,false,trans_,n_,ldda_,type_,true); + fill_zero(dev_a,false,batch_size_,n_,ldda_,type_,true); + } } } -// printf("matrix A:\n"); -// print_matrix(batch_size_,dev_a,ldda_,trans_,n_,ldda_,type_); -// printf("matrix C:\n"); -// print_matrix(batch_size_,dev_c,ldda_,trans_,n_,ldda_,type_); - GTEST_CHECK(CNRT_RET_SUCCESS == + + + + + if(batch_size_>16) + { + GTEST_CHECK(CNRT_RET_SUCCESS == + cnrtMemcpy(dev_d, dev_a, type_size_*n_*ldda_*16, CNRT_MEM_TRANS_DIR_HOST2DEV)); + GTEST_CHECK(CNRT_RET_SUCCESS == + cnrtMemcpy(dev_d+16*type_size_/4*n_*ldda_, dev_a+16*type_size_/4*n_*ldda_, type_size_*n_*ldda_*(batch_size_-16), CNRT_MEM_TRANS_DIR_HOST2DEV)); + } + else + { + GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMemcpy(dev_d, dev_a, type_size_*n_*ldda_*batch_size_, CNRT_MEM_TRANS_DIR_HOST2DEV)); - float* cpu_a = cpu_fp32_input_[0]; - std::memcpy(cpu_a,dev_a,type_size_*n_*ldda_); + } + + if(parser_->device() == CPU) + { + float* cpu_a = cpu_fp32_input_[0]; + if(batch_size_ > 16 && n_ > 2000) + { + std::memcpy(cpu_a,dev_a,16*type_size_*n_*ldda_); + std::memcpy(cpu_a+16*type_size_/4*n_*ldda_,dev_a+16*type_size_/4*n_*ldda_,(batch_size_-16)*type_size_*n_*ldda_); + } + else + { + std::memcpy(cpu_a,dev_a,batch_size_*type_size_*n_*ldda_); + } + } + + printf("end prepare compute.\n"); } void CholeskyExecutor::compute() { -// prepareComputeParam(); VLOG(4) <<" CholeskyExecutor compute "; auto input_desc_ = tensor_desc_[0].tensor; @@ -351,42 +475,93 @@ void CholeskyExecutor::compute() { auto h_output = (float*)(data_vector_[1].host_ptr); auto d_intput = (float*)(data_vector_[0].device_ptr); auto d_output = 
(float*)(data_vector_[1].device_ptr); - std::memcpy(h_input,h_output,type_size_*n_*ldda_*batch_size_); - GTEST_CHECK(CNRT_RET_SUCCESS == + if(batch_size_>16) + { + std::memcpy(h_input,h_output,type_size_*n_*ldda_*16); + std::memcpy(h_input+type_size_/4*n_*ldda_*16,h_output+type_size_/4*n_*ldda_*16,type_size_*n_*ldda_*(batch_size_-16)); + } + else + { + std::memcpy(h_input,h_output,type_size_*n_*ldda_*batch_size_); + } + if(batch_size_>16) + { + GTEST_CHECK(CNRT_RET_SUCCESS == + cnrtMemcpy(h_output, d_intput, type_size_*n_*ldda_*16, CNRT_MEM_TRANS_DIR_DEV2HOST)); + GTEST_CHECK(CNRT_RET_SUCCESS == + cnrtMemcpy(h_output+16*type_size_/4*n_*ldda_, d_intput+16*type_size_/4*n_*ldda_, type_size_*n_*ldda_*(batch_size_-16), CNRT_MEM_TRANS_DIR_DEV2HOST)); + } + else + { + GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMemcpy(h_output, d_intput, type_size_*n_*ldda_*batch_size_, CNRT_MEM_TRANS_DIR_DEV2HOST)); -// printf("mlu before cholesky result:\n"); -// print_matrix(batch_size_,h_output,ldda_,trans_,n_,ldda_,type_); + } + interface_timer_.start(); float* workspace = nullptr; size_t size = 0; mluOpGetCholeskyWorkspace(input_desc_,&size,&workspace); - MLUOP_CHECK(mluOpCholesky(handle_,input_desc_,d_intput, output_desc_, d_output, upper_,workspace)); + +MLUOP_CHECK(mluOpCholesky(handle_,input_desc_,d_intput, output_desc_, d_output, upper_,workspace)); + interface_timer_.stop(); - - GTEST_CHECK(CNRT_RET_SUCCESS == + if(batch_size_>16) + { + GTEST_CHECK(CNRT_RET_SUCCESS == + cnrtMemcpy(h_output, d_output, 16*type_size_*n_*ldda_, CNRT_MEM_TRANS_DIR_DEV2HOST)); + GTEST_CHECK(CNRT_RET_SUCCESS == + cnrtMemcpy(h_output+16*type_size_/4*n_*ldda_, d_output+16*type_size_/4*n_*ldda_, (batch_size_-16)*type_size_*n_*ldda_, CNRT_MEM_TRANS_DIR_DEV2HOST)); + } + else + { + GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMemcpy(h_output, d_output, batch_size_*type_size_*n_*ldda_, CNRT_MEM_TRANS_DIR_DEV2HOST)); + } + + if(parser_->device() != CPU ) + { + if(result_mul) + { + for(int i = 0; i < batch_size_;i++) + { + if(type_ == MLUOP_DTYPE_FLOAT) + trans_mul(h_input+i*n_*ldda_,h_output+i*n_*ldda_,ldda_,upper_,trans_,n_,ldda_,type_,false); + else + trans_mul(h_input+i*n_*ldda_*2,h_output+i*n_*ldda_*2,ldda_,upper_,trans_,n_,ldda_,type_,false); + } + } + else + { + + fill_zero(h_output,upper_,batch_size_,n_,ldda_,type_,false); + } + if(type_ != MLUOP_DTYPE_FLOAT) + { + set_diag_imag_one(h_output,batch_size_,n_,ldda_); + } + if(batch_size_>16) + { + GTEST_CHECK(CNRT_RET_SUCCESS == + cnrtMemcpy(d_output, h_output, 16*type_size_*n_*ldda_, CNRT_MEM_TRANS_DIR_HOST2DEV)); + GTEST_CHECK(CNRT_RET_SUCCESS == + cnrtMemcpy(d_output+16*type_size_/4*n_*ldda_, h_output+16*type_size_/4*n_*ldda_, (batch_size_-16)*type_size_*n_*ldda_, CNRT_MEM_TRANS_DIR_HOST2DEV)); + } + else + { + GTEST_CHECK(CNRT_RET_SUCCESS == + cnrtMemcpy(d_output, h_output, batch_size_*type_size_*n_*ldda_, CNRT_MEM_TRANS_DIR_HOST2DEV)); + } + + } -// printf("mlu after cholesky result:\n"); -// print_matrix(batch_size_,h_output,ldda_,trans_,n_,ldda_,type_); return; } -void CholeskyExecutor::cpuCompute() { -// auto dev_a = (float*)(data_vector_[0].host_ptr); - auto dev_c = (float*)(data_vector_[0].host_ptr); -// std::memcpy(dev_c,dev_a,sizeof(float)*n_*ldda_); - float* cpu_a = cpu_fp32_input_[0]; - float* cpu_c = cpu_fp32_output_[0]; - - if(n_ > 2000) - { - std::memcpy(cpu_c,dev_c,type_size_*n_*ldda_*batch_size_); - return; - } - std::memcpy(cpu_c,cpu_a,type_size_*n_*ldda_); - if(trans_) +void cpu_compute(float* cpu_c, int n_, int ldda_, bool upper_, bool trans_, mluOpDataType_t type_) +{ + 
if(trans_) { - for(int i = 0; i < n_; i++) + for(long int i = 0; i < n_; i++) { float dia; if(type_ == MLUOP_DTYPE_FLOAT) @@ -411,13 +586,13 @@ void CholeskyExecutor::cpuCompute() { { if(type_ == MLUOP_DTYPE_FLOAT) { - for(int j = i+1;j1) +void CholeskyExecutor::cpuCompute() { + auto dev_c = (float*)(data_vector_[0].host_ptr); + float* cpu_a = cpu_fp32_input_[0]; + float* cpu_c = cpu_fp32_output_[0]; + + if(batch_size_>16) { - for(int i = 1; i < batch_size_;i++) - { - if(type_ == MLUOP_DTYPE_FLOAT) - std::memcpy(cpu_c+i*n_*ldda_,cpu_c,type_size_*n_*ldda_); - else - std::memcpy(cpu_c+2*i*n_*ldda_,cpu_c,type_size_*n_*ldda_); - } + std::memcpy(cpu_c,cpu_a,type_size_*n_*ldda_*16); + std::memcpy(cpu_c+type_size_/4*n_*ldda_*16,cpu_a+type_size_/4*n_*ldda_*16,type_size_*n_*ldda_*(batch_size_-16)); + } + else + { + std::memcpy(cpu_c,cpu_a,type_size_*n_*ldda_*batch_size_); } - // printf("cpu cholesky result:\n"); - // print_matrix(batch_size_,cpu_c,ldda_,trans_,n_,ldda_,type_); - auto h_output = (float*)(data_vector_[1].host_ptr); auto h_input = (float*)(data_vector_[0].host_ptr); - float* res = h_input; - for(int i = 0; i < n_; i++) + + printf("cpu before cholesky result:\n"); + + if(result_mul) { - for(int j = 0;j < ldda_; j++) + for(int i = 0; i < batch_size_;i++) + { + if(type_ == MLUOP_DTYPE_FLOAT) + trans_mul(h_input+i*n_*ldda_,h_output+i*n_*ldda_,ldda_,upper_,trans_,n_,ldda_,type_,false); + else + trans_mul(h_input+i*n_*ldda_*2,h_output+i*n_*ldda_*2,ldda_,upper_,trans_,n_,ldda_,type_,false); + } + if(batch_size_>16) + { + std::memcpy(h_output,h_input,type_size_*n_*ldda_*16); + std::memcpy(h_output+type_size_/4*n_*ldda_*16,h_input+type_size_/4*n_*ldda_*16,type_size_*n_*ldda_*(batch_size_-16)); + } + else + { + std::memcpy(h_output,h_input,type_size_*n_*ldda_*batch_size_); + } + } + else + { + for(long int i = 0; i < batch_size_;i++) { - res[j+i*ldda_] = h_output[j+i*ldda_] - dev_c[j+i*ldda_]; + cpu_compute(cpu_c+i*n_*ldda_*type_size_/4, n_, ldda_, upper_, trans_, type_); } - } - // printf("cpu result minus mlu result:\n"); - // print_matrix(1,res,ldda_,trans_,n_,ldda_,type_); + fill_zero(cpu_c,upper_,batch_size_,n_,ldda_,type_,false); + fill_zero(h_output,upper_,batch_size_,n_,ldda_,type_,false); + } + return; } diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/cholesky.h b/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/cholesky.h index 590b8eed1..ec23fa895 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/cholesky.h +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/cholesky.h @@ -29,12 +29,13 @@ class CholeskyExecutor : public Executor { size_t size_workspace_ = 0; int stride_ = 0; mluOpDataType_t type_ = MLUOP_DTYPE_FLOAT; + bool result_mul = false; int type_size_ = 4; bool trans_ = true; bool upper_ = false; int ldda_ = 0; int n_ = 0; - int batch_size_ = 1; + long int batch_size_ = 1; public: CholeskyExecutor() {} @@ -42,8 +43,7 @@ class CholeskyExecutor : public Executor { void compute(); void cpuCompute(); void prepareComputeParam(); -//void workspaceMalloc(); -// void workspaceFree(); + int64_t getTheoryOps() override; }; } // namespace mluoptest From 7b2d90835e0580245e12a1cdc1466cbbdc3aad48 Mon Sep 17 00:00:00 2001 From: dglr Date: Tue, 23 Jul 2024 11:06:08 +0800 Subject: [PATCH 08/27] fix nram workspace, update doc --- ...7\350\203\275\345\210\206\346\236\220.png" | Bin 0 -> 220151 bytes docs/design_docs/cholesky/cholesky.md | 211 ++++++++---- kernels/cholesky/cholesky.cpp | 70 ++-- kernels/cholesky/cholesky_union1.mlu | 256 ++++++++------- 
kernels/cholesky/complex_cholesky_union1.mlu | 147 ++++---- mlu_op.h | 4 + .../pb_gtest/src/zoo/cholesky/cholesky.cpp | 299 +++++++++++------- .../pb_gtest/src/zoo/cholesky/cholesky.h | 2 +- 8 files changed, 575 insertions(+), 414 deletions(-) create mode 100644 "docs/design_docs/cholesky/32_128\346\200\247\350\203\275\345\210\206\346\236\220.png" diff --git "a/docs/design_docs/cholesky/32_128\346\200\247\350\203\275\345\210\206\346\236\220.png" "b/docs/design_docs/cholesky/32_128\346\200\247\350\203\275\345\210\206\346\236\220.png" new file mode 100644 index 0000000000000000000000000000000000000000..7cdf0be8855f4f174e49f268186a1ed444bcd44d GIT binary patch literal 220151
[base85-encoded binary data omitted: "32_128性能分析.png" ("32_128 performance analysis" figure) added under docs/design_docs/cholesky/]
z;^Y}B-*-fB0WaVqATy324q;`qlXkGFf?ncOaD-*uqZ{?Po#ZfFL?FX2CdfD6Ii}Gh z0LD!6_r?~u@RF-D&`a(s!&yMLDJQ7zsKnR-z{EImbo0pD$FwfMAC6eV2O=36E9mC4 z&%dk-7vv9LX~Q6KB#|%2YZzA?iFr);NWTGEDnl==(Z)f8j~oF+7nu}DBaa>HI7abc zEotPVUA&LI#JQs%=r{Qw07u#lF!t^Tr}X&$lxZ`iu5>+yg42PWu*bO3ZEGhg>IgtY zr2z&3e4jmkNu57z0k)TVsDq>S@R>~;^cwy6N^}UD$~eXG!6vh~!Wlze7euDmX6}%- zlO+7WF5)Byfwh$1u(5ZQ1onq9#gR5ZRgU_P6?=vLvKVLch-1A(zB0KZcj_e?miY<& zuy^YLD#1_HZ&@s%&w6}V>^6=Ki!|B-KWGmNDBef^=^O1~3?UC}IyRO3JY$0c*<44)!-7K8Y3-sf$P1|H_mFQFu!6z1j93N&w1z-Wk3Y}ol zJ8{a4O0gr%57^aJYaZ1x3O`?y1&TdZNF4)Z!GKO6hhm+OelB0NR&|DZ#~GJwPBDML z3+8m3bAX=8XeYbp8MSkDvlbq*VTxQBqsWjv@Qe+BXJrnFkE|sP#}feLqjML`X0gdS z4n&uUV#jV%9$*pU6TJf{IdzXF5+Npiqv3sAklezv)Y5Vb0AB#QNp=0r+jHllH z2WdL}#z(@|JpbaWx(UL@7QkSt)S2J7g^b_0B#tE*eVuD0LpC0 zF#gcd*WP?r{VL`T79I2f+sk4E{pG&CFT@V-eEJ2mVe<}^|EHgQpb?|dMJye|XnpMLRewn6q?{3bxc97?bHMh9fb z*YdLuJp6bCT7<6}G;DO*YWD-v$KiXn3x7#oHX`^#F(|tiGkevY|Fz(@F>efeEzua@ zQ?&(L8xlTs>auI3(=NF_#}W@lTl|_b7ESmd7k_2zJo5O{Ss#AmpAX=_ZL|Oz*y7-k z6VnoFY#H<2m+AZqF3vmxxY(=gDj7HI;>&OUn2wCK0AC~P+**J?bTkCO$vk`VX=i46 z44x2k$|Of3(9_KJ=y|{$o99#66&dS;jT<0ZfA}hOSlif4;b)9p^9x%i3pR1|aVKW^ zJ^a|yc_GPu*@Q(uT}xcy1KxQ1!yF&sSzYYbLcPelFH~Y1^OZPt-Ssw#wyP|r;#19g z0K8^kEgJ+-%A#h!e51c&V;h*{tqnlHufm?J)yP~N%7?jc(?q`<{bYXezj=sXlklQU z;dgcj|2Fo_Yu1Kz!_K_b!&f+`+Xe&wZSEp#{?MAk_^|FUM#ZrZjiTdDJSF$n=I7;CU6Xw^nP#8bqz}EJ zGXA{b(g@b#(>jLjvq|xiHOf2hf10-5Zl@eu*<3Z(Kl{SVd5tK+!kc%-3Hzl_^Bw=h zN6?k|(3}jk)ka_Yp$%hT*FE;hG|j{md4!|wKp z@nJpBmhz^Oxh4FB*cJ}}Y2)qPm^<>uQ253VV~zx#ngfgfr8~fLcVI0Z#DV0ZU73{T zT5qqKAK(Xkb8}J=U&mhb$94L^ z2Y&b+b0EDMb46D+r}4uFm219Pfe-pMruokA{`fC>15WHI-X^OvV=WJ?)o+^%+NN_h z{qbkt3lMC6g z3p(*e4Lj#g@IXGmee$P^Z~yQw`Ak@SD1Ee1_~kc$%=W-vv2%8JVyyk=nQy_g+g|%- z`KgmnwCUj}KPcZY`+4;xD*+wW8z*32O7Bz^elmpQPXQ?ji19FYh1qfg8X*2+>A)aG+dgD2Tt2B?L3 zmiLHJIdj9aGPeY}>T7&`v9DBQxlsf#f-}22bm=--*_9JmnJgAqXps!skfq%)&n>d( z(9XU9v4DPaLYDp-0|47sWAK~E3Rnwp#iIew1_TY9KS#llvO|u5BQ*(@EVVgdm)i2m z{xcC=`QQ%zT*teEy9Jd12zE)P9F>5PQzUo6830!%P=7h#Iu%(O13+LCt_5NNYb{ir z31xxpi^+I6J8aa1Dm$J%lksV1o|gjwJI(^%{*Wc!v}b3|q;E15n9(Di_&(r}GDSAc zz*;gFkbBnn8WaoLGNHclb~WkJzj4RL1)UWT@tZtO*^fRh5<0VK@H2b6M&oGXhT*KF<> zk1|rp)hlTh1HiMKX&B{z;DD8fAAKt8TYa)vnzc6zg$^=!+2o$r@##qB30b}xa|T(I zO-18$tFT>svJ1Y}8H=|?!0Y^1x>S$*`yOye4&1z5KwtceI`kO`r>yJx+|ByLf-HC7zJ96^eTVLaT|Tk0B2V@?US$eY^tspc0JGh3#9|Y z2k|XF{vh*PrkVhDv(O8EWq-dHeG3J#Si!Yh=CpH{0ZgN_1?9_D7 z5ywWI-=@=J;i%5SOYbQ982TMDWvMi3>QZ?@#JAX(z(4;kRs`l)%pQ5niLq$>Q(o}U zmp4lQLF|qU#IUcw^3iRr5<(k@fnN)hOrst)YA;C z^_N^Mx*iL^Vg6ej+VJ4k8SIcs-Mky?=2O6 zy=h^5@EduJ7W%iY0IHLOz9ulR&M)q`j24j>O-Sz#U#Ndhg>5)XMT!CqVn1xaNb&#=4DjV2`&=j2$4hxkGkenNJ$| zYKJ_hKJqB>xG{=9^R1&e#OnC8uFYes30TXY(VsZlCXRjr_PkZV_H5cJOYiRSE7o`F zy61sM@^52AJ+gY)318Zueu7T4LAH`JHr>3+N>)ExlI?8VNanDHGxqSRpEexDUgTwS z+PJm3rC*}fdDt4(q{wH4_I+I;)^Gw+y6a8-lYhB0lMG+U-p!P8=KGd^f!QCRp`S~yB%|}*z z{q3J~-E`;p2j86G-}pOO!sfgKBCePxV)1z+wi1t9%ZufT{q#iUr2P&!G{?i%JJ}|h zl$%{5BWu97-}@xT4>Fytmp_X3WViBN;#V;pAGUCm_jc+dk3F4}4~%_dgS^bw<`wqq z%_DNgFB>8UKF=7+eF~Y`U_R;OGjf|idYfu|lrhN0;9TN^Vkgb`?=5u)*5Z?$!6w+J z{y55sPh*#kKUEH9NPlZZ?4=(*{ttxahw%vbe)8EDbD!Pki`k3~Ib(ud*7=zeG)KW& zf&r9q5Cl_p7j?|NdPRMqY)~D8y%|_5RoN~dM>&54avX&T-9Y9%IIj}W*s-CEGPoPt z>I$=u<~7BwLlvI#8KfqmBE)R5IQ%(_^m&7@=b!Sd2UX~QvU&%&A?ji zbF!Q{KJ)p`VXI!rG)@Gif>RDrUpzaf#t&VVS8eNHEuh)hw`0d~o2 zA10V`tv|+(=kc~l1WLe)vJ-DySj0N-&l!%|LY`>`*5cX3`CbIz7LjD{bvHq*N!4pP z$^Z~7EP?!XGcCOBik3-(&$THb$aM6FanT?BMCzF|b$VW~*x44;(Y5R3t1tcnZ2_+` zH~F$*BS&S>`H81r$oGM@{0KRC-guM+#P$Kq>Q$#8plz@=^yv)*fgYLDuh*T-KNfwm zAAlP-#Dc;cfDg86!KYnkCDv_1JKB_iXT!p#iJV`F+RyeIl*BgSnD2r3HZrE z;G&;48(x0xt^A7FFXovq2S6r%*%8`T=HNp^e{s!N0}y+j4{`hp8v(SCpP)h?>N;v9 
zeF64(-B_JoP3%vbsFOO?Elao&tQC~%hyL&*GPJ#g0c0!Zh&wAvAP1}+Gp<^6k#Ds`BC*W(?{;hqypG>-~W6h zLX@qabxW38Y`uN-G4vht96#bMr^h4i85w0Q4D`)-lwrqCwJl>nmfqq^FPqO*^T83e zW4=S`}xF%e;z(P78aUf7tjm-{@)S^m8suug5iWcd65uY#Xc%S#7+@7I{&*!+uAl zg+@%w;Jz3Eh-55~BVa^X^{Yz^&bA!kQ1_|++JFM~Tz=gh>56M^jD^s>IX=%><mD8LxzGOD9%VMt$*-@wF+KX^3u*k)>x6#BrBhEkGtcuDZrW#yHj-tzoBMOp zL+IBl>|z#c1oJ`{Xk7W!p_5i@nqi9w zbUk6xlq^Tr^!JGv)7BGYVEzy<@Tpf`eQgHe06oR-B;AOKtnJvCe)gw6*pPW&tWf+L zUdr2Bz$0&Nd9#!6(GT)*&8x(YNl*??^dhb(V6D3Jkv&x)Lf72F7m0VZ={a^s-|QQp z@W3NaPL61D9+Kj0_ z3|4Ja8FT#Mdf~797NeO@#F%0<>(259h$=?REvANT(d*;|{Y}leVDW84*<=j`BS>_JwCpISr;YWYoOjsY=l%pKx)aS6YKS30qgYiwKR;JuPn)s~GKz;pG6zzrRVEFWY{MVt6g zt{GVC9({R})aD~u=;t}-Uy?JxC2Lq;7X3et@;9+Y*l@+xbldFNj+f1sJ{BvF#Hegf#t z=OwWcuk;4kX1@ijHJ$78snj{UhQ@fi%dE9%L$T;A zgKx7fx5SnKQYz*_u_jKCC-f^YW=SX*B8 z*H?=^zGds}9Ffxn*2)g#7ya)VVn>Z&?SipbwFvn1^WVi{{(wB;nhyjD$_hmIeHm;f zV9%(>qSBaKdbuu}&=2=yR2ZXzu~Sb!ry9r6rpdm@7609VwRqK*b7J-1S)}(p^jHp5 zhm9DM^(z1>3x4hM7lH)W*pjo71QvAc6&|vek;*SPjsl-W{@syrYtbjT+9u@NolOAD zdTSGPA~&|)EVI^Po6h%%zTwC54$hAf`~z#1BP;W=4^&#rl=0F?uB)@z{R~_-0c;UF z`^7Kki@A0GJVsYob*=RC_`Tr5i-sJlK9gq#!X9`e-Fo}o`J*=@CQQj0w(eOd@P~Hn z1#ZYfP4?(AE3cl`Tz}*AO56vUoN?CqnZ!R{sUsUa4~f2B8Ge<0^1H=9G}6ehtHGnA z-P#yyN>MHYoblIhvdr^v*onoE1)U>C_{Tx!u}RaCb3|ne7(ZpPVBWUy^A&$!t@@1L zdS)$Nl^3M*!>d@xj;TJ}#&4BZd&!zU0VprVe6d$7vdZ`)UmLykKH%Bh*jvCB-+Z49 zV~aO)K0Q@7GBTZV>RGXX_#!P8x*i<71CG2xZ+v-;8d!@jV;gWfXs~lxBhCu{DSLH; zjjBymewdH8DYRzv`Q`8fXJ@ds>w5vk|26ilCvvu@cAk0R)$HfUL7&`HAEEg^?2l==#+<(v4aK`|#UGJ0}4{o-GR|9W>8u-2xP{^GUxx<;_pJ>wKm|LGUs zq)oTlA#J$LZt3-yKX1PEjttOwgTa_K53!5Fw+$B{m6$<2uHjKz05(4L;!7@1r<`$K zy5_bAGLRsKC}q_{kIleZbN=8{^0qdsBe*BG5<|iA)R#QMd|gBMQNYE zE9dEv6aBJNzVz;UAL#h|V94a!hz+U_HWk|&Hfg0<09*agU-2ZJ@CAkUMpD+js?J&O zw<3KvzckCN1uSIHE8<&kZ5Z=F8=Ef58K(d>pbR;d`Wo9P*YR28+z#oR`N}J=7U=Zq zaL&4_O_o0Ph|O0w0qRCC**9i*WY|U9dyW%S@q>~M>SJG5o~`N>ba;&HmsAAaxlyTnzGg>0A1deEl% ztbn!7R<+T}{~vtl5gE9%{s8bj96pM#W7pOf4?J9LcovY*{2;S7Z^*~Ip)bw&)nB&X z8-3xGJ&48e@7YEhAX{K9yDG;sc+47Hx{&aPyt{ zH!AHlg0(OV>kmNL&b#cH`)f2T~H5q|Wsh!yV6_1=BoLpetA20-zt;%%Et>NM9ni~XZd zzs!CfSSuEC42aC}Djt6*`&T}$9sL!+(3;tNZj3Z5hxEIhI_sogp0P#{ACQ5B0y>u| z%x7B@SqocZ@IQQuwUGJD>;KwSm%lcH8o^q?JJ6ed57D>5hvpJAw%voK*(p5Fvpz}m74-w4(ctk=W@`v#xi5O{S^x;C_hjE3#=_d z{BGo@8I?(aEF5qS$ly3FY<~K)UuOQ50oFSI#52vnS^>N4C;+2#c;;PTq1;EtRocRn z$wzR$#FESAdV9mCb{MpbFHY5~f%&zykOw{sSgU>V@mw9OB|DBmF!5B#Q}8uk@TshO zby+Is)jflOgRS$@6DiBtc_r8mkOhjPE`Iv+c`}dsz=4(tQ`?PTEe9-nkt~6w`pn^| zx5%sf^;*bM%Pg1W+8bGNJPU<>e)5k$&)}R_FYq(}{0rrF3Rr7*j%|=nBUo!Z$-pWA zqa88*(TBP$1&ZvFZTCi&o@GB?trlcG_54fu)g-dTBfbh)TXxZB1#5@pS2qDcuK7)# z%GP<&mNqPgdOLo|#;d!3|C66(c>xKQkN(aZ{sf-|tUW&#g3Z9%oY5A%nru8P+pF%2 z>aygEJYzB3+ud(E6MO@LWMPm&{XThULtsOuK5l~#JRmk{Qat(8vvS{DBY%ro0hMvb z7M#QOk3aoIws-zcun0sTC-T_;z{7Ii0exQ6DSo2J++X(0?;-tfM<2?tJ%E)y=B&LA z-^(ubj|>V}3pDKsthGDs8N77|*4p)!nW7w-Xx}~a3K_C}fSW93Sz7GSm<8&qkFEXe z7xU&tJ)qmJJ|5IVUKRNNbp)jwciIFQ8iRbYvHP=M%$MufCi+GG-g+=ckO{uXuNhd2 z=ty?7bF006=QSh1DcfMvC!Typ4ko;YHP3tt_D65X<&=whNzU6imSR28vJ~RVsWgiKmeN@hI zI@R1UDC?^}Mx8my-_cPs9=iidR4?v*r1fL2*BY_=3glM z#Hd&>gxv||%^TbAv}?9IJlIS#=Xn+P)UZqZv+?L9f^8 z=%cPvyvih5dihn-^kr78Vym9?L0=t!t!K&HNBzV)`yFs_wDmz+Pxf}SF>J!Lbmj$@ zrVk@7lkMa6Kl2OQcE&rK1~x1ddiu%FevxAp{zSGeUg^{V$+dmL$NX}UA!)(k;a^WV zBWF-}9g}U5fknK<&t4Lyc4i(=w}15BR<<^MtS|)IHqSu z$yg|V_SLs(mjjPTzZg6sZ#Fq5iCl~?qWF#|-*Wy#fw0N{Zfk!dsFXP@m2OSyx`aEal;HSu>JFwO` zFebe&E4$_3SOjXzM|6h7yfJvxq;&jQm*z9j#r*sc-(o{W++sY+#NrD9@air;mCe+3 zQ-QM)%S~P>4WHNr*y^J(Y*W-2*nF#PJLg&UQ?G+8XJ>7L=*74J#9CnY86+4Q1__HGGcF42pl6eG3S-ay%fO;9SvOxEJ2Jvx^co`_74xfw5P=f&U=u#xA{C}Zw&WXg&WkLm}YOTY9PANq?YK18wz 
zuvR9wvA_mxBKii0V>*-}Hwi>+j88ScI&$E;n{Epm-YwT9QN_1G>K{1LFZy@n3!mDX zexfq|WYz-6*>ujH3x2f07XjhCLXJYW3_DN zwIV!}qbYXZbH7fT;797^^rOT@{_o8=#*eq?9F4Tk{s-kbpU=XtzKVg_jP-!^KHc7V z&jZ=-+Q7CsqF-b%Ic(j!ln#v#F#{h;Pd=8-)?D`;CPy_{zZ%oxd7HXtho4rLGsxjZ z%{DFAy>YIe_3Q1B_8m3zPVghMmfd*U%iO|__$vBR4w-Mnx;9+M6j*ERW=zvT0c$1A zc&h-Ad++^^W*JHLSZ&R9vrlO}1_AHpTl0M*k9{Tc%Nzx34P+miv4DQ=g;(>0?!aw9 zdvV5jA`fRLzyPKhY^YvOGxDrK$O%iGSGRWT-T*^@Ivl+-NIv+uIywM&Mj;mA&Ym%7 zv_;rn3o}^=SUC)@c>|`36Hx|OY69yi8>W%)sQhUWpsLp{GRK|2_8CT8&XpAC+g(@jCyGA{Kwk-GFnDB~j2`GL{k?8Ttq7oHasn6{VA=o@ z(U;e`0ZVqc@g!reI7it#0u=2aeA)psXk~)wBYhYE+9-N6ad9AC8Ra}0`DsR;GZ=im zPgaKm$t}8N4Uz4hd+(PgWI_Iu&%Bf!H9pHLHi8WNk@vDItdtib2Au=L0ewDVHD!8r zHY@`4n4B%-rp;*mwZ}sACCks(U}XZB#GE^&9;vUXWbozj4Ke z_1$>)MSfqqw}5cXL`~+(9d^{Q*&otF@e^JRWB2usyR;&G6pULe2~^pQMUye)OSx>; z7gc?T!G{fiZNOT<7eL+f0&jW{c=22Mc7z@6@GXuF0M-_papvpCaosMapveN2Ut?eV zHGgI7TacQR9TmWCOEAKgWm4f&mah5X*b`39fRS?Avv9Xr=T#au!N19#c1+TAY1f!s$+s>q_UUWfUL8~h0OUEc;d9vYZ|;3K`wD@Cd5nFl z7a#c27aw%eA2wrgqy6&M1N&|yZIh$FvUPwJ<{1Gpo3N3|?utz~lg+}#SSz+t*L`=S z{#|$9C)X+C+j)BABr@EL3p%%OF)n~?ciw$p4w`|r+R_g@@$@RtH+RslXG>r3 z>3mrL+eVvg8P7Bat-bEwcqBrRh4E*MuD`*?X>j<_eAaD8kjNf~htIMAwwUI#EkwwB zaPYiL=jFAF%+Q4AH`e`!; zkBI)(jfIkrFdZ2S@}W7tY8A-F{SLs^81m+cBWx^~WUdw495Q@#T5W^P(o1pg>EKH& zQGGT3qqNO-JLNf3Oy(8A)6Y6TFM8OhzwFQA$VW*4d(Iy8I{8@_UY?$O;gwu&^YByn z*AF&2e&Xb8@5XJ;02|%4=%9E0fNiTM`{K|6on*i^p6O{3qzyXb|HK03EB#?d+R|qq zb$#c(PtpZfU6*dT^S<A^xsJP7EiGHZwCbKTgWqe|GMd;Bo_wn-$D#`(U$ z+6BkY$e{9wkgElZk5GtXffRUa#&aWSgG}k^l{enaS=i@ldrYE`_Y$au~j!n!@_m}zkG@@ZZnAg1td9E0I#xdyjE{+ zr)P1Ce&?gOqE5c*-H*OXr(AG#%qPdC>%vFzsWuD63w(z0XpuvHGUUbO>az}D`xZs! zUvYSAX_Z z_>c!5&;MIgSs05K*q8VU2&`|8KOi$c(c3WOqn<85TE!C!jan)#JkrG9r;(M{ z!|Bplf#zlLfw|2qsXkaGhIXDJ{>422KR(pl=p28sj}L1B;H~8(ZP2slE4#ZY&lSVM zrtqmPUt{;yhmG;xzU3uO@tkayj2+H^~2TJTw&W1Bk*9PDR;df-}6|k1y)DQCt zzP;Ju8S*&m+zYZ#U_W)RRzKE{IgB6if$rYeB{D`O7-`o!ncuA6*9P8T1!6}>1JIk; za#Gk7o3-(r*NEXO0kP(6A5gN+vSI5(qp$<{;x@iD|dn-JzXu`YdF9&1#`a*AC~JM)~Zk8l6@FWL6Rkov0JxFCH( z^igz#PydsfaYv8keq(=nBK8F@!@tc+b%|RY-=MDijg^nT%xg&FgYNW+9Qe}wrb4XC z3Rvsd%;phvqm~OIb)7xeNdU zjl;5lb6;l!fim-aZ7v77ZBC%Y=&uTaISa`|FPn#x(1zC#IY4IvNqZNZa6%?_yGkZ< zyJv7jK~(B8P)(F3Q4=i3C*ude$bl5WGgQHuz*yUjJT;@@Oue@9(kpM} z`u#7%O19N8$Dfqjz^@?6Gx+o+KCh%Xce1p_AOtWLO=M~D@9X_#Va*wEJSH9hoNMhG z43HBjWWhj|0&oBbhc5Ur$=cn;ldpajP!LEXL#nieM}f0|QqZE$Uir4eX&}%Wd6x_& z2X?nR2@9@Pj0q6X?i@KXBukmSWJ!MbViOi^0)D?uI2@C#E;6d>8l5u~y>7*Evl-9K zSfct$A@aNcS&-QqXK6w-A$it>#3tD$ozXvl8hBztk?}?@+5<4MRqc|c_Bb!#0v)hR zyZM|*k(vIKb6Er?MK7&L|1J3RU*9dvI4qMF*$RkFFaRVAs>PRFHuEV%0*Gao4S)1$ z0i!>F4Z0S1E0aIk6z%AXzwAL9CLzC@@zqEGHJbz`RK$BPeu0~Xg*NCzprd_R<3M%i6XgZQm~lDSxA4{vlbZW{ zgm&2(fRu012fww0R~ucvK29EFb#eGR<3r}MuMle&sT%;MKK$8;;LkksXxNL58UTR| zcVMeCO10yhRGU<;u?L{)$+5^-B>1-T>Tr2>$DY0AAk$gkP4;|({s=t$2F$T-d>2{y zxRr6@NQJMy{bN2u%`@x`Sj+zSiQ_^RWsD1WZFHyy@$UkLY7Tibbp2XqpdirkdX|kv z;8G!*D&{F=w51*ySb&6&IWnTL z>R>lE0NJaBcJXtS97m@OBbK-$eBsMs^K5VD-S*5npdWK1p3OJrEIwDC%nkZO?ly&F z_TwMuby6toICvJ3o=H^|5@H0K_-2W$!l-Cm+ely_^UE(I{;e%bPs?n zo}D>uOcXz1F=#C6i*aD%>*bK~6VJSq4mtMZ{2U;`94H>NnJ$Y@`;rOR?Fup1@n<$y zFN_5ZKOifY-_pjZr=OJ;9Wgcyo3U&f8uq+;jJy20U@RoXwYeV+yf&k3JJGgt&z<9J zd_Mom+u5gk(?Hhheg__!byUtzWM}&8tX(oCLp}_sp$}|Kj3uLT+VX3rL6eusavvVH zO}D_JGQMQz@lya5F;(%Y=5s!SA5fop*5hF8>nRLVnXXL(Gbed}b`1H;vv03B2 z=mVHb7XVJ*Ltsm6X77l}#2|0H{b4%t_|x-70-B zd!_YuNK37}LGTdsWruIHX$Pz_my?5b%y;`l{e)a7fJ4xWs7jY6045&a(NPO)TXP}t%-cC`kO?==^cN=}(5!*J-ZoKu5Y%|s7 zbQR-~gZsc-c5iO5sk7OZ+ve-|6Bn^FRD3Ehh24lteI!o&1E5jH!e6$U4H?He6}^yI zv7<5;{9g!9Hg(M9&^Bbufo$0Lz{fgaC%`Jao2#v@@WEe-4Q*P9 zO^rwV7QI*#nA-r?#b5IszBJF)#m!~zunxAi0m76IL^kr-6Y2LB;lH#C2PnF#lYX)v 
z@u{REzRJ9BUa*$3o~}0f-%L+G``esc!@uFtdcyi$f{Qr75i|!KdQ|=`KCiwlQymjw z&1~JS4Y3!%HjmMe4I5u-+2ymotlRx=)M-zI7a6eC#&8vnBQ{0W5~GY8NlSdNSK)!| z^v5pv4QnFvnzoG-W6U+jbGfD+vhrIDTl~6@Hj@YJrkXp%y1s6vf@uT(ZvzJh`Ea&5c}YVp7hb? zi?t=$u_w3#{qx6sb(H}3VvZ&8qYhv&pjn*4XT$jbjIIdSjL>j zkCgt`%d{fp*_WgzbA&Mle71IE@9JlJ#b(9lQ)eum{g^kc#CFDxap0QGLt~gPl0_^V zT!N2GbmJUgYYsIh(Wmyw4AuaPDj$9AjlNpZ90zL=hU?CU!ivEl!tvUyOibV?;g);7 zrN+AJ*BgH~UNgv1xn93d@Eog!rNz5oPT6w5H&ohgY`?o}jlapVGUi$9;p{AoeW|nT zqSc@84K>zXzux$}@f!Z{S+3X1`0Ku%xJSd;0m;kc+`=4z+r8Fm{0O8uOOvSG=5nvM zRN8KAzq@Po-|hG!*)?A6Z2|%+ z&GM5fprU}%1QetN6a|$gBE3tO1QaCH5cr{>(xr)XkzNuA1Vc*{rT3OV=q>bK69Vse z@B82T-prZFOeQ&#Ir;YY?Y-98YZJdOoz(q^Y%Nf+X&Z#qKEq|we~&EBMKa!p&Me!N z7L*@*mX2%EPuA8H#0^iPDtjx1s!r%eO{Q5EvYV-aqxWJZ@a>2=)x8n5?LsIp)0f9qq~2Gr_Wk~I{i*WG&;fhlr}gj# zNBMs~E1rFQ%WS2sy5dYkiGIU@6*DRz)yO*EX*YQ}3eS&9Y)BG*yySS~ZPOGABC^zf z>ZlP`pJ@5m8T0VVqJjX0jl5J1|JwRAQtkzF^FrCOH*K?e!F}NS;$iU-mz+H3Q-NH| zd_CdGwpHRNtQKv9XsMQ&T|;v*|rXgbCMnzxgi&>9%O``gvkVTU}O z{h3y>Ta#DTbfBj;xQy(ky&gVT)XqEa3-xYN77ShJE`G%YlMitEbnE-Zt!@PCc6ga1(lK~WRf&>tYz8Lu@A6v zTViknWZou{TJ5;fAEFEA9@w$p$)d8ML8{nCU1! zczN*Mv9G64+|Ie7QlZ7qFmV1S^|@>Qe#m9do$9rSHZN$C$G!vP zYd2l$yN(aFV1y?zUaXG&3eFb((s9>e%;yMJM>JI9Yv(GmaSI%uVAx3TEd9WlIC-U= zSj|DQ!+4e!wA9C$Acx}v=6urEJxdcj_AD%41QtOpz0%)ZOjUeuLytW#KCE7+R7)?Y zCP%h~`A?NwH|$_6rSY{#CX!8Tyo588jB|doz*M|CrKQ5^pUz;5kayPF>#H^O*2u!@ z@+j)b*qDcL-Z8&xj#SKZ;K2h8*pW5yiJRKSM=pgnxEjg9<=y`Zg1kAL{m{(krS40= z3_mvSf8g0K^-=3m$MQTh+j9vuoX=H|7+lAdY;%ljt9JUnsP5$BZOug9^-^&UJ$>MI zQ614%oTHs&q?*gVfBoWz6R5wbctXJHXX=LOi3Q$f`dblQHY}BP{5&s6SM8JDm|?PM zwRPSl-u`IA3co}JrO@tc#PL|?PnP_Ce@y7s=^g{74VK#qfx7fMqoy*coOpXaj59cr=KQQpMDApCLUL}ypd zO0@pjsTV8iEnDBznR^Zb^^>TqHQ?T(IZ*1Ttc)pRPFKBPh=g_-p*zESea+x7L zsJ?Rza;o3d#4j(QiU!?>K0Rf%8`94)xLj&j&;FuKVWxK5T5EVIh$!cff{|$KHSVg| zzvxm}9k&!X@Oxsklh1iGO76HO&e{WGDUPqLwZRX0qiFIqW!+S0z0M)NrKr|3 z7m6eL)+?JOD=l>^zRp~al{p=|Yi>9E(MS^{Qv3LJHT=}_zadWEInR;VTjb~pIUG#N zclldvq)Lj)4Q>2Dzo{0kOXTtyKzps}p`gouSZsK6=EOgBE@i8B(?5Gl=jO6^p;m^k zNax1Y68ZTlCtfTD%MUX>1%e7EEe2&o$v7W-fHao9jdn5dXl_&i{5NY9j0+-rX2`2{ z(3`qZBvgCTLu@u+CjTu6Yc%Zs#OBw~<(%HgW}P)Kv#1`cAeh1GsPngl=p*E`e|_2W zE}K)(%ZfLl?pqzKLu)SQvv$BN6t)YXMhVa-e|zw42HR9_M_p>~{Y%oFG>Zd?eN7!J zS(lH2sPe#@7GwOG6Yt`T*6-SQSd!i*m|cy()+}WJk2`PQU+!!fJMOnWQwazMW-%e! 
zi2CeLC#L6`C$1VM>o$r?j^hX@-=%5HPX2MNpW3wf0{mb$x9@gvsi`X2Iay_CjvqxV zEY?v`bNA#^ar|>!8|~gH%o*^IoGD<4@X3OCC?Nm-L4TJqm9BJm$S7k|t>-scvBEllmZY2g z9El0=zO|a8G*P-Xcqn~Ry;CUQXeE6@TZmpM8o&H!@bTaHDW_+w4HjCP(9Qkwto8N3 zt0>E9)+x?)(1uq(>&kmtH7MdtMVI1=AKuG zN%I*adtWQ6Ca(l#KQDXcv(LBeT1W9HZ}%Ve$2A2&{1<)2gqZf^J6GHje2D*9!4_kv z_~rL)a=C$Y|N4d|*}0SPNV@^wdbc$f7x9Tq9J0a+wqb?nWIu0$FmTk=Onb4?rc{vD zEOBvYq^6wypwstvIMk3A6HQO?TO-|UK2PZIM?`riEDvQsuq^+K;OSb>5sU?~saVB`IWBal**jV`niF&G(pq?}&XaC>7FT%REFaszx1j zt`Sl&FTfKOO5g32cj~0$chatff=H`@7Z+#inV1@+ueIw|la%lkluM=K4YIq4*iDUQ z>$%u+C50vaxLjga{nNErniHN& zdDSaNHvz}^Y{rOkS%*>5Yshii`7Bs574a6z#bDm{+sFe$dp{jyF?UUh!5Z7f_K0A# zx=i^HXTL4(XKj?=C#JU~A$k!D@c8*%LkM0OKg}*Q6XdRB|NY>lvmNe1YK8+co?AP2 zJfFG|hQhDxjh{@W4N{C6^H{eE^fQ6RqOi;48@7@?*fxfYsxH~+bYAC3|GQnq8RO_J zyDVy;o8I=)>~8gVx`>|5wWu_9TRV|&;jz2uMe%p;NgpOUxx4DKWel1>a1i93ovVAi z-99ntMiyuHo9GFPOqKhbVbi_o%Ru5tt?+EF`qcc5l~oCq;eVrFy!E1p5#?0Zi}{22 zt!6g|0-Mlz$jhV1I}S@rKiwizmM ztEfB~79UWXz|kiL<>)LeH?t}15U@6p=T3HunmZn8<+&~0*0aOs)jI2DyceFOQvK)N z>n@5osy-c2WN%d0#qrNgh>=?0v##Z>X!l#q{&$A-@oneRk-4dDC1Sg+;pA^P{TiTk?lUO5u1ibB0oOQSr5YZcpVl@bw~@p)ZU z6~DO=c0=Ay^1ATyTkJxbOvoY56jOVDl3JI-C5hgYP7LBNHF;O49 zAVqfs!zOm8J-BVj-lcjY))LT{g}KR@bK|2Ak63YAg}&~_^Udni#YBss-R4n!M2_~1 zt-Gb+3`^G4{+_&K&N4W~(2h}2@T7OS`0H{&!0aDTm6pTOin|u#=i{$`X2!X)89h97 z(p2aygP~Z&KstUaq)ez-cOlu&+|sC%=tZot2)z1{VDTwB)XE+ zWx^3Wa{*dPy$Fz&1mLdlt??*>7`|ab(%JWHbbJ_5FGVvs~zZ7x6H=%Q~HhVyjXiC8e&)VlC83XITDN& zn`R1LO-KP-96y)nxp%z$qOGGuG0LKzqX#nOFyyLp^cSXzB>FS++9EYSXFv7 zN0PoDcgNW(DwRa*Fb?ZUeQ`K^wE|O>LA=T}@KM~3fcE>;<( zF75K6O}@`NVDAOIk&-e05!&W)(O39jR3K4i)FJ?u3_W$J;cOM=<6z{5BE;b-w|H5F z|5sV>`Iswrh3Q_#F~onTHI{{3_QW-2^0v|frNPD;8GeH5ttKZYuEx;qHw143h zZhsyfes~XzZ+W!jX5c)K3?>IbF4!FmdxK1!Nh?Nj__!9-)49T`OGf*1v_#yQT};6glzBNh`OP{b{P_$72~krGH$$JD~VS)NBp64_=%&33zI9LJhMHc{{6M zznR+~5`HkRknGecF>bEzhGt; z{9RXJLW$mQb1a;>TQCOS4HHfINLxcbgT#(jDPC>QcpPw=u#|-akyl!t3P9;t<#pYY zEsqmm9C)LYnS5tk-wcE%lqTqV@G%+_957Vu4U2b_&XDHWEa;0r zxnKN4cQ>*b0Qizs#M_b^$Ut@ioZ3fcByr)H(4D;4)dAwC<_}CFdfFQK$!z3??R&r- znyJqQrXIh`?l+FCVWK1E48m3i3Oo{nR4!yYkjhMJhfK>5MbQsIq$J_<)X?USuB6|I zuXwSYZR|au|G9Rf=fzCj?!$UvSFMpP9;k#tFc=i+#x)b={HUgy@E~FyWzHai9rdS~ zsh{zF<#x=(-MWu3BWblSfg=6@u1XQ2Lf040M1B!?p1HT_ePGa>yDeS!HgDUs^#Qeh zf@+w|3KBVqUN)gfCpLWb+f`CLR<-bOgFjIov3xys|8Q_h9MvF&grS3N)@U|Dm4&N) z4FS~04aF#&9(8tm!tqdQ^o zSiD>z1zumN9tC<*Ikm)UZ4X}WPQm(JD5>km3A=6XmKHvtQ9%B60zt+)HZXDNXc6W0M|0wuEr$aB4DE;yA{DCy|f`_9=`4Gkz>Z- z7X$r3w)RbWHMrQI8OWsgz2$L`i{2otu)yXy`%>4G2e?k-Nk?fpsKqYtZJxc`C3p}y znj#SXEZvS=q9a^2X&^%Rz3A%F%!@^Ip`$3pa!?Z;VG(d5O_1MM>zafW7`pw) z-9O8%J2=&E?y)S!Hb%Ei2G>YT3b+LIO9Cn%iOKuIAeq{CEC@5N^j$BZAh){LB;EjW*oqsWn+$!SO?b6y~O}X7_ z=@(U)mq~LY%kHuFP1HpM$D}K)?dFWOi+dqGS<;GbR}OyHn)L3EolNeuQYP}eq<>sr zemB~l-5v$oOBw)!u8Hp|1qLM!4n?{RhF?1esnp7YY9oX1`hM@fG4s{gQH$XLqDo)& zZB8I^<4ID4G!6>6l9^^~<25#`;jl}CkkCs@ZqCXZ-rjRNQT6`mnI%4>e${M`t-Qu` zUDJE2IN7JXb=U~UlNWgzN4(Hfs*GSO^TDcMz->*0+2Lk>hIXn4P8af!d{>_R^lq-E zEMOyG<~Mj(7S$!`-Sqjm)xTt?%G%t!s%i_9Zhz-^X}fP202}Og(ZVshsAit3@|VzC ztpD7RcPO7O4vuVEYGApXh8Y|qLXI77ByV;`51-mdwVL@@nq&_lMQ)}b7#~of+iIFuB(0_7^iJhz=H^dnD35o^ zW%<==-HW`F(S5e~x6$iMjjSBMY*gBF-C0MNUT1L6bT@Tv8L;#`N3`*>j`T`XsXMV@jh5A(HmS*9mNd8m#fb^)0mnwD9(u(-rlcDzhIdSND{2yBiMV zVVE(!blcbID$DJq()C;OAXA!x`l{f^+n#~7LKodQ(M+#j)n58=TBm)tih&TV=}cXGMhs2*cyBjQN8K#e zqQC<;2ww~d7wW!}uk*vrnevSz$k)RT_6qb#q`lh1i_oFNF`2h*+65?0)J|w{7Py zcbaejY}`(MzN&u9t8nPqgq@@ozQ=n@%ZU=3lP+guPvuE4fND1S(sF&~6}i7BPeTxv z{(n%iw^;(~X3A)uA{iZyN`8BX$^GI66Hy8i86g=BZSi%12@6fB>n*VUnVcB4_@6wU zP2y$~L6rK%&*EizoT^lZq(i_D@iRuLb&C+nq3;ov;mFRAvy-R-wxF#bpOxj9k?XHk 
zjC#i)`r1(qgIC+D`wqFSl4voYNG(CH2mV{b$$hQ)>4{`w=CVhFq-4WDh3j;}B5xLA zwbMr#!;*=Znj7VAzKK3zNb8j9EmUfGuQzxCn20+d{1Ad!{wtgL4h^KsS!|hYvC3Jz zE#mS0^)^-khFy7k<}`5>K0Qc)i1W%I&>F&qTfZ;s;HM@HgvwPjO8x=c~jry~~PoRL{!73yAl8l z$sL#x%!2`&r4$95##9Je)%%|C?x$i@=hh$|(_Je@WVteKkD~W!_vTlR)5V-re) zL^2LuRS#SJf0D{X;80EF-n^LqJ=LRDJd{bcTcXWTh=Cf+64B(r`gSsmi1rzWq1ayd zUOHT`*YAjUt;HzD_*!Xq$7>5?oh=n{=qi`g0cvyes88;G1}yCQeqjIm+%Dj|F~$2D zH#sfd$M1oN4S7znwhkIZ$q_s!Lg|d3z*ikeu-e8h#Z->hY&{86VOl5=W#T_qn0pv} z1>P>i@qVdSqpte#zE*d^hj0)?omH;cdfE6r^o`3sds%Ji%SE}{oyx6A2f%q`37!GF|F+VK=GpDM%iN!7DHF;o zm2S-2_~7BDDlcSSIG^tE75o>jg1_{Fepx zzc*(FK?L3mY?@&~ibAtBAu+A&d^<9H(gq2WV7b%-@7M885v(T;iSe0$W%BI!2fDRmM z$&yRpbNr23@AY@nX3jdSF`6EJRef**;b{^C9~{)pRup@QY|Bi!?W--Ip+ema8?VM~ z`;Dh^NXh^v(3DBbXx0MyhlnbTSci4a;3}l>KS45#?xi55a!pZNfH~p5*>An~sx}yQ zz{raQ-bVDg^CNf_{WMu(RB1OHftFzZKXJ*6=aW-i{^?Uc*ZfGr~PqwB)EkeKNIDfrl*O}IpdRJruT+VK_~tQ4>JG4p#Q zfhDe-dd>AiVI62cMfoWEoX-;0;P`MOVmYokvlYe9=q7pO_-|`$0KIcxcEPrn!*?Tu z4`s+`Iv+zB1h*U>U_7>X6E4M8>UMQ8t%2S;SX%qf-F2-N6TMgxd>V0;z|b?9ye$=r z;i{T?%v{Il2EH1cCD&DRrBiw0U*3ec;eB`uLl0{j=o&TaJm`%2ix3`6hQcs+AG4}2 z-JJQxYp4i%&xGg*S)6gkf2p3aZ0GDtakh!Q|K*U|@?I%!ug0t)T;Pq)e4OB@+tB#R zXKK)K`(fW3(X`L^g`3%agqaW`OKh(^gRpwV-RI3>_y)!z0;9+E)wb){+FHe4bYbY= z0WLmX?_oAu4Nwig2QKGgU7PLdO=pK#*ND<;ZTuJK252{A+9BuHHk8jJSVt%dfIx{q z(GAbQHRFdHn4MIbVB3|kcOMO9yDP*aI$xR0m+Eq)a<6fxsyB^*Spa-&4_rp2 z>N~c0dsvRRGFp962kfuO(016C*V6EM@Gbknus?yD62oa;$A5;q(&Ao4wvduiT;H6o zODtVBW!@_|%9zpuMa#O zVUvt%BPw${7|;5On9XT(o1f;65$u1XN&5vD%PaB9AK&wvP0)DK5<|0^z4s0dve06$ zWF#@P28T-eMFMSJI7_Crx4_B=!Ofi-jjxmMgc5Mj2qmV#Qr}2;`rL?h80?w0V6%zy zv=adRsJN!tiULwg?E}3&ajao zL&;CVR%Ln7r4HlOO2sqGd3je2O`;n1ZNgSJp0wv@#~06}Q9Fd|XW7Wd^aWEUk`#Hf z)Qe$XM`B=28?*I-(&R^YrZ0Z-uUJ^cz38Mn(JWIjrT&;#W8 z*P2a*9V1Gn%b`jKJAY{>M4(hw)WSGtmz{i4n z!;wjMTGw!7! zpU&G)kIxoHIKHl2$c%Ih1x(dM6`7}VO)!#&r4!+r2zhXD)yjA;Q&VoqTdnU`0Cy1B z92m1BE@z|M)U&6&`nBO0uuHy}A6zB73QOvFzf27rJ(Carx9YFC<@jxeAqKy7#N-s4 zL$SV-q}2*?((cgG(hNO!WP$iDhpn2GZO=8C@D-gsUtg-Lqip#;3vdB#Ty+1po_-;p=R*y8VpEuI6xpi>BD^Wmn ziWit`VBB1Ouv*Ggw^ZH9qa{Nkd~RJacf4rtl_Y7#fE#RwxAx_I+%y&1{u6XLbGnpC zzRNlRP|7CCsDg$#dP;zBCP;R16SfGehFgXyUC6dkQ!-|O(#%C~zkS80#c9i~UbtAbgCxB?c+^X1fCWH?Gx0gz4Zv~Apaj;!#@ z`_ehAAD&s`jP(4jFrbaOBwsW2fYIQ+wetNSpzp$XPavzgxR>yVQtFSe2;+Ob@~OD2 z)A=j)w7$TX2O}Ew>{;OR2d5sl*D(%JgU`Qywb3(`Zd+=C*<|kk-SSJspD*!#$atH4 z*PAOgXPgV&qzZaYw-(=$NWVw9V7e^)D5$tNxC$gy^Gg)e?16Hb!Ep>RG;U7SQ25E$ zv=Cwhz1r_}REH;-ulrniOPHEmSTCW6m_fFh{2-heVZ=+iLpHg-$Wp&zAbdD3*BP8f z&>K!$IWzdv{K^gTsd2TvS_^oB71?9n6A#@6SdrO-Onx+a6fL_``E1n2pBM%=6Vo??mZ;^nf6&G! 
zEgwC0`Av)2x3>3-+{#b~Y|22DZXLC*TX8v3$dQ{Zv!*rAVs6Ev*&6-l8s}-ohPs|(ui-$77HUBYWP$nq?=}8c z$t#%_@-X;0if-{NK!ChV=UV9czVoz9LduL4#HB8N1^mI@j^pnAmuD;L_cz)Es*<(w5$Y>(p@z9lV;`9%x0CcsM?muR) zmI3^u!?sjHYab+6N-)O*&Q^)h%9{y0gVo5k8tDoL?OXvBFf~nXX=TW|y+T78Qa8WA zK`D%ma{GIGNAX~XU@}l1#9!9Eie?Fl5}2rJaimf4GhMY532S_NeMZ5K8CE2t<&TY0 z(0*ulg*)prBwe@K5!A;c%-tQ8-ZOV9T7~)>ig!cR-p7XeD^nkojWAkQusfH07X5s_ zHBWz`Wnh3p6AZjwPn*hj#Ykei;PAFZX9~tFJ+l~LpBMQCG}gfK!g=I#T!8FimB#$+ zLLljw!40;@CZwpe?;0pUrAtz`-Y9EbA#H%27>pH2E|@_56UW}nfa4py59Ku@B!)jJ zX+{g$2ryk-jb-(FLEQCw!O}w`kNS9S%7?7H7SiEoZ29QcR4 z5==z$s*HKkJSva`W%Uu?o4l1XO(xCSCAB|AhBNzd&dbN=fB-?}CWJd?W<%b4!zj57KzWX&J;S{sd}F@k7?`wI4*5rgiy%*aWCI33K%!-o9D?`&n8Eu00o$1i^t>*Y zFx@hZRHKh-FWMZhBEO4f_+52AG2s0* zIhZXzdcXTQJ75X;S0yA-%Zzqw4V0g=m)wM!TPN?coO`Y=g|w2MypIdxwz0v}y8=Qf zOKEMMcu|I(1kOi?PgS#+Gow90VG9v*ZXvqru5*Lz5{J`3_4C{GOVo!6b!K&$;4=r{ zcS986XeIejZlE1oD65azmX$C9^xiXgf?9)x)h}%AoW=RK^d-|b=R&QmeWWR{=`jzG zZ!a%_^VwX9ompqFt20}`s)cee?-l^~1$b!!QmAoDj26Wv3ctcQ`*;bhsC@A!(sO)Jn1mDXzmNs*d#Sd*b} z9GJi1sK^Zyy~(6=EyhHH3t&H=NqLQ=Q1Sr3{Nzf_jcgmpz5Ok~0wJ~*rZ-P){taLSHhFXomFq{%$nJ)pO);`QqX;B3A5wAsXpwYL zzxC))?CTpK2TNsI35Nl`QWGIH5RNbtEjN|T9WT3_ZYM$LTHt;@ezs;fV=qxy4J37r z4wR*O((hCbDsT34cblOIMw7y3v1e?p)FSksXGYrbYmZFNw6xeXJA;%}+oSh~*9j8# z^t$z*&6tv?YyeR4s#y!Oz_2Se5WZ~J$>^9Ic-Ju~+xrAY9s61Q6bYG-k0bel-G;NB z1(LH?Un(3=ydn#DPN*F$L>$>E9xr@1?CRKW-1WCICpw^k-K9;CpX9un{cWbWnO;Z=ZSK%iBPL8 zfb)rGqro5}CeOdZsEtI)`&}G-uKfi9!Bx5hV=LGoAW3i)P}yT^da=7Zkn9Ut;*FDo zCc+FdZA){Z4xU5N4^JNgU>|Tw%HxSoWD@8ROUjbieBmO08EBE~Q=e#%_>k4x=3`QjI z?I#7nRl$kcQnk0(`62!yx2r`N+*=;+fHV8M?ujBz&t|H}$A<#bH7xHkCTR)zJl8UQ zllW-)5(RLaR6IacTsRK_c~S$L%qWgmMu-O?>LZ_x#2BABWuOMz_5K=#6>@_R8`)a( z7)wW4>L*p}`hV9MpC4>}+KX#WixttgkT_x9pz3-&5`5%LP*KX}N42(B4C;Zsf?pB% zx+}%C)n}Em)yyzvN%H@WttO?%)8_nMwtS9K&eKp-!8Q1{ z#7ckz2@gCO0b@Hxd`J@EcFjF1DgKwWaoc{v<|`#dCE}%fVJDgyU*ja6pr*-{J^THQ zf}3I*K5dm{{%3_iFZUiB@0?xPjuV$pTfMQ9g>;an^~o*8E|##-c~aYMdZ>(i2Nm@5 z-cG1&-F3r?as%N#CF8;4`ZsyYzaHH%TZ}b`b6B# zD|=%~kx0@#Tpj+#N!&rH#Bi&CF!Wt02IAZBc{gj6(_eCbcdY)U+{%5cT+R`3?7Fql zNe8{hC$Om7ga64RcidPsEmx?Ze@$sgIQ*B!U`&d)RYob#WYzlL)*2Qza|tKSODtT zJQx8h@JT7~wN4TY=|}?LHMvG?48c}QNEu8%s*RyH&3r$LE?xQNp%R@<>1prb70;id z(|gMc3D0ELi_08QaDWiD+dNjiVD}jS9qvn~H2>*~ebR+q#j}y^E*d{=WC4-?ccH$z zKf!ORTzxTEBQNhgeufFe(%D9jJv9o-YN>cT8ro6CVA`pBsv{08PBY27^Q^!OC-fCW zjGpwRm+trVwHpmb{nzIcevu;p6M>iqpoit5PL3yE7n*y**CWoNhgZfqAA#Pclpgb~ zGqu33^Rrj{WIY@!8C^N|jGJ?uL2S3(*TWbdJpc0L1Yr;hvJg6c#fk1r!K{L?ZH1WD zdL=M^1T3u#WV=i*kA%z_uwrddX%(i&AI!yV0ea4&wJGj&|FMgDgY z?sMmPq_u~a*P`pc@Pp^TRI>1-4EdodxOELP{9r7k2BYm&a|~XRe#MxT`m77sz_@}> z+rrLJy%XG6{z}(iAo_WpDnPBXc@7qxcBh`*77$LZ7N%`p+7C%qY{Z6dCOxr|B$tB7 zmd@=J_Z6=|f7-V^3WJe_1l$}i8nB?{8L_YsGgTss%K$JD54|d`&UJLSz(s%`B;<~) zc(JyrNF{|i5#(3OQQrQLw+ar_TXbepkuW3Hka`~73BXmzuvK&r1GG3MS~4k2=k4_e z69BliDJR38g7)Nt1mU75{=Xb%LKdj4?!MZ)wPE?B6w)Hg-D49YadP(H$SrQ}zd25c z7xkaV`0hk`0&{m|q2gDt0ux`_t4^7Mv4+YZG8f^=nl?Z9`M@9(Y43$70s9ES`&K+y z@*L_(3SSiyK{FlD^r>J$%;eTvU4U%(SmlD<13nq=k2)}ck39@%CboG1jsTJ6-;RC0 zRnG>FPrdwPLDKW1ROedbo9k8H0$L^lq4%om z7l%I;&Fs}sZ&x?0j$H(Boq*mfroOE5^XD`-WL8jdzr?~d!^0w@LDI+6vC2_SU1t4_ zd)JxUM?37}a`9eO5*pEF6fc>W6`6Kfu7ABGGyhxkF?Jq**X}8wJfvWluJ<)IOcR$K zpI5c#JtI8+%=whK?mG7;e^16K$e4OdnbnPdSgt=t}G`OaQFn)hivVokYC0RlOy(;c5~_q9I^2!%1g?=+!?JMV$Ent zl3;mnJ^$2c zN>$4FmBrWA4RbhJpv;rk(>N68-*rKt_1kN%zu&bU24N`kn*WwcokRFlplnwYBf2t0 z!ng9GcoqI54FWVm15f%x^EAeV`a`J6DI)=^q03#Gj&3n`#;~h9Ktho|L zj?T&*u2<5Hy5eqLhB-tekh{FA7TOo08+6hx-~;C0)X;EUYwdH?rf7Xb5&f9sSp6gS z=NyiHTEl|N8U~bsGlqQ$PYhF&Qr??ye%V#Hs+&>co2#Lsa8p5UwPsduqH?5aK 
zeYUv$-BsIXCqG@OHoqj<#j7tNH?;ZB^a*4rTr=j0p@@$D#e6BEzDjiKuFeYId#?3H#@g{(k+00%qn~f6ACuiHUFQ{PKdhJDE)~eQ;ua%v>*1uE zrI@~3u5B&7y6yDXheHv~Y32R%7LN2QHD7<8$nTgSxmZtpN3VwGI}7?tAP#?uE^c{A zKfV|qwJ<>WJtf?)S&T|7!yRl@wi}c4TB)PjxK3AK0<_9YJY%0DB`wp~{AJ;%ZV^Z1 zb>4h-PmRD~Wr`jsZ5_g^qonA|zAxhFy~@_c@fE zYrk4Ft0OPaB^2Lyjs*30dQBY0@X9!0$8B>eC%S6q+>||V-sYZt2{_|qMBYO3=5OQ) z^>aIIFdUP4LqtcXJM83eWsI|5Rbe7Y-o_{NPGR=3K?K`V=S<(H)P1#MFX~Dt(pKhdoM}E56nVV`RL>S3J zrtYids&(w!{I7K1CAcOV@8$_}qR!MEV-2KEVAYmxOSgzI-q_(}_+rfVG3B7RA*e|1 zpdFEdT$?NZY}a#yRBNvv&h4--L*ac_7QFD=2b`NoyYux_^p3EZk7V}TswrQOU`6t@D9nxP1#|$RaG;I5+)^Sa>?;poR1>xaA=@3yd2uDf{6$KT62T>YEBT~|}z@Vf=T4E}t z(lsOp0vkO_YIF@4u#HiJ#qaq2{kcERIos=XcHh@^-Jj3<)3_Gtw(haloAm|B@KWd@ z6P5uoKmcTxY#QVzj?2f<;5$-1rRP;IEmwan-EJr?(-9LYOCb;s`}`HHh#VE~Fv@h2 zMFo$?rx-W?bvdzS?wggk!#I|384XRzFpmPc6KOYHov(ZTx059Fk@4S-h0Zh7!K&!J#50v z-rGIwuF+9{&=J4x$dui8`Sx9bYBG5fE!^WjRg^}gqu+cU9V>R&umpPBJ<%GK8Bv+Nya>;A&!k1`Auwvqri|BFE<+T&yHtK)$}~KQibdCcK`x92oaGuJ3TyJ+L%IO>HaC6q21h z&o1Qgt@?#MDJ0_%_Wa=bb0ohIv44m4gSn!n7PNzQ+Zbo9@6^s9w;t7@N#CnHXD3Yk zH6ZJ9&uc?bcG9CH{+~zc{v$+Pe4+}uqI5jrxj1vxSL<+*P%%8Xrw#WTJ&_p7~qa14967_x*R!_%%&`6Pa;MpOS z;+CJ?{NE4DhQqD^;Y9_;P37TE2X)3J!^X6bC!yw}qMMiYYarc00Fa^@0za~xOqYVI zSc?DovHA^;o~d2HsReu9SD0|1muew<+Px}o(OGHa7t<5{R)*mOmLyDrrIqIdYDG+wozD zKZX{8laCxJg40mFtY)aXL-KW~$Ju5;G=#nk7&DGF|p98zI#;_b}M0 zdXtHa>`TKR*f+0i)@tWqhY_)5>%*tF(eV1hUKc8zu!EJL7AQJBcruL5Q#=u;&l>uQ zclO8W=W%aji?Xh640~1jk!GNXi<@?=;2ogj$`+Ebv07_?T3UX!#e)) zh4zJx)|iBfXA_?L%~c=OGHMBP?lfOb|gUU9|JWV-`t(f1@~!+(b8 z2`kx5<_}513gpmhg#E9DtTUOdX>QO`7A2)1Fm|v5;|!UH7opHmGt&-2UAr_O@eXH(7PY=whYej>E z!2ELExmuCv0Tkf_@}O{@O7NaBj=efFLJ$kyD6i<|Ki}2NXP6RSH2xHr^6)j8I#g>H z8tb5zi#{8Z`4MQdIq{;|su#MF{yVC~dM)~CF_r|ZdCu@C8vW%vfqLq`vym5iHy94f zNzo0NN2q&rh=`7@C23};AB^ua=MnlO%+!AyvT0o!Cc9%q={%88$C!;T<0obp#SVd2 zn#w?k@ifCI-oebGS(hI7zNu^me4H;+0rF}QcuzkQ_HHt_t|t*92G<%^$6Tl;z9S?x z`;)$=KN=Eeo%6CP7+wxP1+HPEV&Zzhqh>5)` z`ByYTmVzDQU^*O%4|=H$Y2j9wr&DN=eXG2gMh2Sn(}J%I1>HVP8SZOCCg&u5wA6xj z(?q}5!b=WvZ=R0!AOEdItRFLqXZ6*umlwW@8_EyNO9&R@{OPDE-($hL1$)ux0=tNJ zacqQ)tXyTm?eNhDQHgh$44g`w3FJ*~6j^xLizlPP|#!%L*ia$8GJ0WAEpe zA5|14f2UO7<0#BAOgTZ{b7wPU1tz;e%FB?Bc$vZ8Eg5y7IaYRcVemDMA8nJ@aNO=HJ1^|7 z?0MP(?Q0Lj9rx!I457dQ?60y^B) z#N1&SmFZNmU!qLI4ep@7`UXs*P14AvEd;RW;`cNJvX(;b( z(zkxl+E&As{{Hllt%W`2U)P58PK4+~&Wk#QShV?pRcX@31EH)eP z&o0Lvu;TpaASq`t%20JY^UIvW_}tgE76!Kg1iI&M;Zov1^yT5{{Oo{nRcZ}2zMY3j362G^}6GJao;tn)X=o zK6{M5rL^Y7N}7JB{bBN&>(kCk*CWvfT9cR)HhktU=c~J)6crGVup8OIm5oeoIo}u%V{pd2_%2ghVR;UQk(V0&&_^^cxtc-u_UO7by-^ObcE5 ziXGMl1oJt;>veve_aY0vN)Z2UIEt?hzE|)3tLGxzxUgNct+c1G|ub zezu~LpPF;33r8Y#=f<$ltXDGyRjlXWg53;w8e<62z%!%DGdcR3Zfgn$R8L?s^Gi_U zYMZDYG1ruj;$o{sHiZ>S6NN~q?DT7fzN=Ymt2Zc~=oF&uU|$?Bt^&3W?b}@oC%`ic zfDWDM_{hR{x90b#mVvHVfS(XgfHOTCwxYVm*avrqKi-`Ur#xTdp>f%i&(y!gp zotFnCgOZrME?y}DjWn9+WeCv!efCJu8?)%{pKf_;3aL-!G*~`|{;^-9SMI_u2Pq(@ zz46l%`w3h#*bXaIxYcTNCW<32hJe+jKkLqTmlhj5@~dGBaHlZ1Y#d)tx{(;e4J4>Q z$dvN=7`U96lP4$IF&21Tlj;~QzYVkC>*lVG2R}^U_EW#o8>I9TSe6uleG0$>0vYqg zzzs8gh#ZNFQlnG@GNL5gxdNxX4K1A4oYWaVz@KYpU`Ox#+U<0mv{b+c;ws*+ga9dg zeaK7oS|0zz;d4CH0a1A`>iI}9Xwcq?hOK&+N!yssp)Fv9;2>4$tV@VAgll3t>Ot@> zkLR(2xd|G|O}mG9*P_1Kb!E(bhih`FI)>)oESV#Ph{7yCiyU!Ui0{I)7`cneyY}@< zzk9Z;=)b>$AkCu9%i;Vs`rK2jM3S>N`sov1i*fT%MbNsXYFtM3gORcQx>JS@*JQGF49r3BDtz;{0P2qC6Cu^$ z6h&vcy^AM1uE?9HrU@=AoO{^1*Uqq?q2m zt_FH$QaCT1FFO^8z;d^6rKMcfIk+Sm(8&WL$$TV?B@XUq1bN;+?AFzyj!=U=M1{EBoc5hOf0TW!?0dMdh>xDOE@qCx>fXR zb&1y?zRv!sm{usM!Z3BXh=*0rnrRR(NV5sLTH;Zx3+cP+g{Ilq6p<uSoq z;$yPtoezy*1+UcL&3-(IOKbbJ6*pwoJ5Fl>XQ07c8UWUbniNx9<=Tc@q@0>9>s_U6 
zPRPPU<7rhOS;F_oUZ~Pmg%nN~3tj4Z%0$R7L|;s^29;F*ep2T_PvQ^F{QK%2(Y^~-YzsZ7>Vp;Na-ohfz21F>3|ee98P{5 za>e}^H|C%RFxMT--o|<@_=ngYxf5H{g~tKjkqORiiw}kW^!wVur*M9=JJ$Rj^S;m8 z_2DrUhr%!V)E z(6QaueSg@n`4K-P$Q6aN<9U|@?-raesg(PWe)yGLEKZdi?dQ_|qUa%~Va^fB_wG1r zbS=zGxJsR}zs19Cp{Hc2y&K^po*PfIXP`%g+?Kta&NmMn=kPyhNUVzma|_EWQu_nu zBp4e9^iuXlc@l;t%m;{GZ3eZIKE#(UgX9x#&XAXJd+>wu z%G06i6qg|iHs|AJrlI<3*gGoJ2M_qM1u(2gksvcvbaT8MEL|=5m5p%)z{=JYho+*U z1*Wo0?Ujj9>}_e_{T&W-aU`7n&_?Rhpbk(Az>a4w5AUXk&zW+mk_UPnq(+D51NZPQ zto~h-_=bhUk7j|{K0`kQeLE@sk;PTkI6yb>pARS8AtyTqa*L&bOf(KgX-DyoaI(F(biC^SY4< z!4f5PlNPRTXndV?VJz@hx7jicj@T+*BhmU8Yg_7wq_Gl5>oQVVc$PI@_zNGy@LWPv z7|H7mUlf~N%$fFP+9G7=Cy$1-hTl^0^8GLlosU)XE0%>gG{bmOQY7A|c|~1sf=fu= z$Ne+UJ5ufZOHypt+z`Y%&4xp2d!B!`prOsR^q-_TGKhDm2(&6&w?0|z%NfOPzl~Qu z{sJu?(^~0xhjFS=}l!AeSw@*QQ$-hZ6?a{gtv+iX?gA#w&+Gmz>Aa1{8j<}lWHP3EQA5}5FSf4 zcdgkmUp39t&3^rA6psO&of7BTp3L0ptQ3>Kxis-UGT`Mv6v%Zsoo(ivZoqa1!MU!N zpJQRsP*b7j745}X@7^!my~zAo1`RR{j}*%&(3w;Br{q`39H>h#O&4#~Mf1g-4WXDV zp;;3uYsqHg8^SS1M`C9)rvk8ku#kY#0D5nt0ZXiGvm_>Si zcz9O{4o@pRAE|ZK5hTx7%l&X*_mQ+!KYFLT{Kx(_Ls#-H$b|irc(#8)?0k*NxS~6G z(a);WN?3E^#Vw(3zHc*->DO){vTtK6k_y6^5d|ss3yA%svSA5bQ6yh%ms|9*$y2Wf znTHi4=We-ZZlBkau6-sX_$B}nv}!2N#h=j8I>DouNaWy|^C+j8xPEPI8-DvP!l_P6 zFlJ1q4YIAE3%GnI33HN_8nqngn*Vlc%$quYDF?)MjLkCc;1$e0_&7Ri!F0vVUq?1; z1gcCrm-CHx@1oYw zcz{=w_yia_>M~0Q9oD(67^&AzQ}5PGy=|rNY*b~&l+pcBHTtzPV11CzyHp#y6h*sr z`?|8eb!O{z$BsgS_^`DOvv!++zPLJwn@9QM@wGI{QG}xB_Rd`B{e$LQU$kr^-PC+^ zx4C@Sj<>LIzT!Wz!IxEor|SlY3Katl^vTSjFN3^34Yx`v3_@-nU*xw5Ro4`Jvob8# zoT@zSe58nb8&lAs?d;XDK#LvN&tVr4YM$?MhnRMJYrm6&TB>w?J-$pHpq57)FP80v zq;o8QnGx89PEsp~+weSu`dr$%v<|4jA>hDT7}rxT1l?)Uy&-#;P6{Z>FFZQuIur}ez&l&fX2e-<*6z=mSZIf1(-;%#KPo-_WyuU^L_(Sgj zx-g%oIZSwrKl-l!=6RLJX;v_g9Q@W+J z`vIw|C;y(Y$$EJRtcPK>RiLtVp`GU3lsC%X8-O@Raph??dnPFWBJk3Ch zv=9WRRZpPQ^TuXq*R6Rgij}X)p`N$zxh>OHOQXd5_Vq9tUi4Vb26LBO86YHSIuY)_ zD7w}JMD2MG2B+Di$i`Kms5wU_U{oaSotAOofjPq-6J7BIEWhw#5jNJekCRt0{hX9) z0bj1((#d>?7Vq&+6X?+NZT+f?1ZqI-+%f?AdrgKH-mUKiE2 zRK|xvt?lOJts|eOC`u|mqL?oxotFHuuhu-(zQvi&FCSqF9Kr1{3n7IJE>(HPq6W3l z+ql39;oSSVhjPQYGDl(<_^c}({2(3~IO%|#GGwT#@{vif?HBD~rU2jjDNdP7gPv}N z-@HI!1g`CciuV%V`~hevn%g_nI!MqpVuRS3361Rs?X02|WiHK?_aX^im45<)nKwcE z9{U>UTcJBY$l~(^_0kH}Z8b`)?{9;dE+5gMf5u)p{I2d8uKq|-R^LpeX>BDdRLM># zn_Io6{R#F69^k<`12Ml6x_O4pOxWa6i*}WDv|#OYaDs-4xS&rCz)zUq;^PqRr9MVt z03TM2Pcom#*XDe@K7s>qBt(aOl{Rw{rps)?p z7N?|pZh&;gIA&Jo8}&*%-AopTp=I2#`QebDsbs-mn3N7wZN<>7zef_YeY6xM#|Lo^ z7cu@&Yib57DgQHj3MZvYzd)QSm)ay^Z`h4(2q?Fdc3_Z*BZ50G#?;9)<`nBzdyutc znfY35UsYGMoo`WO&cU}?z!9EvtK{K!y(IKb_7ru1cVe`|1$slA@;EllSFTPTx7&8# zrtfg#L;I7M2sxDqz^CX{CrOP+pvpDev=)IoVyE}}4+j#$BRG4;I3jG_zq`C17t84) z0}FzF55R@9a4OjyYO7G5lLGWz_Q;jUf8UkWcJT#6XNBA@9Ex15;G-pi5H8kzr`z>0 z7tKkP`|!24ffMGM;zY5E!k~pdZm^zP3EF$~g@bsgw%>>L6ljSO*xIvM#yYMWe7ARx zY%J#(+isB|(jVhr0zz+LGF>u_Rz}Jtn&FYtd;N5|riGgmq-5nMnU3`nqw0Bb-?C+D zXRqSl-}wjZ``gvUsmZeJf;rW7}sLgHMwwFYY)51O@3X0$bOa6iG5>Rk@k_gPs4 zaAVru2U%PkX!kG_>1eYXBRZI%M1Pl{l^JlRsp{13FG!ADo&l9)Tw!45T~kxVvMr>? 
z@4AYrd0LVx6cC0w5@)66yTp7ziv&o3AiC~bt?N-|Gl-&R%h ze!RVX*6!qf_&p6;FNbS|fV=jlfMRmd@WAT=x6&q?5XmguUsJwvLYQ-iz#lQ5r!bx* za-IB>wpAMhwaL#8=-8!lQp?2QOBHJi0$&W@D_F>0bJ$fZ5|4{+Rk$v4UJXFc3>)K` z)3=)Uzxwsl_zU0JS3<63O_jfv@5|Yg8;X1k3Z6M?kzA`cYd;@*Z5dg=@OaSf3OgUgS~y%lUA1oc_Scr@{Cdx}Q1Uv-Ag)WewtJ$x&*D@k=<26AHWf6)Ph*rmJ zX-g=yzRbqBb8TDna+C!)OGQ)!I&QS~d5+YS-V0zRaC5XlKkbu?i;|CodEyHNxi|l@ zE(YKlfXY4x9$*nP*y{S#Y$`{9-Xxeb_=M-Fto+NDE6aeAzPFq)9^QqB+Rkr<^bBc`*d8v{?l5Qa+e)}YkCSk26JV^ zDfcZB#up6@AaH7=U+27T{u<9f6)nx)r~)wBy>Cu4u=H%Q4pG?&=mA6r))nV>=et-c zQ_xV<8>h>*YsB$R)NRlE^(*z6LsYpYKCu0(mC!$DUe6wpseWcBXE4{A#o!xjJ0{ZZ z1_^BLMP|LyJoltwqnpggYYxExDKH-hOH$fjixxoEV$!Wa$Ma6{+bH&IC8js-Pwuzu z7(CtiOA7RBX^ku#bC`d`)QlVEG1_nM8XorGc|8q)P+WsLnJ)8E%2DA3(x$n{GUpaHGmmX%|n_@D1d4@X4C7NXWPw^2E^1qOxqA zhvkd3I*-^&W0Xz-2|*9u>*CODxn}uiit2juog8{EAmlb*@n0qH-a{#5hzTmvd0Rx) z+u|g|*txzynufgCJbhs4b>i4oy4kpa9=7B1LdBV!vR{Xj8Y-GsB(EPo_%WQ;*?c%{ z#+Ji(Y<2A(<_PfV3+CL+7KoAm4hQ@vI8xMgkoVYRo_r zGn>e^d?O~XM>6y;Dyz*4a$%8`GZjG#qPNdUYzd>+`5ayXLv7_pK;`TcABWM&HpTu1 zLgIt>xU?ir&r$a^=RUfdhHlg##Ej!S`kyWilW$ZUoDmX@i8<6{44{(?R26W!w~DJR z_SB)ps*G=Dj(o_!)Q|#hd+%#Glb-vGp=WL$pCT~7-|LSMHO<+Y;D(b-2on zF<~KIj@W3uJb>QJrJk9E*{SIdveDEoDEk~?$f+8?Nz_mPQx)B}D_vKYQWgT;jy&}j z-(tV(MZf=~QE65obx>9vV4;WG zQ?90+)w1(EBPLav9&!3b$d4e!c$EqFB6kHT?0x??>A3O4LzrJ@NtX#wPXN_6&+F~)cW>Xu4lr?lB z#T+p?;Z2yn-%N@~P?aQ>w$m5>3lY@b1rs3G7ouzKt03D$$iJ+#M2nlYB6!$g84jX=Txrw<5(VYs1WKL5gB|#I zVI4+E`Cg7!6V*tKnnm?JqW^+J2aNhdP8+5{XhaRMRGWYfF1+BMT>bsjkq&S>*Iz8l zDs_yeX1*{?@8hC8AUM8?v*o&lldap?Ru!w|QoyX4YYF4mW#;X=t2?$EU1MY7CWuKb zXo3CGvVVwA<>$*xxI){AM%b+MU|Yk3InvyDfVK6!boh?SSo6Vl=30IFSS(t?XhzMg zHtNm3`rLru#@WIpS^(=EcMm{cOe*Z_R1PHo(=q>~gwS03W+QBDN7|E6tWcIToK=W^ zzYv0;IzSNk`sD#!%N6i8deUS5uVD zj?ymITFt$F(0VB)aE8X;TUoACfthq}=+>I|Z;MmuVHzOXi3?Btf4oD^vCk$6RQ7C5 z1U49;g0k-fTcFbZf>Ie<`>f=#A}O8Nil@a|#l&RSjm3@s2Cp`kIFy`<6Kouin=orQ z?1OBa6ktq?S9$$mcZi>wiBs*Id~BUB9_LX(>_d%Ta}dIB{&Mrw)f(5D#c=DRqK72h zLM~x@$`1Wec!kSie|bqmsyyu5ROs8i+BEZ+LbIwLRn>&xUC3+7#X3*MjmEfWvoW=; zujOUd0eE$sI)1ZUula8Q)c=l4^X8G0Koo6nA8|O=%aQq$_$@YNc546i_yNP*w9k>^ zkbkTkwJ4T5M`m*x!`wIg@@s9ylHj(h_RonQ!e=Ef41k`rc53iE_?{_oT;q=6iH9Fg z@664eLY7WL@z!he*qqOge_ygup8hLhFIeq(CwSdn^$dD(>u6}4V*SQQ z0=@f#(*mK17}MlFDK4JBC>iyaE~hX@p3EdUmHXFSH0f#~1AFDCQT*#95%Tf3s54Rr z8DA8$3TNLoYjdgJG_-dsuCY1YcuGt}KzEd1_bvp0oqFap8b+UL`yIp8&2?~)9@L3D zo?#VsU45f5^zcuaSMRZ3(cSMGDAz1|mqZ9A-#C=-)w*Q++MU)s`I*ha0BKYBK0d~J z2+1ObDBnoWIwgPCCgzLCmvBvd=dW$kF=x?ZkB_nV#I-L6xmfa9;hGMfaIJJ&{g)dT zV?XHWeMvwHYp+$MH^FCbclxA#8U1AQDO(bpY<}9Urpzl0qtLG=m*Cct`(>#!9ZDWB z1s^7n?<1f9RXAih&i%*{0ZhAA;6i#^$6B9$0qArpfL$^J>Y{)+^JUbF*;Q~s%bWQy zZJDk6___33K`Scm)h~_zb9%-eyH$B)Qzy24g#chnRnwYwKDcck|2b|9dBg~FvE4Mm zZ=~Y|mF*qtH~U}ZMkAfo!*n=)TY9qePt(AMJH(AhOOJBc?0f!@`LwvJnW7z$K##GR zbF_gx#(Al0=I`K-wvoBPO>}67Y?r_dH;%ijZ#%E$YeZDA*S*+)|ix7E3 zvh$8B%N^bwoxPK$PW~ua*?-E3@Gwkjk?rLxtJ$kDYpuGzmkLUUkN{JB@|a@(z~Eo#b`DpfZh z;jofJi{mYKRl+v^%?^{%ik5QlI}#1-`I11n^6^Ds==9qfQUCa4J9Y06_7?S93A@VD zaPZM~Lp8&o!6_4j`Q{!ro_r|W+TP@pJ7E5LJEAQdCbd$FUf`J zYLU+hoOnMjZJY+`Ec;yLiKfno=pGO631Q<_w(dF+Wfa%-K?_znc$p8-80)JO{P#Q0 zMTd8Ivt2zLV5cggJwcGQZ^7apk3N~VH?2&{p=l0>d)%xnm70#2bxRq0PKx^69E~h> zNFOTlDlokqaXXN_eCYj`4eXQY-uze8I)F)(6py{DNU4@X`-QR!nY^vQU4Im z*ZE@dz@wV-yQfSS>hsbw_u#|v(fG`NeE*9qt!(_(8(o-{xHt_MTb&AMKMfo?$s+`3 z+Y%)aEoa`45`UF5)6h>2kM?&t4!DZ{E* z@8Ia|h0Aj#{+c;g!Zd|_b9M(r?J_anDDDo){UZtv(fJB&9A38#Vq|iE_oxye$(zg% zMTl(1`#2$-^15@t&axo*RW^C~-&0)icf#Vyrb|Ej)y#@WO9V0SUILT+aJ1cCWv|Y# z_^ejcA;1*J#on{>TYHg$ZeS&Dq^;tiMz=3iPRp%VjE>*1S7YY)7n=Pkr~(Bo$^=G8 zi{mO_Dd{Rza&4rI@dpyhuG&TFf1zSC$vyKE5ubgOe{>o_;-f3h7EixSlJT*Q+vVSG 
z?DMvjnKjzb)6{%Z0|4w`s2jp_%C^<_0>rUG*v81TtriPV6g3`V97pm+6x{Yal6sF|#k1*TYn{#vdru3VnPnY&(Zf^M9 zI`OjdPFC~5r!^8{yXw2klNq}vFdtJgcjg&H!G7d!Qc-E01WC=({ha&8j8K+bnDDqqjN8qvGtwQxOUfZ==ND!Lx6OP(=&5=Jcpq9lmy2CkSS}NQ+_VNu>s&56>@`V zm)U{ynI_Hcx&NepZE)4=Fpq2SKJ(Oidkt|u;23snmZ?f2dn;yfzFrVV?8WTF1h3%k z4dact9)0PT0zy_@o}~ZBoEJx=o_L)B=W7cpCN|Z!mjV#hUFjr+wqp!>8r3;8$pg?U zr%BrBl#A68x($v6WoWYlavGK?d6+DjAQlvtw`p&Lxl&T zE5`;%v@YGa_9?~sxvg|&BIWm}qvaWJ*zK(9uF(Vevm91Wl`(nlA79wJcUB%(oJ`M9 z!IbNNw%7BY3DwTg1^Qo%hV!4@DoAgp#~L;Ll7~RKi_=C}r3(1>6$}PUqi1TE#Gq+k zqqQpkE0>IcG<@BEKt8cCcMHG+l0{B>)ogN2nD(Jt?g#VsEEF^gm!c0JGns*Xd3|Ze z2WO+g&KG<^&ASp7w3=@cD9h*~LbDyP|5>z|#T;FTWgK6bt+R&@`q*DkwsV}KwPt2sB| z24cyfx-DD@P25k#J8h#o{%E~N&UuwJ9@g*pNEb-Y;l&GgIq@F?VJ+{9@B22S_X*VS zTw%^}0}<}#cloYaUH;I51hplW?T=@JAj{jPMB1H}O!vyd@k@~ZFrO%M z7X8IA+HO`wpHzJy0L97I6+_&N%dC8=Mn9lBNJFz7# zvY3)eO6+%etq(8SH%YLA$7(y{97FfD3zzrs844ptEf#mhAO-MD3BJymlg26;B0-(= z=GSuSfaC)ox{Pew;+NI5+Gk<>%5A$T8npYcR3o>x?r%Ay*8nNVYwI84&c))3M^U7H zG`TYC?y)N^wf?#RBx@(!k;NWod(Hgb$zyn`xaRB+ z0G#O+lwNco=N~AsPYf?^>WOXkN~=^u|AFSoX3UBkygiLjDx zP7WJ_VNpp%!0E@u9=%CHmpN=pz>i+OJUv>OajuwgbEOo}n{s>{>h%l}9=nM%ST(mF zw5ynzhjdBRUUUEow9%))v%B!8M!9g6_4IBxpQn#A(S%r)T~4FN z*H=Rl=dv}L+zcs)H(_!%d zhow#(Qfo-Fo)FZPH)JK~r?^uN=9=`cwO_|8z!jRlb7G;SwB1h6V(~a5??#tKi9)3N zI4twW-g z@Z?(%C^d3pu~DMo?vy&C*mu~H1W!*2YlP`VG;G#EN-bfgR{kET=T$!yhPaQHyAsb* z&xHbN>>MiUQ!R|&(r2yF==#|IdGrkBu$PPL_72w? z58T+b`7*+O3bOh-zl{6Elu+455>9ZbX|Su=xVAV{xg5q|uxudPb7#+N_MdGG%j3Y< zR{OqWXS65bI{ROwzB_hwkc3+nZIe&*&T+g%uAtfwreP;48cp1MZk9R6orkfx6E$8h z9ltaYnk9&Be}fiiz2kD@SM|Wf??1E!1K0T`$@q z`uu}`kCxfvcW&Xsz)mYE-VVp-N8af2UGsa$cg=1t;nbVQx~FR?l06Rs`hXWSlN1^r z`1N6g<58_;Pfl2~aC^}Ht0+*f!Qo{+hGJLe%|$H$=T)6GEz$O`;YR=Qc$uwVRI%V= z)$o%tnI{e3!jI)YPTo{0VjW%4a2IL}efUT@F8r7voUw@<0%4I^*!*6q=}eH+yV|WD z@&G#{Pj~y0A!AqQXftp+Ndf+Zbu!xtnhNU1S|oRd@LxFdMEYNkrviuoyNNK$Z^Q3x zZx4Z_i5md@*jn}VKV$(MMjwJb-B1%%eSN!L1^LP@A9-Bh*azj#sDzoIVVNpxi)!dT z7SUBE!)LAte5uQf*To-9-%nh=or2DDsJyHPwO}KwZ)K`Wzfz6`pZDexxCA=-;Q+Ie}u=LXS!j!;8#mo~<0rN(w zF}@Vp=#Cu9=PH}FtpP{~K*~hL*CmNh z06%Pd%BgWYc{#>Wm{Qj8-rZ@+<~7A>LTAD++I+{U&aAlG@7JoSN?t{bOj1?gc#XWJ zSA+3Pd3nn>tU9S#D}g=p&jRN}EPXnd-|92rWsEj;RljyBqxER*7WO!~CUEE-w`M9f zhO@aAP+wRFbSOWTUk8w+C%@p>YpMfs4kJbM%(h>90pJ??(Q`AnA;)&e-u z8||@X=?rjHB!%}{!LIUzatZoR^IG0;|6@`|maVremB-c*gjnU#CD1x`YR9Jw8U%PR z$%E_%GB7XMwN%@bqdf3a{Blu`-A|^37cQ|0#_?5+S%n(Ur2m@x)Ji{V{83-q@pk{ zJVGuhBFd9jU0sne9; z<^hv`Q#W7>xd5Rb_O0(f-y3zuwEqz9$a=w}Gf+{?pKidEQ9+Im`=Y@0)M#p)s#CYf zcqihkJnmEW`DP@2QEuDnD`b5qAiKHyr#wH~AF{LbFp$Z9fA-g&^Lfao%yHpmzKuyt zQm;CWRK>bC;l<|Gac%O3nhQq@x3>t=qC@M2a6Wes<+Njt_HSfjhmM4cQ?JozgjX~5 zUc6xmZ?YWjcr1DArD*1R<-sduAAd+zCD#O8AdI$JsL4=H21k1puQeKQC+{~NwA1rt z9t4J|*!C{{81_A~cK8}*TW zxQ6}_GMI0=R2Q|W&MYApqQiDMBv<~?UbB?c>g%MsdrX_jMO6jjeUZ}K0Ux}Gn*K3k zeYYigr?&ienkRxv0jrUm1Rf~e}^bs8q{GK zpI|SW`s1Z1yL;kQ%fx{irwl}f2(nn|(bF?)OTRKi@_%KHP( zs1l@QIJo`ndil&wV@12?v4|e*onq{Z17~6AH;SS>{pFyOdY(;1TO<;m;hi9_+zmjx zlKd9{PH}d^yC3wvy@{>H4velQYoSP$=_mbGsjiO~w>`gf`D?e;-?O@vS|~E!d!bJf z=HNibu!|!yU=nn@n4tKQsvDIbue8M5ox7`dcP|;RSsT@W+M7%IJPeZT8kFzxyZ0Po zgm&I18Q>@}D$}d&&I-Z~x6sYZR61u}t6^`jU~$bF4=N@Q_kn?mC2nBSTNHvFgCiK@ zuRoO+Hhc3BS93#%KpXme|Bqnt`Pp{<;Hrydqc1>Mu<6nwtV+dvTiZM zW*+=EvnTobBwGO7-XGkh5^TRc_Ewneu>bBJ>u(B}Iy|ZF=(y&admsUWlw-&J^IdyH zWH7&1?-;2MaQ>m5?(724S5m?IDZB$`OTmlV*7E(1mqDAOgx-A4kyw{c zF;(`c1v1eYHY?f|eMYT^p(7TyB@!X3@Fw%{*e#o@G&*3Hf(5iaZrfpnF!&lIMm+XczFBzi70IpTL%ZWSE5#UoIa3; z6p<2!YWbCjAHT+N+vj@ahUIUj`PBr_gh_5A1Q+4^vixGNeT+_cC!g}T6`&cnoXMdSM1~m2tgwwqw*xw{c^JmEU15pr;&^v)~vB#3$3N=jeLHS&kJhD9b7cC&M=j8*z z`LV8tCJE@f0r##5^Pfo-owjF%#~Sf68f^5>34fe!A&K&FT)20~d>$KA6IB^i@t@MQ 
zu2i|NW9v%YTELpCA%!=FEd4+R-PwC@Jd8EkC&5`b!6nMIlMSsgOp4RroP!mVM>EBr z@6OwY$E>UB(T4wiG>1Y(V+y_*BPe%e zsK5Jeea=xS$K}3kzlY+J-!b?x)$-Ig)olh1&e?*Y?wE!4BCGgccE&6BT`m$-Z*O$F zl}2{FM0%yCtec zVJ}u6cIrLb&EAs!`C)$Wn9j*BcJ||VvbXUtKtS5zwiU+VlI`&9bCoed04ehg_PfjK z$e%-*Hf7@F6DU|uxe}P)61uiyb#P`ep|8>-go>|Y(9fXVsPozT;I93Nf;uG5Lrq8h zKyaNCe|F{jpkUsyjVfz@>I-PC`*Uhc+^E#=8ILpFYAJmVU)TDSJfa@6IRWWF0edAu zCznzePgNIcwKzKT!+d#c(0SAtJe5WEMX)1NIyL0MtgUBRCy8o59E||0Tzm4+0M?ejc4Ux6CD(Y_fXn{j*~{+tMN`UxB$0M{GZPWt zv{vd}d^RYA-k9@Fwx3X)|31R-i^4}Lve`buKX6lG1GjIWN=k&ingIA(npMDy!_6YE zz_&G@gu`g3ieIGp*tngxA2=nJ6T0@N+Hd@XSm(tuzvI%!+Jj1kIHr?Ax>{G*Sf)+;xPp)u7*K1X@lE?ToADz3zA$$z3vRITjqHk(L zEThA(>LD1XS5_79t;0`kJy!9*q$K6<-NpS4Tu(R)J+9W3-AGw`>bU3C#GQT%HNX{Y zgXkOkgIBG3TN4EG68D7ynH6H~F1lhcXzEpAwZEWzNUZ+ejiMplsGXi`N&XSL7aP(t zx=k4XMpg4j?(y@T?y(Eyv3&=(w2ed$M#|jPTRG|!x4R<7%zT@Ak)$H>n~u>*2^bFq zcERnJv#rc!U$0?RRJd%7)A{*6LE=f_Q_!RetW?hh6C3f>DbTB>lh8nVt00iFu$e%r zl&rCJ;JX{Fo*{P=%s%ce`_cW|d9&F1b8`OWqgMCO zIk)!CB-ly?Rn5Q5QNXhO;5Q2z+#u-1ns+~UE7w}7sZ%I>^J^y`DgNH(O6P>FRGU2n zx}6JR7H>{ySv}IiL?oPE=`Oa`+~7jEv&sJEZzUAppcF$^&dk0N0%or7+ij&gHf*ja?X|e`h4E+*9-H=&ceRSg~@A|;^Pg+YS_CB%aSz4jQwLp#F~a7(H@{U%j^gB2*@e@$oewqviBY8vFp z6OFlecGs9bonkqHKb#Wbl15aWkrT~R+?aN$4gMexs~OFnwVJp(X`VODvQhQj{`fLo zI@b~Q^X&Jr2t4AegOpmX@JDi52;x@%Ml{5xm*T!=PC(>bx2j;T7NVt1{^8KAvk?-h zSQ-nxxx0wPv1`J^Lq%iQtGhEsnQTU_=t|Jc@z8OvL|N?0ns+5N?AK|Q1OGeY+FbLTF^kfA+zRg-tYD!fBAr z-#ol*9QR1DdiZU6qmX-68#)Q*5ARCU35|B3o`?HB{I4cIGC#reaVg!7FkN#J&Z&wX zG9crj4sk4Ld-=vD{mw`YW7hz@DK;X^j07czvnwWG>l+*fG*N2B9pwl~B83Ls@ex`oFrFnJT{d z0eYvi`oHPLu>z&cZrpQjfnJa8u+sNHFBoNi&<0A8@W4&m&bSd{@{qjK&zl!#@osaI z4co&pw03!x029nOascNP-u=v43{JSs1~?8g4FOTpdN;s4vw%h^F-n&LH>}_;0N{KE z0iinB2No`Nm>pfd8oY7os+np`?ZSIjN52laeAKmjYPE82_)T=CnyGqp!^r z7{5~%yB4DZv)YbIk0XmV7P-S#ttSnPNDsP-ntctrhaALhtz{|^qTPSlZw$5@`u6o- zLq7Kl$I_a2X+sqNu#PDc`3sHw%u~HYpN>mb3RO7wxOm8^+M!8<@b}8y1=`Wvz*T0a zp;$@-rZ>k(BIeASXT%UWMWG{pCUjXulT{#LO`ng;*aERh;Vc@XLFmM8M@V#$X z{xHFxuw5q&a)Ec@e0t;03cd{0RGLL`jp30)g=?kie1TQY38_eWNG<72vLcXOOOJBM zY2kK|d6~~~^W4;ttbCep)pO>Z`{(l*3hJ`MpQ}NeXP+g0Q+&Pr0i#XoY0*?!3^wu|JUsC4LL&m0qgEIx$z~vV!I@{Etj{5zIjd@p6+mB43`2U zg^X2ZloQ&GPkKuBlx&w3QKEOYJbn9ucOHjpBNf0ihtu6UO9yB9v%g$R_vVBiyU1b8 z!lRh`?qB_ZZ1+w(4=kEh_KA*1%6Wcu@lCiRO+7AcC{{z4Tg?DiPdV-P-5pV_58 zikS;DJmp|2zVJ7OJ_`Lqy6GBiZYegiugB}yA*(v=;Xh01T58I~*Hx!qSt(U^`kv8T zWHo}C0OOw>{Y#kvkKor1%rDKk?&UQ=8fV5$Ft9A$vw^?sd6j?dcWqtd7rq0az1!B1 z-K)p@!c~Ue$<$R5ZkJnYgc)514c?OnKIOf7D_8?W+4tP(wt941Ogf}O-7^eYDs^`M z@Lp4%%*mK8IQnydn)xNZl!E1f0wU`fQD)2ivkSq&xqk9a#^`ij72R(9;=@a)X5p>w zxyJqTOvNqdVIA#4XhSDxw{B}@27f31c@XkNu3*3^l;_Shj^?rGzMAERz)>NOoW2%|%ivf3_Ef4TD4!sh>>~BxMG@u!gge`XTTOdUc(;qn z^)B>c^u^3S+BGxC8zp+C$L$9cL+EEFwhEUgxP_|(kDtccd>7cdV&VE!t`@4#Q*o`H z$!ENft!^;Wvlz+&kvPfL(PBfmmG`@r4JA5Ok9K%#Cw&X)`lZ~^389x-@l%b^3=Ziq z+OgL)cJi3gh>556dlvMqm%e2fvD)$L8Ht1Gif#VQFN08KOd^dx}E|eCCp}Cos!$RCH9MqaVN5 zo8x>%;pfZdf)P)(@FolFym z^!S7{2vzwc9i@itTt;?(j$E4@s2LmHyXvioldW)I1d(mlOlv0JIEKl@huh8icAH9WqdvZaB|6_1CXR2YcF_vR#Y#cW@~)D!wu>xE)xN!(xT^&gK(I>Q1$Y5Ww-6wIp2B{oPSwp}ko zay_xb?7&~gb~#esIL&<4z4Wj*p;8yTb?m1GT|qAVzPM!p^3d8>0d(2q!`D14^7hBgfeUsS|rR(9JlQZS;S7Nzs?yOu(`Pe z1e0OJ%hSg+4Ixx{E_BzXUax+1H_2|jaTYtx7=tkTtoyX03$8)(vuXz_sprRbP@NdW zLV0+RPd1bGB6Nb;X6!L0iqI?8>yT?rM$Ea@=&YP!=)#q$Oy6r?jE-$XN}!hfVq_rU z;CC2LB3h0<5q@MM!B5d9;^+vTpugMIU-e)7j^5>Q-CpQu;gy@V-04vhRzI6FmQ%uJ z>!c}nxbs1%p`GZO#I)b7@f;uw62XVilYLt!R7a4zp2%DpG4-BnX}lae|3ZDdTBW%> zN?Yk^kpmxux?B$kT-4^Mk)4(kUVYZ;RyOr1JN)QK}$B&4_orc$3w=PX3o@(ho8>hVBJKOIz(?h>qX6yE7Hm@7*EnqGS)N>Su`4j5 zV?M;+x?h-?ACO=k=WZ&`M!U$hT~*X#vr@NpY=_)Fk{|)}I{#N%c&YjNhFLC;q)L{u 
zm%WgL)D`NH1P}2O7a8mM;+{*bqIUJs3nW9KC&*;M`Kv#webQBdguyA2Qh~ zhD2%DWqP}^+lg{)5n-nPv*v~Bt29-=S_{c7Mc(8?4$(NEBL!W=#re-+yoc{|Gg$Rs zRjsmsx*OiJCboBZb%8Vgv?Ab9@@Do6n@fJ~Fmpf7D|AQ^fUy1lA6fU~ zU;rcP8RRQ>!*MAteORZg=aSjxE?_oVHse*cki7Sbb@T|ys~==e7M?nb=L8(Jo9c(6 zpG32!RT^T5pq{AP)?KyUA%Yg3%W+fyrZpYl`A)HkAbR+!>$#&#&x@ldv0Bg^L3SJ{ z$e$dG`n~jVzWi#ak_SLqFjbfe+yU^mmQ4s!VKMDUCX{CH@fYiIz@r%@N)#n7JRvl2 zF;sa)0`H5tIaX`jcO26BJ!a{D_W%NjUCLI|5I6g%%S;sZ_>2l53qWX;tMy4*1DwC zR$L}|$n1BsKo$Sb(6R3F-j-O(|F&B(C2rg}k4BQj3JO9!KO?bp0*q2+!XJ(vjj*(% z(p!n(?&Hp}{+`72xwa>pO5$9%fj1q=+i}@%uDv3lws@7+Q(>BOcp_k9n=^h4dLF86 zj$#OM3YS)#XrEfpJ?HhOGm^^5u*6ki1iKF(Y*~&NtgE)uGfsIv{4evljoa1)B^2;p z{pLB|$%TEg9tCK>fNS(J$r4J0D|39&SM6wYlj^}E(Z^MMqv>WQ zT}#u>Cp)jqkCnA%Ho}F@s33XW^)WTj-us)F{j8bSfoHhPV+KC)AyaukdDW^-NCi3e zwZ{?*#iuAb1UTh4%0HYljDi)-xhPNVX%wc?#^jiJr;s6;F0LBOXS~d^b=rEE0)xqb zLLA&+_ngD7djUL0m_BWcO`qdaZu6u2qFX8-?L9GVScWV&9X1=Xs}{da*YJ4B#dI9~ z{jukZ+Uo~4gb6KA#HdrYY0=h8HZ#vhu%0;~Q}39$PI#qqU~OsZ0cF#=pl~3Cl9shb zNPvEB1hcslfXh1;E&UAnGW!L>f@eSs1ujXJ!L59{inGT~*%)&N!r;aGBLT=goUa2# zR|{!qlt%bRp`JGXiW1haK~Hm*4qHCNM@+<)lp{Tdba-o#Sbfo2*ILQof)1WDxT)?| z+|NOPfXr){Wg)W!6^Hk^EjSXPoIYd=iow|!L})E=jp&2G@rjCFux)xst4e2ciWs~A zfYbkm>5Q*1gf1U;HWru5Fw?^Ve=fjAF1R{Z#9UZ*I!{Zstyh!sr^`LeCQVx<0<6dT zDwqKFbLRf-lAQ@k+)J?{N*tghoBf4~-Me3vU@2oo1$ZfdqZ5dwMMnWxX~MdZIYTyZ ziK6wV#(GU^beVv~`%$76GOfh$(1?_KDANI%*1M*wVY50N+K_wM4}PQA_#m~;c+v_s z(Xv1>c&Kf=S8^0UhCYt$FGjD*9_{6huzAlV{+&I^N)Dn5MAQIr(=_sRlbKFwKTg6@ z?w9f;-H+PfpL}4uO+lNl1!0+6!rx*fd6zUjM=_<-n1JsOB9IN~R4#CrZo+z7N8MtCfTm@#m z0MqLQs56uu!6&yjxU?~~<ZdAiqt-Ekc0M&4sF>9)Uh*u|k) zs-|UOd148ct&&)RY7~2F+n>6%+4HsXmPWJ9OUO%k1fp3w*xuxgUoxGWJn0APw}u#o z2X$D#^X<`pCix5QQBHN)^SKF4?RbnAf>coAoIrlk}=yLu>GhtqZ zIWljwmEe<(SoR1p{?me7>-Tm zbAP0c`QS8Qy>(+_acQ(cg}<&dZ2cF;(wefVYZ*oZlw;JK>x3+7GNZCUAS2=#hEdF2 z+UUZO3xzRssyZ7t4uM>~vepYouKcg_==p4Cax_(qn3mk!3OT9|;yD6%r$8Bz>^6LC zni#@s24 zGh93L?%k^*4BrBMQIC~pNGtNAMO+8!rv>5fAps3OKC+p|bSELoGFv6MeBpA1BGvN@ z8(P}$!ZG5}XYrbFREKyn# z4E=|p0K4er@9)CUbkgY}jFS!)TLV1P)JfYUO8@p+kBI*zM0oIYR$8ff8edqyzp-X`EJZXiES4 z!hgF#dn%yV$$fe{H7O^Y4@rblOii=Iv7NnR&*)d}6PP&V6KMCUd}?j-Phlfx4){JX z70=L!W1MQ-Y{1%Yk8{(L)%v>A+gDHxqVCJRSm6<%4klg<-gw{L>G&I{0qU4Z30{ey zWN{t<-w_3$NQR!gCvqTX3-BTAjg$}n3}uVQxfql&G0E$4qG;Pdto%_aZcJ&jyxi!#(fj`u$chtziI>;%+Umt0&FhFSEPLQDp;d| zJVo2b=!Wvc-v)i96>98F)T z?G$OQz@JC|T3l_w;&ZsKyd|Baz>5H|EB0-n-_OB*0y1w2IfmqaNIrgPmGp|`H8c0U z+?h_er6L~c|3RGg_SJg23ANlsQom|4E8)LS6NKfRzzOCp;R%Yd%;eOmk4}bn$(lfF8(j$a~#osmdNZ>vJAlqPw&h9NiWn;-NLp`*mXspxc=To~%u)WaF`z zVwaW5g6V&z*0Pp`1X&CWM6Y@aYU;9=5cwtmi>x$}x{5C4rlU^(n) zRy7kKkX&Vs=yYWyU1p#A&~ePT&;9o8X9Qbcu1EyF0Q!@L&?UwBKiVdUfjj%#hUtU@ z?qN!-3Y+MID7l3=kD16io4YrC1iG8ohxx^#=@G3paMum#CLivj1`Lwi#RRyx;brsytf6~hiE(RZ!NiZEHF z8QP#TG*4v+j5iI4K?AN$xXXZbhWy>}+ZD}xR&C?j-gZvI^3V?bm~ zw}IJ;iE!hpobxdFQI}QUL~A5V)x$$EjFf!jmqzQ~&NLKhc5ePYZ%8H$CM{~di-9(-{x=qr&+J^uvq6>nl zq&PeA-A@I2w}BAH#@PG_Yf%O{J5sikIb2Zo@9v9_tD`^UnNW&6LhKj)@ao;A%Ju3( zxo58!EO4$NPW+~ISpltZ?D!x$I_lVyJ(}>L3vhyi!EAkg>o?$u>*Pd9{l9qJLw3IG z2c858rtbmOS1*IB|1MM?EGHYhDAi4ghD+aTFyL?t{+QG<#a}RNH zXu?`&cQ)i!?dyoFcIH2T8EoEwZ$Wy+y00Uf$DP{t!{<9Pr(I7jrPztRlC}ma#R+ly zC09F@Csz&qv*+xRvDB_$kdfMXKK15_K+wUM#9SZCyXxK|e4boD>V$K{&aMZZBzm>} zxy(xIwcR)6J%fx1@MX&?A$26ZtwEkQ^+&m^Zts9lOo66LD>g-I23{+}bf3McY3_&r zM2)55d5WWp8nxVkHM_+5r#7#a{j=74t$+5!7t9b#Ej59PGP}8)-dFW|8+gur0jQ54 zoH)pCXChYCHnc{uoW-fi*G#7Q9AMoOQk~T&OuTi;Y`Jfrea))v0YvnP;$hD`#Ej!1 zS&aivXZbHn=j^sthYB(fBI!N{8^*Tz@D|oy(QyCBiSyvSojW%igJIvu+QGYnWEM)l2HckXB{VZ(YHsPPE`a{N)ymBNHmbw=X3)fGsX035Eu?KCF z!d`sRGHF@oMBLXImmrTv=`R25X#(>fTkSsz zDBr)L)`Xw`fNAwJkq1N@ep#o+?2fFRR!sVXeIB;Gt?|2&W8Hf3rFc0qsguB5co51p 
zt}y@w-7hyl^C?jk7tvw-_S+A%z9Z^P82W( zh;V!-?0aA`^{FWv%DFACN9%t9q2VCbRwQfATPT zKssIfMuDjg((Bb&_(`Yf;IE`1^&*2lezr_|7H{uVoi_d(bB#9n(MF)#ZJ!!;Kf1Ki zMRd|qiwge&`>k`bz5&5Sd!OWL5Z*trI(Tq|gm3@L?`{CV0!ts*booQw@eY6l!BQju zx`e*lk>8%4>nPT|Z-}69k)ok~)-|vP7<`yj!gH%5g|bV@6<>$TqD#JVzIk=xLgjTS z#s6$lIkb?E>eEaSlxq^45yfm7J2^Z`M94P;}BB$(K)A!;kd3ZP{9Amdz*sC;L8kzMSyIMf_$IB3k100z*B&|EM z-l~<=ux};}EmT2KFnwt#Cy7Pu?%E|dEr}vHQjqn$_CHFBnV0MxoWJQRu4G@Q8Z@Rs zgeNJ5yy{Kf>P~x+CDR*d_22K^wgkNhMD9g|kjtv>O{Jr4ixk;symZ^S zap5@4+I1~ywWlSJcl54x^}KKF+r0+^$YeIzW8*-afk3*auWJfuZTXV=%wAYOogLe1 zPesb@6YMo%=lS;Hr|jsg1EoXZj=G5Fh>vpmmaZlxGQ}2w-E4&sk?KD*EKKe=(-K*+ z4rq|I2_5Gx7D*`YZXZAwD7;`u`>f{KABhP|l3%;EMi8#hd0aI|KBVXuMvGkI(t0U2 z+k5zZciY990=C8ia4!(KCR`?)HwLW+mvvfxHZm%49FO|ga&}#L5jT)BT)3bgp|8G6 z@%?y|Qu{9?;>$%bAmTbaKzvH=7@2EXh?S#Vr=*62!6w0|42=Ef z`1u!s6Ku~It(3Yzw|M97FV5JiRZH3%%v&is;xvH{x&Htfz5LgUP?_<)MX1bV3M^8u z`IP?=HJ|`hS_|xO#o3=;k6Tqe|DFphvvXa`q?yPO@nTPP zjRSV~eH3*ea=6k}^5GNV&F=3JWO_Wst)@43qgets zhgc;#kjwkjwmTKvBlqr5ihH`XU~p=puTD^#Km0JP!rNPpTZ0Siaf{K=gK_1F_JYlU z22z4s^TF1xhMQ0Xyo!*rc*@*zk)C)7#KvUaKXx?p2AJXiGPO3m^Pd#~04f>)NPXn{ z$KuhWul*Y3s$k~4vn`Bu?Re&PaCuXfUpkIY*KzEIEdP@Fw(kBo;*k#sg0zZo=r*4M zxL&QuccpH`q>;}pan9342iWzl3<*kqCZ5Eo){Rafl^&X3i}DQY?E>RA&BkJEX`;N7 zep6IA^d5N(e@CZ|Z>b0!KZTCV$$Ue(n7uiAIT{LuHCjej=>mJQdwVZ0JmhzJa&3_L z9Muc1-evU(0n8hrL~#e9!S80?_4Cg=m%Q-pve>xAm~vd4LsRU?OoeI|5npYg`jF`P&63y{s&t>Jg?tTT#dEOj;Oj23J#DP=&a}7U2FE!6{%RK;jXv#k)PMpN4gR49Ngr5h#>?D|X1 zq8>CGz4kJz$=xrM9S-(qtr#hYJO49O*OZnz%6%Aci%_|(w0~&uYoJy_qzvopnl<_p z{c68fKK<-xK#uBMzH770?WL0c5n?!sXk4UB66H9~CxO&DzwpOoD?5FsUqW(|QLe_x@uYHhe%iLq*p1wTrJkaT>C^z5L;Q_jOC_+qVA%6aHeqvdL^m z^0mVQug`O%{&r6erlIO8z9tSXQExcvB}i|Y_CXWUu8jS|Qx5vAx4iM>Yw)RF9b#wx z5$?SjI;%~L>Rb1k{;7WH@K)fFID$*ny2TORatm$fx5}8Q+4C~=fIhc+pz+qWSauxf z1X8^D+qnGb?vzmHd%C})gA>!hV_Mf~08?eU@KfzEe)xM@EJ&ZLXvnaC+kE)L-*^jF zUGXvY?qlZZMKdR@J&MaqaMmG&;amt)x7O!6t-DedZPr(>7jw^3EjgSz^QkAZcY!XU z?(rsO=*pt=LEb#WNd!!8BUE2YqDAht2P_fel z=?O(qP!W(9rB@>&lF(b|zDkp>0wPsuN)5deloEOiB@lWKE%Xuye3N&defB;-K7Tld zsAD86S!>SuJoj^5wfnZ@wdpRLE|e5EFyffGr8#$8B$}9Ab1`@=W5DdJ?dE-8iyCOz5ue&i0usy1J)eZxP! zwT(7ETKQ%}|HgO8gf(q_KfagMN2IBxiZ)BpsV9Zj!7JV}o^wPTwf?34W?q$|8_~ z^3$JR>aeEx)5EudGn3Q-9_1yEv`=(o?!duyu?)2r{G}^rg4y^>6T|$FMb4WlU?zAH z8Eg@v#$SqLIr$6R#<&wrsWT)_gi|-_@kahQ(aN+y`G#Z9o{1opj-O672bBXTgfgBp zab`qF={BE<6R%~vGvGES=X%u-f7qZ{&)D6%w^1XAo2%KVscv4_F0y5W)^UZBXB-}s zq4jcZA;KZpP#+Djz39l_>oM z`K|EXmA>B%mi0|(&^73tGFlT8f38;fqY0W`CE*%fLxew}cb&8P;^ha52OCRUsElko zUl~gMSEpW4PrlE!V9-hbe2)~F>KLIooBbX&8hpA^@8S!;D_zftefNyO++XyFkDl7i zYbzcM8$vEAKNV<+RTTEnQN&t7nOR%E z$3i?eeyF4sNDD175)+>M#&A_6&+W`&2^ipj@j7xOlas^F|Q5$(MG(|2N?-(`79^nHBpY7l52s1)B}!J z+{L@*m#_I<^Qo@-#G4WOH6s&GWE2dFjc4mjs4rZ z@jM%KLZc&}VXQ;VlwV=8DO_#R7ZUnz!x(7YwC2=}ZiSfAGdbCx0O$o_!8_N^!(N%p z1YjbZj)!Hr@#2!NT2+I5yk<+>UKXK4AMd7v`kqiYP^c@sNe4PohWXoY4>OisfY9lZb%_6Y zLa+q@^r*LJ>GPItKOGPeQIoQCFYT$sJTm6fuU@y(p7VcNcmczwfD%t~_l};SJZmJfZesk}*&09%Y*C>^dDp>4;4+So1hQl_r}jcr=^q^l(O&)7*{}3a;2Rc3I{m8LzSCbRH6tgQE}{h{{o%c~m4xXATW?zPgQBBP zqOG0QbYVufm@n#~hHU0HOl#eREVm*Y@K!-&#qz<30AGqcR9&C%f1p`Yk=+(cO zgIz;$2u^0RTg;0h0ODXxQA=9LdMQNnIBZs~SNi*p`ew!oVfoMmkc)Gl)2khI#HHiD z49aCtJ2pF5Vm%b4nx6EWw;XxEKT9|)dqx+wkuMDYS?HGS^(9PzG^5wbmN{J1L~4`8 zdKL(M9PuQPnvT~s^xi1J8)47fV_83Gnzo5TZ!vRKSdf(h#v+ei_vLJ5x0GdE%NR@Y zxjS3BwK==`^w_1(-@30HX7=>g;xFWreSf9>$>gK&YLt9oCkDkQ^4krm^TX;s>3lq! 
zJn~lv*Y=^a!h_v`KK&}(3fnomiw47)YldVF-`+39&GMa4RCqiZvFn+}yd8TuGwJE& zkejQim-`g1R^o3Fk$xpux zIS!U|wvzfpWnR0ZP$89Wn)m4JbHIL zJCOXPP(^fe{Af^mWB%2j(-1C}HQHszXXjMY*R)S8c-VBFoRur**&3N22+tg=riVSf z_{F|!-hXD=dwV4TqT$=NAQt_YT1xZp5b9rL9tN^gTBGyZW8-1S7e9-wirlV6WN94S zOwCU8oYNUbZ>bbH8h%&hUzahW%aWIhF>7fU+1@L7<~!V5gzE1BMsRhU9(UM^$mp)P zD9-^pN}y|A81n>YWOUdqnYr9Cs+*qg!CUX&m!Fv@)R!#5bC7MookT0JIq{wSoMZZ0 zkP^R!zrf+c{i6u0j-`T|&#c?Ba=%4HEPB6)cumqK7QJ>!Fj0-hR+H6vrG-*w#t8?m zkzVRYVYB<5__ZkOiJ$`OmV(bZ*zPR5^%1B*(eDo#GgaCE7}y5krpAjV$M9qR;<*6R zUGMPRc?=)T+I89u^gzkH!7oVK_l7o?W$wOhp<#dmjsr9;J}|*Xfe@_cT*)w`g3(#*^wwIjCr4l zS6a4i7{l$NL&m6%Tu50twrXZxKk#L$x<+yC^RI}A>6opYt<5rfag;F2&i7aA+R||n zv4L^7+5mdt_PA@Anx3b0wwhq{ZogYm3s4THS#fuaP3PW7R#*!DB9u73D#^V6Oj5O=Dl-({u`9{N9q2qhom}-xWIfH#VU8-1%U;7${>t!57qeCeQ&NH$pL9UF##rZgc1ebf3+W{#G15-v zUBJ|Dd|hnGJnr8GY4NZ~6f6m)&xqCTjd6Y3G@el1b^VyrXB1lhzH&z7>EHGj6i~lF zL;48**T15U|M=^U9T@J&zWxi6GgZH{a+V-ar^`f$CM;RRZ}Lc58~XhYgDg>NciQ29 zI#A(RR;lo!H>vaigqUX>6Ou~K7!gbS+*`Na0mpYLy*OmzDS84tc>n}-2F+Jwcy6%d z1%F#QoCfFxco;-z~-DLuNwrPGB^_6GneJ+tDGq&7DajYQUrNp%Q>U6jkAwt?EU zN@g#W{YZ8m<1t6}k30PxCna)`dQR@TSd(M0z_H4Cu@~lPf=8=bzl~%>3Z8%9e{;%^ z)8*U~>hV8(7yK%svdG~C+!zoAqs?Sxow(&p8upQ5DA67q$=eT9OgY|(H(Tl#4mTf|8|7aUs9_=TEFiD zGE!?_v*$1MB4@_?R0HAP==BK(|3wG-_Xk;`Y`K0FsH)whD0@F4p5&q(#NN^+$}h2b zo>afz)cB^V+PWW#GgNHAdC3wEutpGF$0RDerZ9i02({t<0R(YF*H6U0*t*h3XYR(l z+?$csU#S|vxlKg?ja@QMb_LU!yJzkB=u~WYn62KUQ{z?6Io{$3ZFhWAI-C+kstw{? z+dFfzup*}l7(ro)l4Dg#n<=r%D5busa+dYsWoGxD@2}%o!wwIs5}p2%(T9>Vo)%U5 z2$5SBjBr|NCsL4q`4lOUTY5AD5@URDd#{u-I9E82Y}Vx0-+t-P%6l>QnQn~Bf4nsM z_eB-9z^1YdHreVZR!;8++%$A_thQf}p-3EOi#nfoNLSL~@mB`YaHc$2dz(A|-M+P* zYMyis+2&VJdY`aKDq;^QT)$FY8yyH&c$JpeWUrwln)S+2c^5AbV~KTNUD4Ggj?Q@j z2yVL6plSXAw9q<%JiwK2xfpnP+Cb?`e9|@zpLvc9E}3lCTiBMlWLqu8=d}=JFt3}! z_QPhTKR)Rf@gTF8wJG{2yUEbyLC*+b`7c$93t%r~07!(mDBvCRIZ@=ymHAWQ*E8<5 zJq0T90J$o;ZSfdyr6)`TYhLuK$U|}W1A(?`^HtDP37}j2&vdz*AEwz;D2H9d65h~0V1XFIqT-1 zv$(pR{&e^XM88S_p;poGi<$CiGl;A7?78m>vEkdGN|`BA^L)!7Rb<(YBK_*Cc#al* zrkRMisEC5|+d;0*!aj3#-p#37GBM&}SsMZ|J9tmb7D!b#t?Y}^oiItJsV{9@PBe3J z7b-2TjNZ5OX`xjSy?<^Rm;N|RBIL$)IT(e;0g45i%v?4W_kisIRO0?uQ}+Z~zNLaQqt$PlC|s)U$uLS&VS(OWDPH2YXqV(M=Xh;fJ91p7BKUQ$ zlt8-J#Ta>oRYu1agUfSRnmfS}$h_HcvoH4nWPytp&|6lit?j#VAsFM=mIpL14Nj`@asJ+P8?oG#^jL1N6|5Bd z6y8%aKX#yIB;dRdWxYL~ci!Ox4^I3?$>{ZXWHgJy)Ysw;SLj1eK;|x(lV8ewU)0_V zCs9~JjpS|a%qj6lMMWh}7n;#SY?|0>;EXKK`8|7)>^Ei~)c}SF*<`49`;wa! 
zO}2=7;~4;=n(ld7Yp|EF)z4#-3mtTm@;*)z2*uXy3@NR|&36{;+fimWB4}9Tg(1~R zl$XMlD`eulspjCgvr*aVZdIoTZ)5sR=!_i3X$vpuF8L4n&5ISCcw(^%1z3%lg!$Wm z)tEciQt#>oOwO<4?_aqDP^R-1*#cX?;v2)X;V&>;Hi&XQ*RkBiSncyVSRjsckxM`w zH0WqX5UTVN3sev7jKzvZK3ZdFSUO7U&R#y>feikbd=IYo_&s6qgjuEBK0q}<4azAZ zz(WFf$WYb1VLzWK}>Wb2rb`^Hx->0m)@3pd0Ziq%rcyQqLziY8+H&GzcqFbxEKOF}>8YRKb7l%a)}w=}aOYv`0vAj&f;5feG&@#I^qovU z!8ZiU#j*TmymAAe5lq&^F;2x2hX6a(WIYP>yxkSBrI0WFAluou1#~+==83vw;xp?q z8vnTWOX)C_z?PYaG6jR}o71Hf!fLh5a2W;(?v{TYX#v5(u$ZXEIv~OHE=y+IRnKEH zkrn6WOP=4tf9`6!OG-PdJVJc{#Ep*yd^RjSmPR9bvB}LI{_aZP($p4O`cWw zf83b|CcFOc^f%%-4esb4D(tt16a>|^sU`cc*WFHSJvcpdsd^{C=m!%?i82*m?N==Y zA)vTT-M17{qTK4T?W*I)C=*4_3zKSrzk@NZAQkq04NNrw`iyO+ST9KXa#Ft6bjk1j zfQ)*ImV168YZ|$f0Ckbphjnu666ryF?OrRdbrqj$Fm1K7k8jn02Vg6#Y69D=1DQuNVD3_(L!GKL8gB1I*vzRJXfFE!byVt90@@iL`xOQ9<=l_!Gor5Il znSrX2V%O#gFoV1==97pW+#!a*I!6XZdL|}S>D;wxNMi?UMJMQs8F8c0x}5R+YI4Aq z-LX((!rRX-=GX~I=XQy+e^P+a3MK<61}0$d?PrvcFdtd)kB>Fi9l`Eg51-$Yw4Z91 zeAT{z1on}BPigb)&W@#AfrU{Z^tegMKfaN!E{c%{mEJeLhri}~>mwA{`2*hvGBKx= z!bjRA?eX~H-r^UVUKCsT*{z~ffmszyVMOggy&0HNCSn4Qqz2-njn+ROG^WE-fdj}D zxFy0b3LML54Jv)}Ch7~C6JSW+TL2kX?wqqNUuBM*2qIc91kFzQ!oPGhzTc)G;tO7k zqmDnLdtAqmUES$(k4q&Y&#|>^?)(YKZzd{g-qkg_D^jr4P&{~qowplno!-G69=MY4 z5PIE4HkB79kMOUfj|9G*c(ZPL8V^uT{0^7TkM%57yCOHx%&Eq%7GOA-LiFwG?BG1C}3U^$4V`VbppX1C(_%s@pnXE2Hzf7lH@}#VV7$6$W_jVk253rGRZ~gc#Br#f>Uv($k*Wp5YSHm;Vz%88m+8HY5nyU?&~ox>lWkBR?!T{G#+bzvoO-@DRd9np3g{wycd%S+#tD;3`YVR`Y8f(*!`G+j|}ru(|K$q!_F4J*Qyfw5Wzja#Rk#~;= z%X;EUpGr>)2L(pqM;#KKwPHd}!pYKO9-@mjWStmf!GuJUysyB`|{Oq9K+-S2gUaH!a+?c`?=Mf$hL8=gqcu zcOFNDW+wGZFZd;>&7Le#{SHUhiDZqT+6fB+SeokLWz-10Cw$Wxvxiw$3b^Q6S-B;w z(&4_@wDaE`;+JAIbe!bG9$Bw3piK~aea?V2mgY8gaIH)$T;Smh<}(4m-eqj576{9zEf+J>-Ed z>wx1gsNoZww;XN++;auKxd3f!Ik_>PgH`aO_N#ZDIJjv*l%wz=inMlae>R^p0yfYZ z5?r?N<34w&WQfYX_;7WZ4vRhQVtZTa1+lS2msNj3 zIc|ioOzOn5HmgGO&milKm~ZNmbBEmgxkf>b!O(+}yW5+E(Os9_Yw zsa9XzR$v><_a7tG9$?q%)-fck#@7oJhyD_oowj;6(*~&PtW;(irE~Q53Tl{x_|wMi ziIG6H&+zOAjb8CVQiAUIRk3r>4N;4l*mB!ZE19;?7eVbx6F{S{jSA3)z->8qI8~CKHb`!~#;5BS8YFhFN_h&hMaO`91j#ZNP zz0ema+E5NXsqKg-X3G^_M`!1?e1T}EF`85FJX#k+KIj$7(*P>K){B?VZHZ8vr#s59 zQA3=j9ty~cj-B=Dg7$BH|X1>}hoFOK^wQ zOLv7Z`-v#yjk5%Z&!>SL!j8qxn)gT3xCIb)>_|lDid%!J#tQ+T?dvq9zmkTW=7s?p zL6Xn&@K#UemZy{H?W`?dC$#SViF*7*5BgFt`7^}tI0~B-{@G#5XnJV3`UBCsT4}HO z{CZyR`9$KM5HT&S*)8XY&pe0&Y?8b5)@L+p2<3H4@p^_)20gzAOnMJbm7tVBUWy?m z#1{HILzg<$Z^=yC{!cw5H2Z&dkL2&f{JFoy>7ez2SZhK3Ha0_ehs(5VA>JT6r_%6Z z@~H{MgFo5CzY+FB;%*He#T!4{68KynpLoLk%tC@{|6I7kztRqEy6Sq{4#H34I*M{_ zr5C8S&97a}2nwkj4X;u)5G|Z!rk=2VoyNY%B_SNXGUh^hPX+ram=7^kDH}uCT@nc(U-rBsxak7|QPkQ*dKe zws9<#_VLbSZ_jiji>X**Z1M_<%wZke7?2jntEtYZ=xHv$+{)J*L!T30+FCrV&D&V^ zp1urjp{wH?^whwddQYUYxfKky@-Yh4Jp+QEa^K;&d1pJiFaDR%Z{qn}!~N44vF^(N zDv7n8 zTX9YSw?DVQ*@i|7s5~JykV~WTzNhXu4}p$^_Vh3WPpyK&=k)@%iMI7xIs0GqC!VA> z*bew^k0yceP^RTKXh81KcQ-Ai&1wdd#%ap8rPs6&J9riFdsnYi?ABUXAqz)qt?MQd zwC08~cHbIcKD`5hjJqAi-nm=0ns6OLxhQxNeN3jt>n3G@1__Nh0dzytE7=AyG;0WdEE9 zy-@p8Vdf%<#!dNK&f?t{etO`~VpWrOECqGR7&-kVy##~xmqhv-x7tZqeqTp-6d{sk zfjmdFGGkHRZP4j_S~lij2ZE}ksZ|=-O6{PJMO9^8`$d`{DY53jyUl=~;-9!UbJ1uj z!)WzUvM=SpzVha_&}QFbx#NE&J^s-qd-R$y>l=k8Cz&Hl7|fJFF+cfA1C`i--SJv% zxiFow`>d4Asb^*;rmz$J}-M_|KG@OS#y1By_L3QazLmR^0u}8r4EO~d9{a-u36XR z*W)5@KWK$aouK%bEWTg*P@r$|#I$eLU$4lHr)AfcPH}GU^OY~O@cq=TT1~wFOM}?9mMvKIlEItO)6p2ka<-hK!c0Gh~#CWj8c_Vl(2Q$6*S_a(vte z_0!}B+LoFE?n~7!`IvA$C*N19-P_jk7E#sY zsomy?*W~Qr^hSl5Na=%0@wVm_w=n_vGJqSU(Eh8MrNT+K6!2EjN zCptG8P>wkLe3csEG`7MT+@0m6=W+_K%U?P>gh20v=7^!LRYvyHk-@UW=f(!O0S}dE z90WYBE)5iX#`*Q&aFLyl4Fp&CC^4D+Zivx;mD2 zNfxrHRWa2W6MIVs;!nzTl=x|L=LcDt;zOZp6ZZu#6*YE94R$@*9XWpF 
zu@-d$2YfFvOJ<{`gk{3Ru(i{t%mjBkcAzU9$U27A>|Ae|n`P(yUU+W&O_L|Py0}+~ zFC&jya*z|m70rp^YYC&k!Y6or1rYrrW4O@H!*4!!;NC|la<2;~yOZ~;zsgE+?4G!k zW_q3&xxO#iU5?Ww*hd#P`Yfdh<`0RBKwXBMTTR&Yq2|FB*84AFv;xfvROAmPy>SIH ztDkn1?3RjmO7G%+_P`g_n>emS$(h&XA%3h)N!Id}ZPgSQIzR6!w0WyU6e2&*QK|Ejoq{OxN^FU1{iGh?S$fId zyKR-|bPu+v&^uUthsdB-Otj^-gk{YR2~4D9>_qxVXi-lsEPzv_CL)yI;eztqs4dv4 ziHwTNR@<5RtmMksM$u2KRj19j{BDXzoONo6C;40u*V^7a8St-q+K|9pSNgD9r0%&N zFcTrgF_2{t+5hlDkQB3d^?(YWH;@s@FjoMxi!hM``jr^bJ~2P}O_IbA3rz~mobhVw zQh^`16TtKRwl{ElPxdTGlwVGbd8{507bPDM(dBWl(_JLWHTssoCpcH$fmiJsL7nHv zrro2$mx>37uo60V-0@RZAY}@ym4Cg!QsY@zL91^m6dP9_Cxer(# z*?6u0eKaxMRWF@t1EphSwbQxLq-z9UalbA`nY+-ZS+$(BO^TgN95$=U={>G_UlJY8 zb*_>|zl!{A$@zWeLs67R6~osg`tm1QnGBfO3v}o~udbt7g|@89BGxm$8~P%?p2XAfR^-&S>yZ<$N=?QyVVm7!?c~c5}(_DIb>KL`M$7V1LJ|&ZIFP* zp6L^nH1<;j81248=CsZ%W#UFk5p!@N^O_fE*-$EJ1#T#Jr zFN8JG5BcjyEa3zMMh8^klxX*A@pg0%d?SNT{GN%9hAttJJRsNt2@9MrWKmrH83b=H zvdVjS;MIH4d_|QR2*Hbf8=4lY1XjHBL42z{YPFt+%RE*yDbtZZZlT}?=yN$O*iP;T zmZ(PRfw8YH@77)ChUPQXHn}_jKx9)kyKeZ&>+6jHr%AA}BIf{DwGlkm1(%(V`8>8i z@p+1_8@3&w$B6!#C^ z#I1Oe$y=LZm?dwObl`gz2@ttcOpTxi9p|?`kWy+qsCktpfoGXKff29z=tF^!8x~zs z@svbulkAaKDcqm2c~`9Z+ysc)kDM=!Rm9!!D_mSSJY$hMSpeVrAnDplN1VKoq$S`> zwk+%PUP@#Y{J{P!hkLI-p*T(*tBZ7c^Z|5RI^%vR@RzciL!npiXYYg1G+hJ)pLY&b zW;lPjJ|OcNsG{h8tF;^`(aHpq=+|;<=PJcnhAd!1n1gAh#$QtFn4x0L&e09jn(YjKr_r2SiEj1ft zf^|v%;2GYi4)gls7(gGi^D>%CV$aMww?k2hP-MT1cVBcoRe*X^|H$)8@%?YsG&fui zNhrAY37)_oxKYGMCxw~lH+0>P!p;C?GbN$^Y%c5S`PU1#co z{Ku^#vqSk6--+*3b4buuSw9xXCSJhKq0FdRsd51}-g-ul!@kwQ<`_Kfzex{yW!p|P zLS>o&5$jyF3I)8w;}`Ps^DTlQgP@qct}_Z-(*G&}p*eD6vj3%6chAxwEhwz5nc#Ym zNQ2<8av`_BwWtK$o50^g%LfmGiz8Bhx{FX;$#L@;AhoIEF36es&7#xu?XJRZn2(V6 zgj^RlMeZ8!PIwR1Y~!~Q2&Y)6%!i}dg5S+_DWvkY)q+QOx8wIep6)MjqRM6On?YW$ z4c`7aV1}0NW#I zKwi{#&^#<vfEehFF}NS-0_-ryyik(zXda# z10U#j%j5j3HGl{X9klH;oy+>_;BDg8QezCm=v!ePCxiv>Eopq%3*I*{bz~%P7o*qi z#Txs#w5QK$?QlW!-8*H3s-kXF#>=ck*!sQDK)v8XQ`mdibv_noTYGPM?zialI*HYA z?5du!Pr!w?mrk9Gg2bO{MbAZwai^Ohj_gVs2Z$u!owms76ZvV);V{%1m_8n&_AVvA z9QE01(JQD(5#9+3`}2pi%QB#u5KWxR8~^AdQ9x&7lLZnNAaD?vA)uh&_OtMk6Jj;I zN+9UjDGs|`OPa+dqtzfb23t!P*eWFg6O?#hP&URM0v#)cZ@%<RT^i3`1tDvwis5&Ei;o7L@{DPk$Zy5{P)Y^p|X%O1_|r3_TpdFZxnac>?(ay zJla|+_sRu!$2<2?qmM|1nMx7);a}*Mb?euU@W^hB#oKwLW8g)Fyb`4AMdF<~HCqhcYFkQZje#Ns#OKJD`O5RnY z!8BX=?J<`wnsr1oTKjD0gzq8gi)i`YN3Ny!fnf}&%H!{HHIp@hcW2f>H}kYXiEPo@ zRO9hnm4GV4mmlhH$v(dfb6C~sFFNZl(56&q$c@DwlS#*J?n)sMH|cWAORKBhw>ix- zg@4o>P?V0Gj3@Tm%#N>;S3^NQR_ zog{f~Nr$)=>)>m#EE9+0$9f-`4a{>D*>#^J%U^qexV96C0Sw(w2!eOCst88Y(CgQv zb=_a;*aNkV5lvrcqJ%H+ZL-DFMXtnybPaI?Dj#@J09@}%ouMdSJRdNE4MojQMCtUV zoP{M*`EPcm9@+mr={8Qe)ieV(*s*z4#M&S!uX%-CwemaDx=i@SSY`Ee%<+)e%e!Az zR>!D$mtgao#k{33`%fJ{Zb?S@<%?v7*q+$V+$~?)flne%EYC{9pVu$a0_S{?5f2M~*YlHr!C%(m*Q<1X`dFA>l9EJQZE1N)|&z5=aC{iuZ zwt|u*d+zip+TLKdITCfL#K+PhpZPmqtIsEQ{s=E%w>Z7w$wHPVr~Aqd8jdZI_}e ztVr|kD>bk$f*k8T*hDT8N;ibD$!{k&wJYAPLZebnjM{x?t@)M(=b8N6?#S<{y-W4Rdh+E!MC&;GYGF;cxuo!+b( zIXs^uf*UsKdYTBw4v5=uUWwv{oGjG|#0Ga&jt}!_Sx4?9|0MOGG1Pd+rrONI0mL)x zZW9?4)6PS_AGcU@M_47hX#@Gd-8pw%+^*X2SC)zXnLorRz>obt)}p9M_}AWP-Ty*MrO`@*-qehyYjxXzhzylylYsI-0-m9IQ7OJq*_iWhYNsc)Q`W<>BgiiA%7Y3o8G08Qp!lIG z6$zhBFoTqy2irS(CfX5ZBGBBCT#etMe|<9ZW?9ttmjhLCKX zxcB0HrP#e-_FMtHad;c!Zr_`TXtS}>%mmlhJc_R!4c~5sf)H1lr28Y(Z z<{f*yF(SXo4iA@Z#E%I*?U0DFteZO&8Bzq@`goe3mi41a$ho${m8bMKGsPp49!LXm zqM$trfkm_KY=47C7&@iLQ>Toi+CHF5iv4;Xc&mh5$c zYa|;TAD`e9r#tk`GujDM)23^~O<4F7J{q<5dt?2|E37K7z`(#~Mo6U977eQy=g?C_ zSN{<=r^SdXgQ6Fg_b;LeU+pv)14A7uf8(X+h#%gz>rpQ6l-bKbR}0mSd{_619-8g; zF-z~x;A|7Man-+g)T3)fH!FQkHN*=+#5Mb6k&`A}au0opT9n0urg?Ia*`@=hIYApj 
z;J8;EM9&MJ)VN3nP=R`C>C@-E{P7oj>h_CAA2?83lxO^<=ZC$C_~PP*Aof2Tj#_?- zJ@jPUESfxg?MFUl-v@!0E^^wca#o)Cp4gW04z9;o=W32kao1$=@WClf)qwnqYu*LW zsRw4O+sxcUwD`Flgo4(1ggPIR2B8hQg5qAkU*dJ)Ic;=m`=o0zJV`z!GaJ9)&3kB1 z9^BIA%eMW}L$QWuGiP0=5b>2tlv~W`oyhu#kLs5Gf_osh!PM2yu4kC;nV_x~D`a#O z1dH}+7ibmsavv=6f-%;=aDVWS^n80`1#!@~JM6EAYEgL<`6QwvFS9V6y=vbron@Q? zpK>w!PgGIh{n1-duI!V|5V^MbhxLPb{95}o(Xyvs*hE8#zc*9Vd-`-!cI7HgKn7gW zn;h$fZCY`RKaoOVT+so}z%dE*=I72`h7fc&+fF?Lx;v4}XASA-)PU5T#V_|H(in}( z0qG!#Xsg+pqph|ohvBTd^X>!Cb!t(+m4=1PIU-rT6=dx0C9a$f_QU=bjMd@;dPgJw z2cVWt^*eV{Z`J^Xj{&OLLDWLf>A34ss9g%1!KDAv*!zB;D%XU!sY|1eyUQ+uE&9lQ zqLuD9X+d|lGg>Ai1<$0jzNIG%@W{Q?=ehG5^laMNTb-RwVr&L;rCYe*|1Puv$JIYU zN2Opj&AX($Z$1g9x~D~6(3JS{ovOQ|crbgJa2rs>_>lC7%y}=6j*e{UlytvX=jw4} z^yq``=7my=`#iF>8)zi^n;hl3Gw6p!$sL;Y@6mB;aJviZ7F4D1N|4W5YbF#Gfqw+7 zXW#&;4SHVlfb5E7zITrt9^B&L-bR8A| zPRha}iP4d5qaLl88qf#)cha=~Iga)5f8LK&za-|Lqr)jQ5_psS(jW*cXR1;pq*)>Y z*e~yig@60(*LuG4{uSZD+DHcY8FM4bIEa-5bJ_zTeKzGsCkz4A(|E5M4X*np7Z7Z+ z;5D`06Xt+y(qs)t=|zTto#9`>(Ej2#I{y`1mJ(qIW2<+quw^C0h;6m*Q~3Z}DPL^3 zC8(EC;lu6DM+k-luC#+&kfn~}5>UZ$EiEV<8BRNBvKnDYMzcM;Ab4Y>a;2RPL2x1J zAHWdVKEGfH0&t;bJoK;pJv{r`va;eNh%D@8L?%l7=S?eJ{r87>@=4;T&!HU|>Rfxb z7VqavLrUv`F7r2Nme7qp1J8pM0Lxq$n6Q9WhVg@dCDk4Mr0u7JzGv!U0+E1%s%1oSC*hZ=UnO-tPDHfd3*NL5t1>uSF30J|O?=V@&zie7@~GaESO%f8$~NTXgBk z(c+);#V__Vb*}q;1?^jFrbu-rLZ^eZITgZaCm>gO7EEO4g`~cwDP0M&U^HK0A`PEG zH<~TMSXPYlz>6wUxl$|@fCS6e(xl{Ver$U9Sp!$qVtRD${VS5D@&hbvPOgX5 zzpsZcxE_W7zw4nB@*IWN1(z1wRWD(P8yw(Mfeaz)bT9V0g5vn_kP8vNFuQS_OATe; zYpE}VKLsgNtI}%$NZJ4&GcZtos(To~LIg9)n9(n=XV8fpg*cBy8}el^zWYC?DPWNO z&xHWmX?pfLX^_z#r4j$bc*-2AFi&9V&b{k?5oY^j@#`Ih<8koEBB20eyvmhOgKI9K z)?MjvNM^aSFu24t_%o*g)|byl4WydO2=%3q(!0Hp;P)3w<<5w1dp{)Kxk>C(5WXJ0 zL+O|?h_n5tLiia}2u_Aw9S$qL~*Ofc+}S5N-;5rJb>PsJSzNKVuJl)G<2 RhJas>A8J0xyKnyY{{yymox%VB literal 0 HcmV?d00001 diff --git a/docs/design_docs/cholesky/cholesky.md b/docs/design_docs/cholesky/cholesky.md index 66fda5836..6d78bedec 100644 --- a/docs/design_docs/cholesky/cholesky.md +++ b/docs/design_docs/cholesky/cholesky.md @@ -1,6 +1,16 @@ # MLU Cholesky 分解实现方案 +- #### 文档基本信息 +| 算子名称 | cholesky | +| --------- | -------------------------------------| +| 编制人/日期 | 曹文轩/2024-07-21 | + +- #### 修改记录 + +| 版本号| 修订人 | 修订日期 | 修订描述 | +| ----- | ------ | ------- | ------- | +| V1.0 | 曹文轩 | 2024-07-21 | 首次提交 | ## 1 Cholesky分解算法介绍 @@ -63,8 +73,75 @@ l_{11}l_{41} & l_{21}l_{41} + l_{22}l_{42} & l_{31}l_{41} + l_{32}l_{42} + l_{33 根据上式不难看出,每个$`a_{i,j}`$等于由$`l_{i,j}`$和$`L`$矩阵的其它元素组成的多项式,例如$`a_{32}=l_{21}l_{31}+l_{32}l_{22}`$,并且多项式中只有一个项包含了$`l_{i,j}`$($`a_{32}`$等价的多项式中只有$`l_{22}l_{32}`$这一项),包含了$`l_{i,j}`$的项另一个因子都为对角线元素,因此为了计算$`l_{i,j}`$,可以由$`a_{i,j}`$减去不包含$`l_{i,j}`$的其它项然后除以对角线元素,这样就能算出每个$`l_{i,j}`$。 -## 2 Cholesky分解实现 +## 2 需求分析 + +### 2.1 算子需求分析 +| 算子功能简介 | 对厄密特矩阵进行Cholesky分解 | +| :----------------------------------------------------------: | :--------------------------: | +| 需求来源 | pytorch | +| 应用网络 | - | +| 输入数据类型 | float/complex float | +| 输入Shape | [batch,N,N]/[N,N] | +| 输入Layout | input/output:ARRAY | +| 输出数据类型 | float/complex float | +| 输出Shape | [batch,N,N]/[N,N] | +| 输出Layout | ARRAY | +| 模式 | 无 | +| 是否含有 dim/axis 等类似语义的参数且该参数支持负数/其他特殊处理 | 无 | +| 是否含有 labels/index 等类似语义的参数且该参数支持负数/界外情况/其他特殊处理 | 无 | +| 是否需要支持原位 | 是 | +| 是否需要支持stride机制 | 否 | +| 是否需要支持广播 | 否 | +| 0元素检查是否直接返回 | 无 | +| 其他特殊需求 | 无 | +| 本次开发优先支持的规模/模式 | batch<=32,N<=3000 | + +### 2.2 算子功能和应用场景描述 + +厄密特矩阵,又称自伴随矩阵,是共轭对称的方阵。 + +对正定厄密特矩阵$`A`$进行Cholesky分解,即求矩阵$`L`$使下式成立: +```math +A=LL^* +``` +其中,$`L`$是一个下三角矩阵且对角元素均为正实数,$`L^*`$表示$`L`$的共轭转置,是一个上三角矩阵。当$`A`$是一个实数矩阵时,Cholesky分解可以改写为 
+```math +A=LL^T +``` + +### 2.3 算子输入输出参数要求 + +| 参数 | 语义 | 类型 | 支持类型 | 物理布局 | 规模限制 | +| :---------: | :------------: | :--: | :------------------: | :---------: | :---------------: | +| handle | | 句柄 | | / | 无 | +| input_desc | 矩阵描述符 | 输入 |float、complex float | | | +| d_input | 输入矩阵 | 输入 | | [batch,N,N]/[N,N] | batch<=32,N<=3000 | +| output_desc | 输出矩阵描述符 | 输入 | float、complex float | | | +| d_output | 输出矩阵 | 输出 | | [batch,N,N] /[N,N]| | +| upper | 上三角/下三角 | 输入 | bool | | | + +### 2.4 算子限制 + +| 限制类型 | 详细说明 | +| -----------| ------------------------------------------------------------ | +| 数据类型限制 | 输入输出矩阵的类型必须是float32或者complex类型 | +| 布局限制 | 输入输出矩阵均为array | +| 规模限制 | batch <= 32, N<= 3000 | +| 功能限制 | 无 | +| 数据范围限制 | 张量的最后两个维度数目需相同,upper为bool类型变量| +| 原位限制 | 仅支持原位 | +| stride限制 | 不支持stride机制 | +| 广播限制 | 不支持广播 | + +### 2.5 验收标准 + +一方面输出结果的动态阈值 diff1, diff2, diff3_2 精度验收通过,另一方面使用输出结果 out 还原后的矩阵和原始输入矩阵 A 的动态阈值 diff1, diff2, diff3_2 精度验收通过,并且当 upper=False 时,out 为下三角矩阵;upper=True 时,out 为上三角矩阵。 + + +## 3 Cholesky分解实现 + +### 3.1 计算流程 将输入矩阵进行分块,然后使用以下流程计算Cholesky分解: ![image](timeline.png) @@ -74,7 +151,7 @@ l_{11}l_{41} & l_{21}l_{41} + l_{22}l_{42} & l_{31}l_{41} + l_{32}l_{42} + l_{33 SYRK(HERK)、GEMM和TRSM均为标准BLAS库中的操作,POTRF为计算对角块(完整矩阵的对角元素所在的块)内部依赖的kernel。下面将按照计算顺序依次介绍。 -### 2.1 SYRK(HERK) +#### SYRK(HERK) SYRK是BLAS的标准操作(数据类型是复数时为HERK),定义为: ```math @@ -82,12 +159,12 @@ C=\alpha AA^T+\beta C ``` 其中$`C`$为$`n\times n`$的方阵,$`A`$为$`n\times m`$的矩阵,$`\alpha`$和$`\beta`$是标量。 -此处使用SYRK是为了计算橙色块的外部依赖,上式中的$`C`$代表橙色对角块(完整矩阵的对角元素所在的块),$`A`$代表橙色块左侧的所有黄色块,$`\alpha`$、$`\beta`$分别取-1和1。 +此处使用SYRK是为了计算橙色块的外部依赖,上式中的$`C`$代表橙色对角块(完整矩阵的对角元素所在的块),$`A`$代表橙色块左侧的所有黄色块,$`\alpha`$、$`\beta`$分别取-1和1。使用矩阵乘法算子可以实现此操作。 ![image](syrk.png) 图2 syrk示意 -### 2.2 GEMM +#### GEMM GEMM是BLAS的标准操作,定义为: ```math @@ -95,12 +172,12 @@ C=\alpha AB+\beta C ``` 其中$`C`$,$`A`$,$`B`$分别是$`m\times n`$,$`m\times k`$,$`k\times n`$的矩阵,$`\alpha`$和$`\beta`$是标量。 -这里使用GEMM计算蓝色非对角块的外部依赖,上式的$`C`$代表蓝色块,$`A`$和$`B`$分别代表橙色块左侧的黄色块和蓝色块左侧的黄色块。$`\alpha`$和$`\beta`$分别为-1和1。 +这里使用GEMM计算蓝色非对角块的外部依赖,上式的$`C`$代表蓝色块,$`A`$和$`B`$分别代表橙色块左侧的黄色块和蓝色块左侧的黄色块。$`\alpha`$和$`\beta`$分别为-1和1。和SYRK(HERK)相同,使用矩阵乘法算子可以实现此操作 ![image](gemm.png) 图3 gemm示意 -### 2.3 TRSM +#### TRSM TRSM是BLAS的标准函数,定义为: ```math @@ -112,8 +189,28 @@ XA=\alpha B ![image](trsm.png) 图4 trsm示意 +trsm的实现可以依靠对A矩阵求逆来完成。上式 +```math +XA=\alpha B +``` +可以变换成如下形式: +```math +X=\alpha BA^{-1} +``` +因此可以将trsm转换计算矩阵A和计算矩阵乘法两个部分。计算矩阵A的逆矩阵时,由于A矩阵为三角阵,可以减少计算量。假设A为下三角阵,计算流程为: +* 计算对角元素: +```math +M_{ii} = \frac{1}{L_{ii}} +``` +对于$`i = 1, 2, \ldots, n`$。 -### 3.4 POTRF +* 计算第$`i`$行其他元素,从第$`i`$列开始向左求解: +```math +M_{ij} = -\frac{1}{L_{ii}} \sum_{k=i+1}^{j} L_{ik} M_{kj} +``` +上述计算过程利用矩阵的下三角结构大大简化了计算复杂度。 + +#### POTRF POTRF这个函数名取自LAPACK中Cholesky分解的函数,POTRF的目的是计算橙色对角块的所有依赖,POTRF执行后对角块中的所有元素计算完毕。 @@ -143,53 +240,59 @@ POTRF这个函数名取自LAPACK中Cholesky分解的函数,POTRF的目的是 每个列块,仍然需要先计算该列块的外部依赖(该列块左侧的所有列块),然后对列块中的每一列分别计算内部依赖,对于这两个部分可以分别用两个kernel来实现。由于这一步骤是严重的串行瓶颈,因此在划分小块时需要尽量让计算的快更小,减少串行瓶颈对性能的影响 -## 3 MLU层需求分析 - -### 3.1 算子需求分析 - -| 算子功能简介 | 对厄密特矩阵进行Cholesky分解 | -| :----------------------------------------------------------: | :--------------------------: | -| 需求来源 | pytorch | -| 应用网络 | - | -| 输入数据类型 | float/complex float | -| 输入Shape | [batch,N,N] | -| 输入Layout | input/output:ARRAY | -| 输出数据类型 | float/complex float | -| 输出Shape | [batch,N,N] | -| 输出Layout | ARRAY | -| 模式 | 无 | -| 是否含有 dim/axis 等类似语义的参数且该参数支持负数/其他特殊处理 | 无 | -| 是否含有 labels/index 等类似语义的参数且该参数支持负数/界外情况/其他特殊处理 | 无 | -| 是否需要支持原位 | 是 | -| 是否需要支持stride机制 | 是 | -| 是否需要支持广播 | 否 | -| 0元素检查是否直接返回 | 无 | -| 
其他特殊需求 | 无 | -| 本次开发优先支持的规模/模式 | batch<=32,N<=3072 | - -### 3.2 算子功能和应用场景描述 - -厄密特矩阵,又称自伴随矩阵,是共轭对称的方阵。 - -对正定厄密特矩阵$`A`$进行Cholesky分解,即求矩阵$`L`$使下式成立: -```math -A=LL^* -``` -其中,$`L`$是一个下三角矩阵且对角元素均为正实数,$`L^*`$表示$`L`$的共轭转置,是一个上三角矩阵。当$`A`$是一个实数矩阵时,Cholesky分解可以改写为 -```math -A=LL^T -``` - -### 3.3 算子输入输出参数要求 - -| 参数 | 语义 | 类型 | 支持类型 | 物理布局 | 规模限制 | -| :---------: | :------------: | :--: | :------------------: | :---------: | :---------------: | -| handle | | 句柄 | | / | 无 | -| input_desc | 矩阵描述符 | 输入 | | | | -| d_input | 输入矩阵 | 输入 | float、complex float | [batch,N,N] | batch<=32,N<=3072 | -| output_desc | 输出矩阵描述符 | 输入 | float、complex float | | | -| d_output | 输出矩阵 | 输出 | | [batch,N,N] | | -| upper | 上三角/下三角 | 输入 | bool | | | +### 3.2 测试 + +#### 3.2.1 测试样例构造 +测试用例覆盖多种类型。按照数据类型(float,complex float),矩阵维度(单batch、多batch),输出矩阵为上三角/下三角(即输入参数upper为True/False),是否将矩阵还原(是否将分解出的L和U矩阵相乘),可以得到16种类型,对每种类型分别测试,diff1,diff2,diff3_2结果均小于动态阈值。 + +#### 3.2.2 性能测试 +float类型单batch性能测试如下,表格中数字为运行时间,单位为微秒(us),最右侧一列为mlu的运行时间与pytorch在gpu上的运行时间的比值: +| 规模 | pytorch | mlu | mlu/pytorch | +| ---- | ------- | ----- | ----------- | +| 64 | 75.9 | 280 | 3.689065 | +| 256 | 161.5 | 1177 | 7.287926 | +| 1024 | 709 | 5576 | 7.864598 | +| 3000 | 3182 | 24220 | 7.611565 | + +float类型多batch性能测试: + +| 规模 | pytorch | mlu | mlu/pytorch | +| ------- | ------- | ------ | ----------- | +| 32,64 | 118 | 502 | 4.254237 | +| 16,512 | 1003 | 5405 | 5.388833 | +| 32,3000 | 97264 | 143560 | 1.475983 | + +float类型的cholesky分解在mlu端运行时间在pytorch运行时间的10倍以内。 +complex类型单batch性能测试: + +| 规模 | pytorch | mlu | mlu/pytorch | +| ---- | ------- | ----- | ----------- | +| 16 | 56 | 68 | 1.214286 | +| 64 | 73 | 612 | 8.383562 | +| 128 | 110 | 1465 | 13.31818 | +| 3000 | 4826 | 76277 | 15.80543 | + +complex类型多batch性能测试: + +| 规模 | pytorch | mlu | mlu/pytorch | +| -------- | ------- | ------ | ----------- | +| 32, 16 | 56 | 68 | 1.214286 | +| 32, 64 | 73 | 612 | 8.383562 | +| 32, 128 | 218 | 3786 | 17.36697 | +| 4, 1024 | 2698 | 24535 | 9.093773 | +| 32, 3000 | 132817 | 922743 | 6.947477 | + +对于mlu/pytorch>10的规模,例如batch为32,N为128时,使用cnperf-cli进行性能分析,如下图所示 +![image](32_128性能分析.png) + +图中红框中为调用底层的矩阵乘法,且由于没有复数类型矩阵乘法的底层实现,当前复数矩阵乘是由4个float类型矩阵乘拼接而成。可以看到矩阵乘法的时间占比总和已经达到了60%,矩阵乘法所占用时间超过了2000微秒,已经超过了pytorch运行时间的10倍。 + +### 3.3 防呆检查 +算子中做了如下检查: +* 所有指针不为NULL +* 输入输出矩阵的维度为2或者3 +* 输入输出矩阵维度数相等 +* 输入输出矩阵的后两个维度数目相同 diff --git a/kernels/cholesky/cholesky.cpp b/kernels/cholesky/cholesky.cpp index 7ab0f3d81..4e47a8977 100644 --- a/kernels/cholesky/cholesky.cpp +++ b/kernels/cholesky/cholesky.cpp @@ -1,7 +1,5 @@ #include "cholesky.h" - - mluOpStatus_t MLUOP_WIN_API mluOpGetCholeskyWorkspace(mluOpTensorDescriptor_t input_desc, size_t* size, float** workspace) { PARAM_CHECK("mluOpCholesky", input_desc != NULL); @@ -19,7 +17,8 @@ mluOpStatus_t MLUOP_WIN_API mluOpGetCholeskyWorkspace(mluOpTensorDescriptor_t in mluOpDataType_t dtype = input_desc->dtype; PARAM_CHECK("mluOpCholesky", dtype == MLUOP_DTYPE_FLOAT || dtype == MLUOP_DTYPE_COMPLEX_FLOAT); - int type_size = (dtype == MLUOP_DTYPE_FLOAT) ? 
4 : 8; + unsigned long type_size; + MLUOP_CHECK(mluOpGetSizeOfDataType(dtype, &type_size)); long int size_a = 0, lda = 0, size_c = 0, ldc = 0; long int batch_size = 1; int dim = input_desc->dim; @@ -35,11 +34,11 @@ mluOpStatus_t MLUOP_WIN_API mluOpGetCholeskyWorkspace(mluOpTensorDescriptor_t in if (dtype == MLUOP_DTYPE_FLOAT) { - *size = size_a*size_a*sizeof(float)*2*batch_size; + *size = size_a*size_a*sizeof(float)*batch_size*3; } else { - *size = size_a*size_a*sizeof(float)*2*batch_size; + *size = size_a*size_a*sizeof(float)*2*batch_size*3; } printf("workspace size:%ul\n",(int)(*size)); @@ -51,11 +50,23 @@ mluOpStatus_t MLUOP_WIN_API mluOpGetCholeskyWorkspace(mluOpTensorDescriptor_t in return MLUOP_STATUS_SUCCESS; } +mluOpStatus_t MLUOP_WIN_API mluOpFreeCholeskyWorkspace(float** workspace) +{ + PARAM_CHECK("mluOpCholesky", workspace != NULL); + if(*workspace != NULL) + { + CHECK_RETURN("mluOpCholesky", + workspace_free(workspace)); + *workspace = NULL; + } + return MLUOP_STATUS_SUCCESS; + +} + mluOpStatus_t MLUOP_WIN_API calculate_body(mluOpHandle_t handle,int batch_size, const mluOpTensorDescriptor_t input_desc,float* d_input, const mluOpTensorDescriptor_t output_desc, float* d_output,bool upper, float* workspace) { mluOpDataType_t dtype = input_desc->dtype; - printf("batch_size:%d\n",batch_size); int recnb = REC_NB; @@ -64,7 +75,8 @@ calculate_body(mluOpHandle_t handle,int batch_size, const mluOpTensorDescriptor_ bool is_row_major = (input_desc->strides)[dim-1]==1; - int type_size = (dtype == MLUOP_DTYPE_FLOAT) ? 4 : 8; + unsigned long type_size; + MLUOP_CHECK(mluOpGetSizeOfDataType(dtype, &type_size)); int size_a = 0, lda = 0, size_c = 0, ldc = 0; if(dim == 2) { @@ -81,18 +93,13 @@ calculate_body(mluOpHandle_t handle,int batch_size, const mluOpTensorDescriptor_ ldc = output_desc->dims[2]; } - - float* work_space; - float* work_space_h; - CNRT_CHECK(cnrtMalloc((void **)&work_space, NB*NB*sizeof(float))); - CNRT_CHECK(cnrtMemset(work_space, 0, NB*NB*sizeof(float))); - work_space_h = (float*)malloc(((unsigned long)batch_size)*2*lda*lda*sizeof(float)); PARAM_CHECK("mluOpCholesky", lda >= size_a); PARAM_CHECK("mluOpCholesky", ldc >= size_c); cnrtQueue_t queue; mluOpGetQueue(handle,&queue); + int jb; const float s_one = 1.0; const float s_neg_one = -1.0; @@ -119,6 +126,9 @@ calculate_body(mluOpHandle_t handle,int batch_size, const mluOpTensorDescriptor_ cnrtQueueSync(queue); int stride = size_a*lda; + + + if(dtype == MLUOP_DTYPE_FLOAT) { @@ -130,23 +140,23 @@ calculate_body(mluOpHandle_t handle,int batch_size, const mluOpTensorDescriptor_ { jb = std::min(nb, row-j); CHECK_RETURN("mluOpCholesky", - ssyrk(batch_size,stride,false,is_row_major,jb,j,OFFSET_ROW(d_output,j,0),lda,OFFSET_ROW(d_output,j,j),lda,handle)); + ssyrk(batch_size,stride,false,is_row_major,jb,j,OFFSET_ROW(d_output,j,0),lda,OFFSET_ROW(d_output,j,j),lda,handle,workspace)); cnrtQueueSync(queue); CHECK_RETURN("mluOpCholesky", - mlu_spotrf_rectile(batch_size,stride,is_row_major,false,jb,recnb,OFFSET_ROW(d_output,j,j),lda,j, handle)); + mlu_spotrf_rectile(batch_size,stride,is_row_major,false,jb,recnb,OFFSET_ROW(d_output,j,j),lda,j, handle,workspace)); if(j+jb < row) { CHECK_RETURN("mluOpCholesky", sgemm(batch_size, !is_row_major,is_row_major,row-j-jb,jb,j,-1.0f,1.0f, OFFSET_ROW(d_output,j+jb,0),lda,stride, OFFSET_ROW(d_output,j,0),lda,stride, - OFFSET_ROW(d_output,j+jb,j),lda,stride, handle)); + OFFSET_ROW(d_output,j+jb,j),lda,stride, handle,workspace)); cnrtQueueSync(queue); } if(j+jb < row) { CHECK_RETURN("mluOpCholesky", - 
strsm(batch_size, stride,false,is_row_major,jb,row-j-jb,OFFSET_ROW(d_output,j,j),lda,OFFSET_ROW(d_output,j+jb,j),lda, handle)); + strsm(batch_size, stride,false,is_row_major,jb,row-j-jb,OFFSET_ROW(d_output,j,j),lda,OFFSET_ROW(d_output,j+jb,j),lda, handle, workspace)); cnrtQueueSync(queue); } } @@ -165,11 +175,10 @@ calculate_body(mluOpHandle_t handle,int batch_size, const mluOpTensorDescriptor_ recnb = CREC_NB; int nb = CNB; int row = lda; - float* r_start = d_output; + float* r_start = d_output; float* i_start = d_output + size_a*lda; stride *= 2; - set_half_zero(batch_size, stride, r_start, lda, lda, handle); set_half_zero(batch_size, stride, i_start, lda, lda, handle); cnrtQueueSync(queue); @@ -178,10 +187,10 @@ calculate_body(mluOpHandle_t handle,int batch_size, const mluOpTensorDescriptor_ { jb = std::min(nb, row-j); CHECK_RETURN("mluOpCholesky", - cherk(batch_size,stride,jb,j,r_start+j*lda,i_start+j*lda,lda,r_start+j*lda+j,i_start+j*lda+j,lda,handle)); + cherk(batch_size,stride,jb,j,r_start+j*lda,i_start+j*lda,lda,r_start+j*lda+j,i_start+j*lda+j,lda,handle,workspace)); cnrtQueueSync(queue); CHECK_RETURN("mluOpCholesky", - mlu_cpotrf_rectile(batch_size,stride,jb,recnb,r_start+j*lda+j,i_start+j*lda+j,lda, handle)); + mlu_cpotrf_rectile(batch_size,stride,jb,recnb,r_start+j*lda+j,i_start+j*lda+j,lda, handle,workspace)); cnrtQueueSync(queue); if(j+jb < row) { @@ -189,7 +198,7 @@ calculate_body(mluOpHandle_t handle,int batch_size, const mluOpTensorDescriptor_ cgemm(batch_size, false,true,row-j-jb,jb,j,-1.0f,1.0f, OFFSET_ROW(r_start,j+jb,0),OFFSET_ROW(i_start,j+jb,0), lda,stride, OFFSET_ROW(r_start,j,0),OFFSET_ROW(i_start,j,0), lda,stride, - OFFSET_ROW(r_start,j+jb,j),OFFSET_ROW(i_start,j+jb,j), lda, stride, handle)); + OFFSET_ROW(r_start,j+jb,j),OFFSET_ROW(i_start,j+jb,j), lda, stride, handle, workspace)); cnrtQueueSync(queue); } @@ -197,10 +206,11 @@ calculate_body(mluOpHandle_t handle,int batch_size, const mluOpTensorDescriptor_ { CHECK_RETURN("mluOpCholesky", ctrsm(batch_size, stride,jb,row-j-jb,OFFSET_ROW(r_start,j,j),OFFSET_ROW(i_start,j,j),lda, - OFFSET_ROW(r_start,j+jb,j),OFFSET_ROW(i_start,j+jb,j),lda, handle)); + OFFSET_ROW(r_start,j+jb,j),OFFSET_ROW(i_start,j+jb,j),lda, handle,workspace)); cnrtQueueSync(queue); } - } + } + CHECK_RETURN("mluOpCholesky", transpose(batch_size,2,size_a*size_a,d_output,workspace,handle,MLUOP_DTYPE_FLOAT,workspace)); @@ -291,14 +301,10 @@ mluOpCholesky(mluOpHandle_t handle,const mluOpTensorDescriptor_t input_desc,floa ldc = output_desc->dims[2]; } - float* last_addr = d_input+((unsigned long)batch_size)*size_a*lda*2; - float* temp_addr = last_addr - 10; - - - - int type_size = (dtype == MLUOP_DTYPE_FLOAT) ? 
4 : 8; + unsigned long type_size; + MLUOP_CHECK(mluOpGetSizeOfDataType(dtype, &type_size)); if(type_size == 8 && batch_size > 16 && size_a > 2000) { int stride = 2*size_a*lda; @@ -311,9 +317,5 @@ mluOpCholesky(mluOpHandle_t handle,const mluOpTensorDescriptor_t input_desc,floa calculate_body(handle, batch_size, input_desc,d_input, output_desc, d_output, upper, workspace); } - - - - return MLUOP_STATUS_SUCCESS; } \ No newline at end of file diff --git a/kernels/cholesky/cholesky_union1.mlu b/kernels/cholesky/cholesky_union1.mlu index 39807bb4d..becaf6b9a 100644 --- a/kernels/cholesky/cholesky_union1.mlu +++ b/kernels/cholesky/cholesky_union1.mlu @@ -1,8 +1,24 @@ #include "cholesky.h" #include +unsigned int next_power_of_2(unsigned int n) { + if (n == 0) { + return 1; + } + + n--; + n |= n >> 1; + n |= n >> 2; + n |= n >> 4; + n |= n >> 8; + n |= n >> 16; + + return n + 1; +} + __nram__ uint8_t nram_buffer[MAX_NRAM_SIZE]; +__mlu_shared__ uint8_t sram_buffer[MAX_SRAM_SIZE]; __mlu_func__ float recur_add(float* input, int length) @@ -46,20 +62,23 @@ void sgemm_fixwidth_device(int m, int k, { int id = taskId % 4; - int span = POTF_NB; - + int span = POTF_NB; - __nram__ float rC[M * POTF_NB/TASK_NUM ]; - __nram__ float rA[M * POTF_NB/TASK_NUM ]; - __nram__ float rp[M * POTF_NB/TASK_NUM ]; + + + + float* rC = (float*)nram_buffer; + float* rA= rC + M * POTF_NB/TASK_NUM; + float* rp = rA + M * POTF_NB/TASK_NUM; - __nram__ float rB[POTF_NB * POTF_NB]; + float* rB= rp + M * POTF_NB/TASK_NUM; - __nram__ float temp_result[POTF_NB * POTF_NB]; + float* temp_result = rB + POTF_NB * POTF_NB; temp_result[0] = 0.0; - if(id*span0) { @@ -140,7 +154,7 @@ static __mlu_func__ void spotf2_sminout_fixsize_device(int m, float *A, int lda) float* temp_b = diag+iter*span; float* local_result = temp_result; float* local_diag = temp_result2; - + for(int i = 0; i < span; i++) { __bang_mul(local_result, temp_a, temp_b, iter); @@ -149,7 +163,7 @@ static __mlu_func__ void spotf2_sminout_fixsize_device(int m, float *A, int lda) temp_a = temp_a + span; local_diag = local_diag + span; } - + if(iter>1) { local_result = temp_result; @@ -180,22 +194,17 @@ static __mlu_func__ void spotf2_sminout_fixsize_device(int m, float *A, int lda) } factor = diag[iter*POTF_NB+iter]; - __nram__ float temp[1]; - __bang_rsqrt(temp,diag+iter*POTF_NB+iter,1); - factor = temp[0]; - - + factor = std::sqrt(factor); + factor = (1.0/factor); for(int i = 0; i < span; i++) { - + nram_src[i*POTF_NB+iter] *= factor; diag[i*POTF_NB+iter] *= factor; } __sync(); - - @@ -206,7 +215,7 @@ static __mlu_func__ void spotf2_sminout_fixsize_device(int m, float *A, int lda) if(id*span0) { - __memcpy(rB,A0,k*sizeof(float),SRAM2NRAM,NB*sizeof(float),lda*sizeof(float),span_b-1); - - } - - __sync_cluster(); @@ -287,10 +291,6 @@ void sgemm_anywidth_device(int m, int k, } - - - - } @@ -337,12 +337,11 @@ static __mlu_func__ void spotf2_sminout_anysize_device(int m, float *A, int lda) __mlu_func__ void spotf2_smlpout_fixwidth_device(const int m, float *A0, float *A, int lda, const int localstep, const int gbstep) { - int id = taskId % 4; - __mlu_shared__ float shared_data[SHARED_MEM_SIZE]; + int id = taskId % 4; + float* shared_data = (float*)sram_buffer; float* sdata_A = shared_data; float* sdata_B = shared_data + m *POTF_NB/TASK_NUM * 4; - sgemm_fixwidth_device(m, localstep, A0, lda, sdata_A, sdata_B); @@ -357,11 +356,11 @@ __mlu_func__ void spotf2_smlpout_fixwidth_device(const int m, float *A0, float * int span = POTF_NB; + if(id==0) { for(int i = 0; i < span; i++) { - 
__memcpy(A+(i*lda),sdata_A+i*POTF_NB,(i+1)*sizeof(float),SRAM2LDRAM); } @@ -374,8 +373,6 @@ __mlu_func__ void spotf2_smlpout_fixwidth_device(const int m, float *A0, float * __sync_cluster(); - - } __mlu_func__ void spotf2_smlpout_anywidth_device(const int m, float *A0, float *A, int lda, const int localstep, const int gbstep) @@ -383,7 +380,6 @@ __mlu_func__ void spotf2_smlpout_anywidth_device(const int m, float *A0, float * sgemm_anywidth_device(m, localstep, A0, lda, A, nullptr); - spotf2_sminout_anysize_device(m, A, lda); @@ -391,22 +387,19 @@ __mlu_func__ void spotf2_smlpout_anywidth_device(const int m, float *A0, float * - } __mlu_global__ void spotf2_smlpin_anywidth_kernel(int batch, int stride, bool trans, int m, float *dA, int lda, int localstep, int gbstep) { int id = taskId; - float* orignA = dA; - int batch_id = id / 4; if(batch_id >= batch) return; dA = orignA + batch_id * stride; - __mlu_shared__ float shared_data[NB * NB]; + float* shared_data = (float*)sram_buffer; if(m%4==0) { @@ -421,7 +414,6 @@ __mlu_global__ void spotf2_smlpin_anywidth_kernel(int batch, int stride, bool tr if(id == 0) { __memcpy(shared_data,dA,m*sizeof(float),GDRAM2SRAM,NB*sizeof(float),lda*sizeof(float),m-1); - } __sync_cluster(); @@ -437,7 +429,10 @@ __mlu_global__ void spotf2_smlpin_anywidth_kernel(int batch, int stride, bool tr __memcpy(dA,shared_data,m*sizeof(float),SRAM2GDRAM,lda*sizeof(float),NB*sizeof(float),m-1); } __sync_cluster(); - } + } + + + } @@ -512,14 +507,15 @@ __mlu_func__ void small_sminout_batch(int m, int width, float *dst, float *nram_ } __sync(); + + + } __sync(); - - } __mlu_func__ @@ -609,7 +605,6 @@ mluOpStatus_t mlu_spotf2_lpin(int batch, int stride, bool trans,bool uplo, int n spotf2_smlpin_anywidth_kernel<<>>(batch, stride, trans, n, dA, ldda, 0,gbstep)); } - return MLUOP_STATUS_SUCCESS; } @@ -627,15 +622,16 @@ __mlu_entry__ void mlu_strsm_rectile_batch_kernel( float* orignB = dB; dA = orignA + batch_id * stride; dB = orignB + batch_id * stride; - int span = n; int start = 0; - __nram__ float sA[8*POTF_NB]; - __nram__ float rB[4*POTF_NB * 8*POTF_NB]; - __nram__ float rC[4*POTF_NB * 8*POTF_NB]; - __nram__ float rBp[4*POTF_NB]; - __nram__ float rA[8*POTF_NB]; + + + float *sA = (float*)nram_buffer; + float *rB = sA + 8*POTF_NB; + float *rC = rB + 4*POTF_NB * 8*POTF_NB; + float *rBp = rC + 4*POTF_NB * 8*POTF_NB; + float *rA = rBp + 4*POTF_NB; int calc_length = (8 * POTF_NB) > m ? m : (8 * POTF_NB); __memset_nram(rB,POTF_NB*calc_length,(float)ZERO); __memset_nram(sA,calc_length*calc_length,(float)ZERO); @@ -650,6 +646,7 @@ __mlu_entry__ void mlu_strsm_rectile_batch_kernel( __memcpy(rBp,OFFSET_B_ROW(dB,start,0),sizeof(float),GDRAM2NRAM,sizeof(float), ldb * sizeof(float), span - 1); __sync(); + if(trans) { __memcpy_async(rA,sA,(1)*sizeof(float),NRAM2NRAM); @@ -733,6 +730,7 @@ __mlu_entry__ void mlu_strsm_rectile_kernel( float* orignB = dB; dA = orignA + batch_id * stride; dB = orignB + batch_id * stride; + int span = n / 4; @@ -743,15 +741,17 @@ __mlu_entry__ void mlu_strsm_rectile_kernel( } bool if_execute = span > 0; - __mlu_shared__ float sA[8*POTF_NB]; - __nram__ float rB[4*POTF_NB * 8*POTF_NB]; - __nram__ float rC[4*POTF_NB * 8*POTF_NB]; - __nram__ float rBp[4*POTF_NB]; - __nram__ float rA[8*POTF_NB]; + float* sA = (float*)sram_buffer; + + float* rB = (float*)nram_buffer; + float* rC = rB + 4*POTF_NB * 8*POTF_NB; + float* rBp = rC + 4*POTF_NB * 8*POTF_NB; + float* rA = rBp + 4*POTF_NB; int calc_length = (8 * POTF_NB) > m ? 
m : (8 * POTF_NB); __memset_nram(rB,POTF_NB*calc_length,(float)ZERO); + float temp_b = 0, factor = 0; float sum = 0.0; float c = 0.0; @@ -771,7 +771,6 @@ __mlu_entry__ void mlu_strsm_rectile_kernel( __memcpy(rBp,OFFSET_B_ROW(dB,start,0),sizeof(float),LDRAM2NRAM,sizeof(float), ldb * sizeof(float), span - 1); __sync_cluster(); - if(trans) { __memcpy_async(rA,sA,(1)*sizeof(float),SRAM2NRAM); @@ -815,9 +814,9 @@ __mlu_entry__ void mlu_strsm_rectile_kernel( for(int j = 0; j < iter; j++) { - temp_b = rC[i*calc_length+j] - c; - t = sum + temp_b; - c = (t - sum) - temp_b; + temp_b = rC[i*calc_length+j] - c; + t = sum + temp_b; + c = (t - sum) - temp_b; sum = t; } temp_b = sum; @@ -841,11 +840,12 @@ __mlu_entry__ void mlu_strsm_rectile_kernel( c = 0.0; t = 0.0; temp_b = 0; + for(int j = 0; j < m-1; j++) { - temp_b = rC[i*calc_length+j] - c; - t = sum + temp_b; - c = (t - sum) - temp_b; + temp_b = rC[i*calc_length+j] - c; + t = sum + temp_b; + c = (t - sum) - temp_b; sum = t; } temp_b = sum; @@ -900,6 +900,19 @@ mluOpStatus_t strsm_rectile(int batch, int stride, bool upper, bool trans, int m { func_type = CNRT_FUNC_TYPE_UNION8; carry_batch = batch < 8 ? 8 : batch; + if(batch <= 8) + { + carry_batch = 8; + } + else if(batch <= 16) + { + carry_batch = 16; + } + else + { + carry_batch = 32; + + } } dim.x = carry_batch * 4; @@ -914,6 +927,9 @@ mluOpStatus_t strsm_rectile(int batch, int stride, bool upper, bool trans, int m return MLUOP_STATUS_SUCCESS; } + + + __mlu_global__ void add_c_batch(int batch, int stride, float beta, float *d_c, float* src,int ldc, int ldsrc, int m, int n) { @@ -974,10 +990,7 @@ void add_c(int batch, int stride, float beta, float *d_c, float* src,int ldc, in float* orignSrc = src; d_c = orignC + batch_id * stride; src = orignSrc + batch_id * m*n; - - - __mlu_shared__ uint8_t sram_buffer[MAX_SRAM_SIZE]; if (beta == 0.0f) { if(id == 0) @@ -1016,7 +1029,6 @@ void add_c(int batch, int stride, float beta, float *d_c, float* src,int ldc, in } int32_t align_num = NFU_ALIGN_SIZE / sizeof(float); - int32_t data_nram_num = MAX_NRAM_SIZE / sizeof(float) / 2 / align_num * align_num; float *a_nram = (float *)nram_buffer; @@ -1055,8 +1067,9 @@ void add_c(int batch, int stride, float beta, float *d_c, float* src,int ldc, in } + -mluOpStatus_t sgemm(int batch, bool trans_a, bool trans_b, int m, int n, int k, float alpha, float beta, float* d_a,int lda, int stride_a, float* d_b, int ldb, int stride_b, float* d_c, int ldc, int stride_c, mluOpHandle_t handle) +mluOpStatus_t sgemm(int batch, bool trans_a, bool trans_b, int m, int n, int k, float alpha, float beta, float* d_a,int lda, int stride_a, float* d_b, int ldb, int stride_b, float* d_c, int ldc, int stride_c, mluOpHandle_t handle, float* workspace) { if(k==0) return MLUOP_STATUS_SUCCESS; @@ -1089,6 +1102,7 @@ mluOpStatus_t sgemm(int batch, bool trans_a, bool trans_b, int m, int n, int k, &(max_batch_dim), sizeof(int32_t))); + mluOpTensorDescriptor_t matmul_a_desc, matmul_b_desc, matmul_c_desc; @@ -1121,10 +1135,9 @@ mluOpStatus_t sgemm(int batch, bool trans_a, bool trans_b, int m, int n, int k, int requested_algo_count = 1, return_algo_count = 0; - float *workspace; size_t workspace_size; - + cnnlGetStrideBatchMatMulAlgoHeuristic( cnnl_handle, stride_bmm_desc, cnnl_a_desc, cnnl_b_desc, cnnl_c_desc, cnnl_d_desc, trans_a, trans_b, false, &(alpha), &(beta), m, n, k, lda, ldb, ldc, batch_size_arr, stride_a_arr, stride_b_arr, @@ -1135,22 +1148,19 @@ mluOpStatus_t sgemm(int batch, bool trans_a, bool trans_b, int m, int n, int k, if(workspace_size 
> 0) { - CNRT_CHECK(cnrtMalloc((void **)&workspace, workspace_size)); - } - else - { - CNRT_CHECK(cnrtMalloc((void **)&workspace, m*n*sizeof(float))); + printf("sgemm workspace size:%zu\n",workspace_size); } - - CALL_CNNL(cnnlStrideBatchMatMul_v2( cnnl_handle, stride_bmm_desc, algo, trans_a, trans_b, false, m, n, k, batch_size_arr, &(alpha), cnnl_a_desc, d_a, lda, stride_a_arr, cnnl_b_desc, d_b, ldb, stride_b_arr, &(beta), cnnl_c_desc, d_c, ldc, - stride_c_arr, cnnl_d_desc, d_c, workspace, workspace_size)); + stride_c_arr, cnnl_d_desc, d_c, workspace, workspace_size)); + + + return MLUOP_STATUS_SUCCESS; } @@ -1231,7 +1241,6 @@ void inverse_kernel(int batch, float *d_input, int ld_input, int stride_input, f float* orignOutput = d_output; d_input = orignInput + batch_id * stride_input; d_output = orignOutput + batch_id * stride_output; - __mlu_shared__ uint8_t sram_buffer[MAX_SRAM_SIZE]; if (id == 0) { @@ -1247,6 +1256,7 @@ void inverse_kernel(int batch, float *d_input, int ld_input, int stride_input, f span = m - 3 * span; } float* nram_offset = (float*)nram_buffer + id * 3 * m * m; + float* nram_src1 = nram_offset; float* nram_src2 = nram_src1 + m * m; float* mul_result = nram_src2 + m; @@ -1303,7 +1313,7 @@ void inverse_kernel(int batch, float *d_input, int ld_input, int stride_input, f __memcpy(d_output,sram_buffer,m*sizeof(float),SRAM2GDRAM,ld_output*sizeof(float), m*sizeof(float),m-1); } - + } @@ -1340,7 +1350,9 @@ __mlu_global__ void set_zero(int batch, int stride, bool upper, int m, float* d_ } } -mluOpStatus_t strsm(int batch, int stride, bool upper, bool trans, int m, int n, float* d_a, int lda, float* d_b, int ldb, mluOpHandle_t handle) + + +mluOpStatus_t strsm(int batch, int stride, bool upper, bool trans, int m, int n, float* d_a, int lda, float* d_b, int ldb, mluOpHandle_t handle,float* workspace) { if(n==0) return MLUOP_STATUS_SUCCESS; @@ -1350,8 +1362,6 @@ mluOpStatus_t strsm(int batch, int stride, bool upper, bool trans, int m, int n, cnrtQueue_t queue; mluOpGetQueue(handle,&queue); - int32_t *info; - CNRT_CHECK(cnrtMalloc((void **)&info, batch*sizeof(int32_t))); CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&matmul_a_desc)); CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&matmul_b_desc)); @@ -1375,8 +1385,7 @@ mluOpStatus_t strsm(int batch, int stride, bool upper, bool trans, int m, int n, DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_b_desc, cnnl_b_desc); DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(info_desc, cnnl_info_desc); - float* workspace; - CNRT_CHECK(cnrtMalloc((void **)&workspace, batch*m*m*sizeof(float))); + float* sgemm_workspace = workspace + batch * m * m; CNRT_CHECK(cnrtMemset(workspace, 0.0, batch*m*m*sizeof(float))); float* h_i; @@ -1427,11 +1436,15 @@ mluOpStatus_t strsm(int batch, int stride, bool upper, bool trans, int m, int n, KERNEL_CHECK(inverse_kernel<<>>(batch, d_a+m1*lda+m1,lda,stride, workspace2,m,m*m,m2)); } - sgemm(batch, false,false,m2,m1,m1,1.0f,0.0f,d_a+m1*lda,lda,stride,workspace1,m,m*m,workspace1+m1*m,m,m*m,handle); - sgemm(batch, false,false,m2,m2,m1,-1.0f,0.0f,workspace2,m,m*m,workspace1+m1*m,m,m*m,workspace1+m1*m,m,m*m,handle); + + + sgemm(batch, false,false,m2,m1,m1,1.0f,0.0f,d_a+m1*lda,lda,stride,workspace1,m,m*m,workspace1+m1*m,m,m*m,handle,sgemm_workspace); + sgemm(batch, false,false,m2,m2,m1,-1.0f,0.0f,workspace2,m,m*m,workspace1+m1*m,m,m*m,workspace1+m1*m,m,m*m,handle,sgemm_workspace); cnrtQueueSync(queue); + cnnlStrideBatchMatMul(cnnl_handle, false, true, n,m, m, batch, 1.0, cnnl_b_desc, d_b, ldb, stride, 
cnnl_a_desc, workspace, m, m*m, 0.0f, cnnl_b_desc, d_b, ldb, stride); + return MLUOP_STATUS_SUCCESS; } @@ -1441,6 +1454,23 @@ mluOpStatus_t set_half_zero(int batch,int stride,float* d_a, int lda, int m, mlu mluOpGetQueue(handle,&queue); cnrtDim3_t dim; cnrtFunctionType_t func_type = CNRT_FUNC_TYPE_UNION1; + int carry_batch = 0; + if(batch == 1) + { + carry_batch = 1; + } + else if(batch == 2) + { + carry_batch = 2; + } + else if(batch <= 4) + { + carry_batch = 4; + } + else if(batch <= 8) + { + carry_batch = 8; + } dim.x = 4 * batch; dim.y = 1; dim.z = 1; @@ -1449,35 +1479,17 @@ mluOpStatus_t set_half_zero(int batch,int stride,float* d_a, int lda, int m, mlu } -mluOpStatus_t ssyrk(int batch, int stride, bool upper, bool trans,int n, int k, float* d_a, int ldda, float* d_c, int lddc, mluOpHandle_t handle) +mluOpStatus_t ssyrk(int batch, int stride, bool upper, bool trans,int n, int k, float* d_a, int ldda, float* d_c, int lddc, mluOpHandle_t handle,float* workspace) { if(k==0) return MLUOP_STATUS_SUCCESS; - sgemm(batch, false,true,n,n,k,-1.0f,1.0f,d_a,ldda,stride,d_a,ldda,stride,d_c,lddc,stride,handle); + sgemm(batch, false,true,n,n,k,-1.0f,1.0f,d_a,ldda,stride,d_a,ldda,stride,d_c,lddc,stride,handle,workspace); cnrtQueue_t queue; mluOpGetQueue(handle,&queue); cnrtDim3_t dim; cnrtFunctionType_t func_type = CNRT_FUNC_TYPE_UNION1; - int carry_batch = batch; - if(batch == 1) - { - func_type = CNRT_FUNC_TYPE_UNION1; - } - else if(batch == 2) - { - func_type = CNRT_FUNC_TYPE_UNION2; - } - else if(batch <= 4) - { - func_type = CNRT_FUNC_TYPE_UNION4; - carry_batch = 4; - } - else - { - func_type = CNRT_FUNC_TYPE_UNION8; - carry_batch = batch < 8 ? 8 : batch; - } + int carry_batch = next_power_of_2(batch); dim.x = carry_batch * 4; dim.y = 1; dim.z = 1; @@ -1487,7 +1499,7 @@ mluOpStatus_t ssyrk(int batch, int stride, bool upper, bool trans,int n, int k, return MLUOP_STATUS_SUCCESS; } -mluOpStatus_t mlu_spotrf_rectile(int batch, int stride, bool trans, bool uplo, int n, int recnb, float* d_A, int lda, int gbstep, mluOpHandle_t handle) +mluOpStatus_t mlu_spotrf_rectile(int batch, int stride, bool trans, bool uplo, int n, int recnb, float* d_A, int lda, int gbstep, mluOpHandle_t handle, float* workspace) { cnrtQueue_t queue; mluOpGetQueue(handle,&queue); @@ -1502,12 +1514,14 @@ mluOpStatus_t mlu_spotrf_rectile(int batch, int stride, bool trans, bool uplo, i { int n1 = n/2; int n2 = n-n1; - mlu_spotrf_rectile(batch,stride,trans,uplo,n1,recnb,OFFSET_ROW(d_A,0,0),lda,gbstep, handle); + mlu_spotrf_rectile(batch,stride,trans,uplo,n1,recnb,OFFSET_ROW(d_A,0,0),lda,gbstep, handle,workspace); strsm_rectile(batch, stride, uplo,trans,n1,n2,OFFSET_ROW(d_A,0,0),lda,OFFSET_ROW(d_A,n1,0),lda,queue); - ssyrk(batch,stride,uplo,trans,n2,n1,d_A+n1*lda,lda,OFFSET_ROW(d_A,n1,n1),lda,handle); - mlu_spotrf_rectile(batch,stride,trans,uplo,n2,recnb,OFFSET_ROW(d_A,n1,n1),lda,gbstep+n1,handle); + ssyrk(batch,stride,uplo,trans,n2,n1,d_A+n1*lda,lda,OFFSET_ROW(d_A,n1,n1),lda,handle,workspace); + mlu_spotrf_rectile(batch,stride,trans,uplo,n2,recnb,OFFSET_ROW(d_A,n1,n1),lda,gbstep+n1,handle,workspace); + + } return MLUOP_STATUS_SUCCESS; } @@ -1550,6 +1564,7 @@ mluOpStatus_t transpose(int batch, int m, int n, float* d_input,float* d_output, CALL_CNNL(cnnlSetTransposeDescriptor(cnnl_trans_desc, input_dim, permute)); size_t size=0; + CALL_CNNL(cnnlGetTransposeWorkspaceSize(cnnl_handle, cnnl_in_desc, cnnl_trans_desc, &size)); @@ -1557,7 +1572,6 @@ mluOpStatus_t transpose(int batch, int m, int n, float* d_input,float* d_output, if(size > 
0ul) { - printf("transpose2 need size: %zu\n",size); } diff --git a/kernels/cholesky/complex_cholesky_union1.mlu b/kernels/cholesky/complex_cholesky_union1.mlu index 9bb8f448b..d6b36498a 100644 --- a/kernels/cholesky/complex_cholesky_union1.mlu +++ b/kernels/cholesky/complex_cholesky_union1.mlu @@ -2,6 +2,7 @@ #define COMPLEX_OFFSET(A,off) (((float*)A) + (2 * (off))) #define COMPLEX_TYPE_SIZE ((2) * sizeof(float)) __nram__ uint8_t nram_buffer[MAX_NRAM_SIZE]; +__mlu_shared__ uint8_t sram_buffer[MAX_SRAM_SIZE]; __mlu_func__ void small_cgemm(int m,int k, @@ -90,7 +91,7 @@ void small_cgemm(int m,int k, __bang_sub(rp,rp,rC,CPOTF_NB * span); __bang_sub(ip,ip,iC,CPOTF_NB * span); - + if(if_execute) { __memcpy(rdst,rp,span*CPOTF_NB*sizeof(float),NRAM2NRAM); @@ -127,7 +128,6 @@ void small_cminout(int m, int width, __memcpy(rdiag,sram_buffer,width*CPOTF_NB*sizeof(float),SRAM2NRAM); __memcpy(idiag,sram_buffer+CPOTF_NB*CPOTF_NB,width*CPOTF_NB*sizeof(float),SRAM2NRAM); - for(int iter = 0; iter < width; iter++) { factor = sqrt(rdiag[(iter * CPOTF_NB+iter)]); @@ -141,9 +141,9 @@ void small_cminout(int m, int width, rdst[(i * CPOTF_NB+iter)] *= factor; idst[(i * CPOTF_NB+iter)] *= factor; - + } - + __sync(); for(int i = iter+1; i < width; i++) { @@ -156,13 +156,13 @@ void small_cminout(int m, int width, b2 = idiag[(i*CPOTF_NB+iter)]; a3 = rdiag[(j*CPOTF_NB+iter)]; b3 = idiag[(j*CPOTF_NB+iter)]; - + rdst[(j * CPOTF_NB + i)] -= (a1*a2+b1*b2);//a4 idst[(j * CPOTF_NB + i)] -= (a2*b1-a1*b2);//b4 rdiag[(j * CPOTF_NB + i)] -= (a3*a2+b3*b2);//a5 idiag[(j * CPOTF_NB + i)] -= (a2*b3-a3*b2);//b5 - + } @@ -183,13 +183,12 @@ __mlu_func__ void cmplout(int batch, const int m, float *rA0, float *rA,float *i int remain = m - finish; bool if_execute = remain > 0; int span = (remain > CPOTF_NB||remain <= 0) ? 
CPOTF_NB : remain; - __mlu_shared__ uint8_t sram_buffer[MAX_SRAM_SIZE]; float* dst = (float*)nram_buffer; small_cgemm(m, localstep, rA0, iA0, lda, width, (float*)sram_buffer, dst); __sync_cluster(); - + small_cminout(m, width, dst, (float*)sram_buffer, CPOTF_NB); @@ -218,8 +217,6 @@ __mlu_func__ void cmplout(int batch, const int m, float *rA0, float *rA,float *i } - - __mlu_func__ void small_cgemm_batch(int batch, int m,int k, float* rA0, float* iA0, const int lda, @@ -227,6 +224,7 @@ void small_cgemm_batch(int batch, int m,int k, { int ldk = k; int ldm = m; + float* r_dst2 = i_dst + m * width; float* i_dst2 = r_dst2 + m * width; float* r_src1 = i_dst2 + m * width; @@ -260,8 +258,6 @@ void small_cgemm_batch(int batch, int m,int k, __memcpy(i_src2, i_src1, ldk*width*sizeof(float),NRAM2NRAM); - - float a1,a2,b1,b2; for(int i = 0; i < m; i++) { @@ -305,7 +301,6 @@ void small_cminout_batch(int m, int width, if (r_diag[iter*width+iter]<0) { - printf("cccnm\n"); printf("iter:%d,taskId:%d\n",iter,taskId); } factor = sqrt(r_diag[iter*width+iter]); @@ -333,7 +328,7 @@ void small_cminout_batch(int m, int width, } __sync(); - + } __mlu_func__ @@ -446,8 +441,6 @@ void add_c1(int batch, int stride, float beta, float *d_c, float* src,int ldc, i src = orignSrc + batch_id * m*n; - - __mlu_shared__ uint8_t sram_buffer[MAX_SRAM_SIZE]; if (beta == 0.0f) { if(id == 0) @@ -569,6 +562,7 @@ void complex_add_c(int batch, int stride, float beta, float *d_c, float* src,int __memcpy(a_sram,src+src_offset,n*span*sizeof(float),LDRAM2NRAM); + int32_t data_per_core = span*n; int32_t data_last_core = data_per_core; const float *a_offset = a_sram; @@ -625,6 +619,14 @@ mluOpStatus_t workspace_malloc(size_t size, float** workspace) return MLUOP_STATUS_SUCCESS; } +mluOpStatus_t workspace_free(float** workspace) +{ + CNRT_CHECK(cnrtFree((void *)(*workspace))); + + return MLUOP_STATUS_SUCCESS; +} + + __mlu_global__ void complex_inverse_kernel(int batch, float *rd_input, float *id_input, int ld_input, int stride_input, float* rd_output, float* id_output, int ld_output, int stride_output, int m) { @@ -641,8 +643,7 @@ void complex_inverse_kernel(int batch, float *rd_input, float *id_input, int ld_ id_input = origin_i_input + batch_id * stride_input; rd_output = origin_r_output + batch_id * stride_output; id_output = origin_i_output + batch_id * stride_output; - - + int span = m/4; int start = id * span; @@ -651,7 +652,6 @@ void complex_inverse_kernel(int batch, float *rd_input, float *id_input, int ld_ span = m - 3 * span; } float* nram_offset = (float*)nram_buffer; - //diag_start:m*m ld:m float* rdiag_start = (float*)nram_offset; float* idiag_start = rdiag_start + m * m; float* r_nram_src1 = idiag_start + m * m; @@ -696,13 +696,14 @@ void complex_inverse_kernel(int batch, float *rd_input, float *id_input, int ld_ { float r_temp = 0.0; float i_temp = 0.0; - + __bang_mul(r_mul_result,r_nram_src2,r_nram_src1+j*height,i); __bang_mul(i_mul_result,r_nram_src2,i_nram_src1+j*height,i); for(int k = 0; k< i; k++) { r_temp += r_mul_result[k]; i_temp += i_mul_result[k]; + } __bang_mul(r_mul_result,i_nram_src2,i_nram_src1+j*height,i); __bang_mul(i_mul_result,i_nram_src2,r_nram_src1+j*height,i); @@ -735,7 +736,7 @@ void complex_inverse_kernel(int batch, float *rd_input, float *id_input, int ld_ __memcpy(id_output + ld_output * start + start,i_nram_dst,span*sizeof(float),NRAM2LDRAM,ld_output*sizeof(float),span*sizeof(float),height-1); } - + } @@ -775,6 +776,7 @@ void complex_batch_inverse_kernel(int batch, float *rd_input, float* id_input, i 
__memset_nram(nram_offset, 10 * m * m, (float)ZERO); + __memcpy(r_nram_dst,rd_input,m*sizeof(float),GDRAM2NRAM,m*sizeof(float),ld_input*sizeof(float),m-1); __memcpy(i_nram_dst,id_input,m*sizeof(float),GDRAM2NRAM,m*sizeof(float),ld_input*sizeof(float),m-1); float result = 0.0; @@ -831,10 +833,7 @@ void complex_batch_inverse_kernel(int batch, float *rd_input, float* id_input, i - - - -mluOpStatus_t cgemm(int batch, bool trans_a, bool trans_b, int m, int n, int k, float alpha, float beta, float* d_ra, float* d_ia, int lda, int stride_a, float* d_rb, float* d_ib, int ldb, int stride_b, float* d_rc, float* d_ic, int ldc, int stride_c, mluOpHandle_t handle) +mluOpStatus_t cgemm(int batch, bool trans_a, bool trans_b, int m, int n, int k, float alpha, float beta, float* d_ra, float* d_ia, int lda, int stride_a, float* d_rb, float* d_ib, int ldb, int stride_b, float* d_rc, float* d_ic, int ldc, int stride_c, mluOpHandle_t handle,float* workspace) { if(k==0) return MLUOP_STATUS_SUCCESS; @@ -842,8 +841,8 @@ mluOpStatus_t cgemm(int batch, bool trans_a, bool trans_b, int m, int n, int k, cnrtQueue_t queue; mluOpGetQueue(handle,&queue); - - + + float *r_c, *i_c; r_c = d_rc; i_c = d_ic; @@ -852,26 +851,22 @@ mluOpStatus_t cgemm(int batch, bool trans_a, bool trans_b, int m, int n, int k, int s_stride_b = stride_b; int s_stride_c = stride_c; - - sgemm(batch,trans_a,trans_b,m,n,k,alpha,beta,d_ra,lda,s_stride_a,d_rb,ldb,s_stride_b,r_c,ldc,s_stride_c,handle); + sgemm(batch,trans_a,trans_b,m,n,k,alpha,beta,d_ra,lda,s_stride_a,d_rb,ldb,s_stride_b,r_c,ldc,s_stride_c,handle,workspace); cnrtQueueSync(queue); - sgemm(batch,trans_a,trans_b,m,n,k,alpha,1,d_ia,lda,s_stride_a,d_ib,ldb,s_stride_b,r_c,ldc,s_stride_c,handle); + sgemm(batch,trans_a,trans_b,m,n,k,alpha,1,d_ia,lda,s_stride_a,d_ib,ldb,s_stride_b,r_c,ldc,s_stride_c,handle,workspace); cnrtQueueSync(queue); - sgemm(batch,trans_a,trans_b,m,n,k,-alpha,beta,d_ra,lda,s_stride_a,d_ib,ldb,s_stride_b,i_c,ldc,s_stride_c,handle); + sgemm(batch,trans_a,trans_b,m,n,k,-alpha,beta,d_ra,lda,s_stride_a,d_ib,ldb,s_stride_b,i_c,ldc,s_stride_c,handle,workspace); cnrtQueueSync(queue); - sgemm(batch,trans_a,trans_b,m,n,k,alpha,1,d_ia,lda,s_stride_a,d_rb,ldb,s_stride_b,i_c,ldc,s_stride_c,handle); + sgemm(batch,trans_a,trans_b,m,n,k,alpha,1,d_ia,lda,s_stride_a,d_rb,ldb,s_stride_b,i_c,ldc,s_stride_c,handle,workspace); cnrtQueueSync(queue); - - - return MLUOP_STATUS_SUCCESS; } -mluOpStatus_t cgemm_real(int batch, bool trans_a, bool trans_b, int m, int n, int k, float alpha, float beta, float* d_ra, float* d_ia, int lda, int stride_a, float* d_rb, float* d_ib, int ldb, int stride_b, float* d_rc, float* d_ic, int ldc, int stride_c, mluOpHandle_t handle) +mluOpStatus_t cgemm_real(int batch, bool trans_a, bool trans_b, int m, int n, int k, float alpha, float beta, float* d_ra, float* d_ia, int lda, int stride_a, float* d_rb, float* d_ib, int ldb, int stride_b, float* d_rc, float* d_ic, int ldc, int stride_c, mluOpHandle_t handle, float* cgemm_workspace) { if(k==0) return MLUOP_STATUS_SUCCESS; @@ -879,8 +874,8 @@ mluOpStatus_t cgemm_real(int batch, bool trans_a, bool trans_b, int m, int n, in cnrtQueue_t queue; mluOpGetQueue(handle,&queue); - float *workspace = NULL; - CNRT_CHECK(cnrtMalloc((void **)&workspace, ((unsigned long)batch)*sizeof(float)*2*(m*k))); + float *workspace = cgemm_workspace; + float* sgemm_workspace = cgemm_workspace + ((unsigned long)batch)*2*(m*k); float* copy_ra = workspace; float* copy_ia = copy_ra + ((unsigned long)batch)*m*k; int copy_lda = k; @@ -894,44 +889,34 
@@ mluOpStatus_t cgemm_real(int batch, bool trans_a, bool trans_b, int m, int n, in k*sizeof(float), m, CNRT_MEM_TRANS_DIR_DEV2DEV)); } - float *r_c, *i_c; r_c = d_rc; i_c = d_ic; - + int s_stride_b = stride_b; int s_stride_c = stride_c; - sgemm(batch,trans_a,trans_b,m,n,k,alpha,beta,copy_ra,copy_lda,copy_stride_a,d_rb,ldb,s_stride_b,r_c,ldc,s_stride_c,handle); + sgemm(batch,trans_a,trans_b,m,n,k,alpha,beta,copy_ra,copy_lda,copy_stride_a,d_rb,ldb,s_stride_b,r_c,ldc,s_stride_c,handle,sgemm_workspace); cnrtQueueSync(queue); - sgemm(batch,trans_a,trans_b,m,n,k,-alpha,1,copy_ia,copy_lda,copy_stride_a,d_ib,ldb,s_stride_b,r_c,ldc,s_stride_c,handle); + sgemm(batch,trans_a,trans_b,m,n,k,-alpha,1,copy_ia,copy_lda,copy_stride_a,d_ib,ldb,s_stride_b,r_c,ldc,s_stride_c,handle,sgemm_workspace); cnrtQueueSync(queue); - sgemm(batch,trans_a,trans_b,m,n,k,alpha,beta,copy_ra,copy_lda,copy_stride_a,d_ib,ldb,s_stride_b,i_c,ldc,s_stride_c,handle); + sgemm(batch,trans_a,trans_b,m,n,k,alpha,beta,copy_ra,copy_lda,copy_stride_a,d_ib,ldb,s_stride_b,i_c,ldc,s_stride_c,handle,sgemm_workspace); cnrtQueueSync(queue); - sgemm(batch,trans_a,trans_b,m,n,k,alpha,1,copy_ia,copy_lda,copy_stride_a,d_rb,ldb,s_stride_b,i_c,ldc,s_stride_c,handle); + sgemm(batch,trans_a,trans_b,m,n,k,alpha,1,copy_ia,copy_lda,copy_stride_a,d_rb,ldb,s_stride_b,i_c,ldc,s_stride_c,handle,sgemm_workspace); cnrtQueueSync(queue); - - - - - - + return MLUOP_STATUS_SUCCESS; } - - - - -mluOpStatus_t complex_inverse(int batch, float *rd_input, float *id_input, int ld_input, int stride_input, float* rd_output, float* id_output, int ld_output, int stride_output, int m, mluOpHandle_t handle) +mluOpStatus_t complex_inverse(int batch, float *rd_input, float *id_input, int ld_input, int stride_input, float* rd_output, float* id_output, int ld_output, int stride_output, int m, mluOpHandle_t handle, float* workspace) { int inverse_rec = 16; cnrtQueue_t queue; @@ -967,64 +952,56 @@ mluOpStatus_t complex_inverse(int batch, float *rd_input, float *id_input, int l float* output2_i = id_output + m1*m+m1; - complex_inverse(batch, rd_input, id_input, ld_input, stride_input, rd_output, id_output, ld_output, stride_output, m1, handle); - complex_inverse(batch, rd_input+m1*ld_input+m1, id_input+m1*ld_input+m1, ld_input, stride_input, output2_r, output2_i, ld_output, stride_output, m2, handle); + complex_inverse(batch, rd_input, id_input, ld_input, stride_input, rd_output, id_output, ld_output, stride_output, m1, handle, workspace); + complex_inverse(batch, rd_input+m1*ld_input+m1, id_input+m1*ld_input+m1, ld_input, stride_input, output2_r, output2_i, ld_output, stride_output, m2, handle, workspace); cnrtQueueSync(queue); - - - float *workspace = NULL; - CNRT_CHECK(cnrtMalloc((void **)&workspace, batch*sizeof(float)*2*(m2*m1))); + + float* cgemm_workspace = workspace + batch*2*(m2*m1); float* temp_r = workspace; float* temp_i = temp_r + batch*m2*m1; int temp_ld = m1; int temp_stride = m2*m1; - cgemm(batch, false,false,m2,m1,m1,1.0f,0.0f,rd_input+m1*ld_input,id_input+m1*ld_input,ld_input,stride_input,output1_r,output1_i,ld_output,stride_output,temp_r,temp_i,temp_ld,temp_stride,handle); + cgemm(batch, false,false,m2,m1,m1,1.0f,0.0f,rd_input+m1*ld_input,id_input+m1*ld_input,ld_input,stride_input,output1_r,output1_i,ld_output,stride_output,temp_r,temp_i,temp_ld,temp_stride,handle,cgemm_workspace); cnrtQueueSync(queue); - cgemm(batch, false,false,m2,m2,m1,-1.0f,0.0f,output2_r,output2_i,ld_output,stride_output,temp_r,temp_i,temp_ld,temp_stride,rd_output+m1*ld_output,id_output + 
m1*ld_output,ld_output,stride_output,handle); + cgemm(batch, false,false,m2,m2,m1,-1.0f,0.0f,output2_r,output2_i,ld_output,stride_output,temp_r,temp_i,temp_ld,temp_stride,rd_output+m1*ld_output,id_output + m1*ld_output,ld_output,stride_output,handle,cgemm_workspace); cnrtQueueSync(queue); - - - } - + } return MLUOP_STATUS_SUCCESS; } -mluOpStatus_t ctrsm(int batch, int stride, int m, int n, float* rd_a, float* id_a, int lda, float* rd_b, float* id_b, int ldb, mluOpHandle_t handle) +mluOpStatus_t ctrsm(int batch, int stride, int m, int n, float* rd_a, float* id_a, int lda, float* rd_b, float* id_b, int ldb, mluOpHandle_t handle, float* ctrsm_workspace) { if(n==0) return MLUOP_STATUS_SUCCESS; cnrtQueue_t queue; mluOpGetQueue(handle,&queue); - float* workspace; - CNRT_CHECK(cnrtMalloc((void **)&workspace, batch*m*m*2*sizeof(float))); - CNRT_CHECK(cnrtMemset(workspace, 0.0, batch*m*m*2*sizeof(float))); + float* workspace = ctrsm_workspace + batch*m*m*2; + CNRT_CHECK(cnrtMemset(ctrsm_workspace, 0.0, batch*m*m*2*sizeof(float))); float *r_inverse_result, *i_inverse_result; - r_inverse_result = workspace; + r_inverse_result = ctrsm_workspace; i_inverse_result = r_inverse_result + batch*m*m; - - complex_inverse(batch,rd_a,id_a,lda,stride,r_inverse_result,i_inverse_result,m,m*m,m,handle); + + complex_inverse(batch,rd_a,id_a,lda,stride,r_inverse_result,i_inverse_result,m,m*m,m,handle,workspace); cnrtQueueSync(queue); - - cgemm_real(batch,false,true,n,m,m,1.0,0.0f,rd_b,id_b,ldb,stride,r_inverse_result,i_inverse_result,m,m*m,rd_b,id_b,ldb,stride,handle); - - + cgemm_real(batch,false,true,n,m,m,1.0,0.0f,rd_b,id_b,ldb,stride,r_inverse_result,i_inverse_result,m,m*m,rd_b,id_b,ldb,stride,handle,workspace); + return MLUOP_STATUS_SUCCESS; } -mluOpStatus_t cherk(int batch, int stride, int n,int k, float* rd_a, float* id_a, int lda, float* rd_c, float* id_c, int ldc, mluOpHandle_t handle) +mluOpStatus_t cherk(int batch, int stride, int n,int k, float* rd_a, float* id_a, int lda, float* rd_c, float* id_c, int ldc, mluOpHandle_t handle,float* workspace) { if(k==0) return MLUOP_STATUS_SUCCESS; - cgemm(batch,false,true,n,n,k,-1.0f,1.0f,rd_a,id_a,lda,stride,rd_a,id_a,lda,stride,rd_c,id_c,ldc,stride,handle); + cgemm(batch,false,true,n,n,k,-1.0f,1.0f,rd_a,id_a,lda,stride,rd_a,id_a,lda,stride,rd_c,id_c,ldc,stride,handle,workspace); cnrtQueue_t queue; mluOpGetQueue(handle,&queue); cnrtQueueSync(queue); @@ -1034,7 +1011,7 @@ mluOpStatus_t cherk(int batch, int stride, int n,int k, float* rd_a, float* id_a } -mluOpStatus_t mlu_cpotrf_rectile(int batch, int stride, int n, int recnb, float* drA, float* diA, int lda, mluOpHandle_t handle) +mluOpStatus_t mlu_cpotrf_rectile(int batch, int stride, int n, int recnb, float* drA, float* diA, int lda, mluOpHandle_t handle, float* workspace) { cnrtQueue_t queue; mluOpGetQueue(handle,&queue); @@ -1046,10 +1023,10 @@ mluOpStatus_t mlu_cpotrf_rectile(int batch, int stride, int n, int recnb, float* { int n1 = n/2; int n2 = n-n1; - mlu_cpotrf_rectile(batch,stride,n1,recnb,drA,diA,lda,handle); - ctrsm(batch,stride,n1,n2,drA,diA,lda,drA+n1*lda,diA+n1*lda,lda,handle); - cherk(batch,stride,n2,n1,drA+n1*lda,diA+n1*lda,lda,drA+n1*lda+n1,diA+n1*lda+n1,lda,handle); - mlu_cpotrf_rectile(batch,stride,n2,recnb,drA+n1*lda+n1,diA+n1*lda+n1,lda,handle); + mlu_cpotrf_rectile(batch,stride,n1,recnb,drA,diA,lda,handle,workspace); + ctrsm(batch,stride,n1,n2,drA,diA,lda,drA+n1*lda,diA+n1*lda,lda,handle,workspace); + 
cherk(batch,stride,n2,n1,drA+n1*lda,diA+n1*lda,lda,drA+n1*lda+n1,diA+n1*lda+n1,lda,handle,workspace); + mlu_cpotrf_rectile(batch,stride,n2,recnb,drA+n1*lda+n1,diA+n1*lda+n1,lda,handle,workspace); } return MLUOP_STATUS_SUCCESS; diff --git a/mlu_op.h b/mlu_op.h index b77ea231b..b5a6c89bb 100644 --- a/mlu_op.h +++ b/mlu_op.h @@ -14155,6 +14155,10 @@ mluOpStatus_t MLUOP_WIN_API mluOpGetCholeskyWorkspace(mluOpTensorDescriptor_t input_desc, size_t* size, float** workspace); +mluOpStatus_t MLUOP_WIN_API +mluOpFreeCholeskyWorkspace(float** workspace); + + #if defined(__cplusplus) } diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/cholesky.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/cholesky.cpp index ed2566648..41be2bd81 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/cholesky.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/cholesky.cpp @@ -22,6 +22,7 @@ *************************************************************************/ #include #include "cholesky.h" +// #include "kernels/kernel_wrapper/export_statement.h" namespace mluoptest { @@ -129,16 +130,21 @@ void trans_mul(float*A, float*C, int lda,bool upper_, bool trans_, int n_, int l { A[i+j*lda] = 0.0; if(j == i && diag_add) + { A[j+i*lda] = 1.0; + } + - } - + } else if(type_ == MLUOP_DTYPE_COMPLEX_FLOAT && ((upper_==false && j >= i) || (upper_==true && j >= i))) { A[j*lda*2+i*2] = 0.0; A[j*lda*2+i*2+1] = 0.0; + if(j == i&& diag_add) + { A[j*lda*2+i*2] = 1.0; + } } for(long int k = 0; k <=i; k++) { @@ -149,7 +155,10 @@ void trans_mul(float*A, float*C, int lda,bool upper_, bool trans_, int n_, int l else { if(type_ == MLUOP_DTYPE_FLOAT) + { + A[i+j*lda] += (C[k+i*lda]*C[k+j*lda]); + } else { A[(i+j*lda)*2] += (C[(k+i*lda)*2]*C[(k+j*lda)*2]+C[(k+i*lda)*2+1]*C[(k+j*lda)*2+1]); @@ -179,24 +188,28 @@ void trans_mul(float*A, float*C, int lda,bool upper_, bool trans_, int n_, int l continue; else { - A[(i+j*lda)*2] += (C[(k+i*lda)*2]*C[(k+j*lda)*2]+C[(k+i*lda)*2+1]*C[(k+j*lda)*2+1]); - A[(i+j*lda)*2+1] += (C[(k+i*lda)*2]*C[(k+j*lda)*2+1]-C[(k+i*lda)*2+1]*C[(k+j*lda)*2]); - } - if(type_ != MLUOP_DTYPE_FLOAT && j != i) - { - A[(j+i*lda)*2] = A[(i+j*lda)*2]; - A[(j+i*lda)*2+1] = -A[(i+j*lda)*2+1]; + + A[(i+j*lda)*2] += (C[(k*lda+i)*2]*C[(k*lda+j)*2]+C[(k*lda+i)*2+1]*C[(k*lda+j)*2+1]); + A[(i+j*lda)*2+1] += (-C[(k*lda+i)*2]*C[(k*lda+j)*2+1]+C[(k*lda+i)*2+1]*C[(k*lda+j)*2]); } + } - } } - if(type_ != MLUOP_DTYPE_FLOAT &&((upper_==false && j > i) || (upper_==true && j > i))) + if(((upper_) || (upper_==true && j > i))) { - A[(j+i*lda)*2] = A[(i+j*lda)*2]; - A[(j+i*lda)*2+1] = -A[(i+j*lda)*2+1]; + if(type_ != MLUOP_DTYPE_FLOAT) + { + A[(j+i*lda)*2] = A[(i+j*lda)*2]; + A[(j+i*lda)*2+1] = -A[(i+j*lda)*2+1]; + } + else + { + A[(j+i*lda)] = A[(i+j*lda)]; + } + } } } @@ -304,7 +317,7 @@ void print_matrix(int batch, float*A, int lda, bool trans_, int n_, int ldda_, m for(int j = 0; j 0) + { + std::memcpy(temp_dst,temp_src,transfer_remain); + } +} + +void mlu_transfer_data(float* dst, float* src, unsigned long data_size,cnrtMemTransDir_t dir) { + unsigned long size_block = 1024*1024*1024; + unsigned long transfer_num = data_size / size_block; + unsigned long transfer_remain = data_size % size_block; + float* temp_dst= dst, *temp_src = src; + for(unsigned long i = 0; i < transfer_num; i++) + { + GTEST_CHECK(CNRT_RET_SUCCESS == + cnrtMemcpy(temp_dst, temp_src, size_block, dir)); + temp_dst += (size_block/4); + temp_src += (size_block/4); + } + if(transfer_remain > 0) + { + GTEST_CHECK(CNRT_RET_SUCCESS == + cnrtMemcpy(temp_dst, temp_src, 
transfer_remain, dir)); + } +} + +void CholeskyExecutor::prepareComputeParam() +{ +//cpu端把矩阵的一半设置成0 +//然后转置乘法,结果存到cpu端的另一个矩阵 +//然后传给gpu端 printf("start prepare compute parameter.\n"); int long_int_size = sizeof(long int); int int_size = sizeof(int); @@ -384,7 +438,7 @@ void CholeskyExecutor::prepareComputeParam() stride_ = (input_desc_->strides)[dim-1]; ldda_ = input_desc_->dims[2]; printf("batch_size:%ld,n:%d,lda:%d,stride:%d,upper:%d,trans:%d\n",batch_size_,n_,ldda_,stride_,upper_,trans_); - + int size = input_desc_->dims[1]; printf("size:%d, dim:%d, \n",size,dim); @@ -396,16 +450,38 @@ void CholeskyExecutor::prepareComputeParam() printf("\n"); printf("data vector length : %ld\n",data_vector_.size()); } + unsigned long total_size = batch_size_ * n_ * ldda_ * type_size_; +// unsigned long size_2g = 1024*1024*1024-1+1024*1024*1024; +// unsigned long size_2g = 1024*1024*10-1; +// int transfer_num = total_size / size_2g; + +// int transfer_remain = total_size % size_2g; +// printf("total size:%ld, transfer_num:%d, transfer_remain:%d\n",total_size,transfer_num,transfer_remain); - if(batch_size_ > 16 && n_ > 2000) - { - std::memcpy(dev_c,dev_a,16*type_size_*n_*ldda_); - std::memcpy(dev_c+16*type_size_/4*n_*ldda_,dev_a+16*type_size_/4*n_*ldda_,(batch_size_-16)*type_size_*n_*ldda_); - } - else - { - std::memcpy(dev_c,dev_a,batch_size_*type_size_*n_*ldda_); - } +// printf("matrix random:\n"); +// print_matrix(batch_size_, dev_a,ldda_,trans_,n_,ldda_,type_); +// print_matrix(batch_size_, base_line_out,ldda_,trans_,n_,ldda_,type_); + +// for(unsigned long i = 0; i < transfer_num; i++) +// { +// std::memcpy(dev_c+(i*size_2g),dev_a+(i*size_2g),size_2g); +// } +// printf("ddd\n"); +// if(transfer_remain > 0) +// { +// std::memcpy(dev_c+(transfer_num*size_2g),dev_a+(transfer_num*size_2g),transfer_remain); +// } +// printf("lll\n"); + cpu_transfer_data(dev_c,dev_a,total_size); +// if(batch_size_ > 16 && n_ > 2000) +// { +// std::memcpy(dev_c,dev_a,16*type_size_*n_*ldda_); +// std::memcpy(dev_c+16*type_size_/4*n_*ldda_,dev_a+16*type_size_/4*n_*ldda_,(batch_size_-16)*type_size_*n_*ldda_); +// } +// else +// { +// std::memcpy(dev_c,dev_a,batch_size_*type_size_*n_*ldda_); +// } if(parser_->device() == CPU) { for(long int i = 0; i < batch_size_;i++) @@ -415,6 +491,7 @@ void CholeskyExecutor::prepareComputeParam() else set_matrix_zero(dev_c+i*n_*ldda_*2,false,trans_,n_,ldda_,type_); } +// set_matrix_zero((float*)dev_c,upper_,trans_,n_,ldda_,type_); for(long int i = 0; i < batch_size_;i++) { if(type_ == MLUOP_DTYPE_FLOAT) @@ -433,40 +510,27 @@ void CholeskyExecutor::prepareComputeParam() - if(batch_size_>16) - { - GTEST_CHECK(CNRT_RET_SUCCESS == - cnrtMemcpy(dev_d, dev_a, type_size_*n_*ldda_*16, CNRT_MEM_TRANS_DIR_HOST2DEV)); - GTEST_CHECK(CNRT_RET_SUCCESS == - cnrtMemcpy(dev_d+16*type_size_/4*n_*ldda_, dev_a+16*type_size_/4*n_*ldda_, type_size_*n_*ldda_*(batch_size_-16), CNRT_MEM_TRANS_DIR_HOST2DEV)); - } - else - { - GTEST_CHECK(CNRT_RET_SUCCESS == - cnrtMemcpy(dev_d, dev_a, type_size_*n_*ldda_*batch_size_, CNRT_MEM_TRANS_DIR_HOST2DEV)); - } - +// printf("matrix A:\n"); +// print_matrix(batch_size_,dev_a,ldda_,trans_,n_,ldda_,type_); +// printf("matrix C:\n"); +// print_matrix(batch_size_,dev_c,ldda_,trans_,n_,ldda_,type_); + mlu_transfer_data(dev_d,dev_a,total_size,CNRT_MEM_TRANS_DIR_HOST2DEV); + + if(parser_->device() == CPU) { float* cpu_a = cpu_fp32_input_[0]; - if(batch_size_ > 16 && n_ > 2000) - { - std::memcpy(cpu_a,dev_a,16*type_size_*n_*ldda_); - 
std::memcpy(cpu_a+16*type_size_/4*n_*ldda_,dev_a+16*type_size_/4*n_*ldda_,(batch_size_-16)*type_size_*n_*ldda_); - } - else - { - std::memcpy(cpu_a,dev_a,batch_size_*type_size_*n_*ldda_); - } + cpu_transfer_data(cpu_a,dev_a,total_size); + } - printf("end prepare compute.\n"); } void CholeskyExecutor::compute() { +// prepareComputeParam(); VLOG(4) <<" CholeskyExecutor compute "; auto input_desc_ = tensor_desc_[0].tensor; @@ -475,48 +539,26 @@ void CholeskyExecutor::compute() { auto h_output = (float*)(data_vector_[1].host_ptr); auto d_intput = (float*)(data_vector_[0].device_ptr); auto d_output = (float*)(data_vector_[1].device_ptr); - if(batch_size_>16) - { - std::memcpy(h_input,h_output,type_size_*n_*ldda_*16); - std::memcpy(h_input+type_size_/4*n_*ldda_*16,h_output+type_size_/4*n_*ldda_*16,type_size_*n_*ldda_*(batch_size_-16)); - } - else - { - std::memcpy(h_input,h_output,type_size_*n_*ldda_*batch_size_); - } - if(batch_size_>16) - { - GTEST_CHECK(CNRT_RET_SUCCESS == - cnrtMemcpy(h_output, d_intput, type_size_*n_*ldda_*16, CNRT_MEM_TRANS_DIR_DEV2HOST)); - GTEST_CHECK(CNRT_RET_SUCCESS == - cnrtMemcpy(h_output+16*type_size_/4*n_*ldda_, d_intput+16*type_size_/4*n_*ldda_, type_size_*n_*ldda_*(batch_size_-16), CNRT_MEM_TRANS_DIR_DEV2HOST)); - } - else - { - GTEST_CHECK(CNRT_RET_SUCCESS == - cnrtMemcpy(h_output, d_intput, type_size_*n_*ldda_*batch_size_, CNRT_MEM_TRANS_DIR_DEV2HOST)); - } + unsigned long total_size = batch_size_ * n_ * ldda_ * type_size_; + cpu_transfer_data(h_input,h_output,total_size); + + mlu_transfer_data(h_output,d_intput,total_size,CNRT_MEM_TRANS_DIR_DEV2HOST); + +// printf("mlu before cholesky result:\n"); +// print_matrix(batch_size_,h_output,ldda_,trans_,n_,ldda_,type_); interface_timer_.start(); float* workspace = nullptr; size_t size = 0; - mluOpGetCholeskyWorkspace(input_desc_,&size,&workspace); + MLUOP_CHECK(mluOpGetCholeskyWorkspace(input_desc_,&size,&workspace)); MLUOP_CHECK(mluOpCholesky(handle_,input_desc_,d_intput, output_desc_, d_output, upper_,workspace)); +MLUOP_CHECK(mluOpFreeCholeskyWorkspace(&workspace)); + interface_timer_.stop(); - if(batch_size_>16) - { - GTEST_CHECK(CNRT_RET_SUCCESS == - cnrtMemcpy(h_output, d_output, 16*type_size_*n_*ldda_, CNRT_MEM_TRANS_DIR_DEV2HOST)); - GTEST_CHECK(CNRT_RET_SUCCESS == - cnrtMemcpy(h_output+16*type_size_/4*n_*ldda_, d_output+16*type_size_/4*n_*ldda_, (batch_size_-16)*type_size_*n_*ldda_, CNRT_MEM_TRANS_DIR_DEV2HOST)); - } - else - { - GTEST_CHECK(CNRT_RET_SUCCESS == - cnrtMemcpy(h_output, d_output, batch_size_*type_size_*n_*ldda_, CNRT_MEM_TRANS_DIR_DEV2HOST)); - } + + mlu_transfer_data(h_output,d_output,total_size,CNRT_MEM_TRANS_DIR_DEV2HOST); if(parser_->device() != CPU ) { @@ -524,11 +566,18 @@ MLUOP_CHECK(mluOpCholesky(handle_,input_desc_,d_intput, output_desc_, d_output, { for(int i = 0; i < batch_size_;i++) { - if(type_ == MLUOP_DTYPE_FLOAT) - trans_mul(h_input+i*n_*ldda_,h_output+i*n_*ldda_,ldda_,upper_,trans_,n_,ldda_,type_,false); - else - trans_mul(h_input+i*n_*ldda_*2,h_output+i*n_*ldda_*2,ldda_,upper_,trans_,n_,ldda_,type_,false); + if(type_ == MLUOP_DTYPE_FLOAT) + { + trans_mul(h_input+i*n_*ldda_,h_output+i*n_*ldda_,ldda_,upper_,trans_,n_,ldda_,type_,false); + } + else + { + trans_mul(h_input+i*n_*ldda_*2,h_output+i*n_*ldda_*2,ldda_,upper_,trans_,n_,ldda_,type_,false); + + } } + h_output = h_input; + fill_zero(h_output,upper_,batch_size_,n_,ldda_,type_,true); } else { @@ -539,26 +588,27 @@ MLUOP_CHECK(mluOpCholesky(handle_,input_desc_,d_intput, output_desc_, d_output, { 
set_diag_imag_one(h_output,batch_size_,n_,ldda_); } - if(batch_size_>16) - { - GTEST_CHECK(CNRT_RET_SUCCESS == - cnrtMemcpy(d_output, h_output, 16*type_size_*n_*ldda_, CNRT_MEM_TRANS_DIR_HOST2DEV)); - GTEST_CHECK(CNRT_RET_SUCCESS == - cnrtMemcpy(d_output+16*type_size_/4*n_*ldda_, h_output+16*type_size_/4*n_*ldda_, (batch_size_-16)*type_size_*n_*ldda_, CNRT_MEM_TRANS_DIR_HOST2DEV)); - } - else - { - GTEST_CHECK(CNRT_RET_SUCCESS == - cnrtMemcpy(d_output, h_output, batch_size_*type_size_*n_*ldda_, CNRT_MEM_TRANS_DIR_HOST2DEV)); - } + + + mlu_transfer_data(d_output,h_output,total_size,CNRT_MEM_TRANS_DIR_HOST2DEV); + + + } + +// printf("mlu after cholesky result:\n"); +// print_matrix(batch_size_,h_output,ldda_,trans_,n_,ldda_,type_); + + + return; } void cpu_compute(float* cpu_c, int n_, int ldda_, bool upper_, bool trans_, mluOpDataType_t type_) { + if(trans_) { for(long int i = 0; i < n_; i++) @@ -639,7 +689,7 @@ void cpu_compute(float* cpu_c, int n_, int ldda_, bool upper_, bool trans_, mluO } else { - + for(long int j = i+1;j16) - { - std::memcpy(cpu_c,cpu_a,type_size_*n_*ldda_*16); - std::memcpy(cpu_c+type_size_/4*n_*ldda_*16,cpu_a+type_size_/4*n_*ldda_*16,type_size_*n_*ldda_*(batch_size_-16)); - } - else - { - std::memcpy(cpu_c,cpu_a,type_size_*n_*ldda_*batch_size_); - } + unsigned long total_size = batch_size_ * n_ * ldda_ * type_size_; + unsigned long size_2g = 1024*1024*1024-1+1024*1024*1024; + int transfer_num = total_size / size_2g; + int transfer_remain = total_size % size_2g; + + + cpu_transfer_data(cpu_c,cpu_a,total_size); + + + auto h_output = (float*)(data_vector_[1].host_ptr); auto h_input = (float*)(data_vector_[0].host_ptr); - printf("cpu before cholesky result:\n"); + // printf("cpu before cholesky result:\n"); + // print_matrix(batch_size_,cpu_c,ldda_,trans_,n_,ldda_,type_); if(result_mul) { for(int i = 0; i < batch_size_;i++) { if(type_ == MLUOP_DTYPE_FLOAT) + { trans_mul(h_input+i*n_*ldda_,h_output+i*n_*ldda_,ldda_,upper_,trans_,n_,ldda_,type_,false); + } else trans_mul(h_input+i*n_*ldda_*2,h_output+i*n_*ldda_*2,ldda_,upper_,trans_,n_,ldda_,type_,false); + } - if(batch_size_>16) - { - std::memcpy(h_output,h_input,type_size_*n_*ldda_*16); - std::memcpy(h_output+type_size_/4*n_*ldda_*16,h_input+type_size_/4*n_*ldda_*16,type_size_*n_*ldda_*(batch_size_-16)); - } - else - { - std::memcpy(h_output,h_input,type_size_*n_*ldda_*batch_size_); - } + + cpu_transfer_data(h_output,h_input,total_size); + + fill_zero(h_output,upper_,batch_size_,n_,ldda_,type_,true); } else { for(long int i = 0; i < batch_size_;i++) { + cpu_compute(cpu_c+i*n_*ldda_*type_size_/4, n_, ldda_, upper_, trans_, type_); } fill_zero(cpu_c,upper_,batch_size_,n_,ldda_,type_,false); fill_zero(h_output,upper_,batch_size_,n_,ldda_,type_,false); } + // print_matrix(batch_size_,h_input,ldda_,trans_,n_,ldda_,type_); + // printf("cpu cholesky result:\n"); + // print_matrix(batch_size_,cpu_c,ldda_,trans_,n_,ldda_,type_); + + // printf("mlu cholesky result:\n"); + // print_matrix(batch_size_,h_output,ldda_,trans_,n_,ldda_,type_); + + // printf("mlu after cholesky result1:\n"); + // print_matrix(batch_size_,h_output,ldda_,trans_,n_,ldda_,type_); return; } diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/cholesky.h b/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/cholesky.h index ec23fa895..9903b9e50 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/cholesky.h +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/cholesky.h @@ -29,7 +29,7 @@ class CholeskyExecutor : public Executor { size_t size_workspace_ = 0; int 
stride_ = 0; mluOpDataType_t type_ = MLUOP_DTYPE_FLOAT; - bool result_mul = false; + bool result_mul = true; int type_size_ = 4; bool trans_ = true; bool upper_ = false; From 1d3fce877b710350e5dcfd171963abdde310337d Mon Sep 17 00:00:00 2001 From: dglr Date: Fri, 26 Jul 2024 05:21:26 +0800 Subject: [PATCH 09/27] add pseudocode --- docs/design_docs/cholesky/cholesky.md | 164 +++++++++++++++++- .../pb_gtest/src/zoo/cholesky/cholesky.h | 2 +- 2 files changed, 161 insertions(+), 5 deletions(-) diff --git a/docs/design_docs/cholesky/cholesky.md b/docs/design_docs/cholesky/cholesky.md index 6d78bedec..3b61e4aa8 100644 --- a/docs/design_docs/cholesky/cholesky.md +++ b/docs/design_docs/cholesky/cholesky.md @@ -240,12 +240,168 @@ POTRF这个函数名取自LAPACK中Cholesky分解的函数,POTRF的目的是 每个列块,仍然需要先计算该列块的外部依赖(该列块左侧的所有列块),然后对列块中的每一列分别计算内部依赖,对于这两个部分可以分别用两个kernel来实现。由于这一步骤是严重的串行瓶颈,因此在划分小块时需要尽量让计算的快更小,减少串行瓶颈对性能的影响 -### 3.2 测试 +### 3.2 伪代码实现 -#### 3.2.1 测试样例构造 +整体计算流程伪代码如下: + + +``` +function cholesky(row, nb, d_output): + for j from 0 to row, incrementing by nb: + jb = min(nb, row - j) + + // Perform symmetric rank-k update + syrk(jb, j, OFFSET_ROW(d_output, j, 0), OFFSET_ROW(d_output, j, j)) + + + // Perform recursive Cholesky factorization + potrf_rectile(jb, recnb, OFFSET_ROW(d_output, j, j)) + + if j + jb < row: + // Update matrix using matrix multiplication + gemm(row - j - jb, jb, j, OFFSET_ROW(d_output, j + jb, 0), OFFSET_ROW(d_output, j, 0), OFFSET_ROW(d_output, j + jb, j)) + + + if j + jb < row: + // Solve triangular system + trsm(jb, row - j - jb, OFFSET_ROW(d_output, j, j), OFFSET_ROW(d_output, j + jb, j)) +``` + +其中potrf_rectile伪代码如下: + +``` +function potrf_rectile(n, recnb, d_A): + + if n == 0: + return SUCCESS + + if n <= recnb: + potf2_lpin(n, d_A) + else: + n1 = n / 2 + n2 = n - n1 + + potrf_rectile(n1, recnb, OFFSET_ROW(d_A, 0, 0)) + + trsm_rectile(n1, n2, OFFSET_ROW(d_A, 0, 0), OFFSET_ROW(d_A, n1, 0)) + + syrk(n2, n1, d_A + n1 * lda, OFFSET_ROW(d_A, n1, n1)) + + potrf_rectile(n2, recnb, OFFSET_ROW(d_A, n1, n1)) + + return SUCCESS + + +function potf2_lpin(n, dA): + for i from 0 to n, incrementing by width: + potf2_smlpout_device(m - i, OFFSET_ROW(dA, i), OFFSET_ROW(dA, i, i)) // call mlu kernel + + + +function potf2_smlpout_device(m, A0, A): + id = taskId % 4 + shared_data = allocate_shared_data() + sdata_A = shared_data + sdata_B = shared_data + m * POTF_NB / TASK_NUM * 4 + + // Perform matrix multiplication with fixed width + small_gemm(m, A0, sdata_A, sdata_B) + + sync_cluster() + + // Perform Cholesky factorization with fixed size + spotf2_sminout_device(m, sdata_A, POTF_NB) + + sync_cluster() + + span = POTF_NB + + if id == 0: + for i from 0 to span: + copy_memory(A + (i * lda), sdata_A + i * POTF_NB, i + 1, SRAM2LDRAM) + else if id * span < m: + copy_memory(A + (id * POTF_NB * lda), sdata_A + id * POTF_NB * POTF_NB, POTF_NB, SRAM2LDRAM, span - 1) + + sync_cluster() + +``` + +trsm的实现伪代码如下: + +``` +function trsm(jb, row - j - jb, dA): + inverse(dA) + gemm +``` + +trsm调用的inverse内会使用inverse_kernel来对输入矩阵进行求逆: + +``` +function inverse_kernel(d_input, ld_input, stride_input, d_output, ld_output, stride_output, m): + id = taskId + if id == 0: + copy_memory(sram_buffer, d_input, m) + + sync_cluster() + + span = m / taskDim + start = id * span + if id == 3: + span = m - 3 * span + + nram_offset = allocate_nram_buffer(id, m) + nram_src1 = nram_offset + nram_src2 = nram_src1 + m * m + mul_result = nram_src2 + m + nram_dst = nram_src2 + m * m + diag_start = calculate_diag_start(sram_buffer, start, m) + height 
= m - start + + clear_memory(nram_offset, 3 * m * m) + + for i from 0 to span: + offset = i * m + i + result = inverse_element(diag_start[offset]) + nram_src1[i * height + i] = result + nram_dst[i * span + i] = result + diag_start[offset] = result + + sync_cluster() + + for i from 1 to height: + copy_memory(nram_src2, diag_start + i * m, i) + num = min(i, span) + diag_element = diag_start[i * m + i] + + for j from 0 to num: + temp = perform_element_multiplication_and_sum(mul_result, nram_src2, nram_src1 + j * height, i) + temp = -1.0 * temp * diag_element + nram_dst[i * span + j] = temp + nram_src1[j * height + i] = temp + + sync() + + sync_cluster() + + if span > 0: + copy_memory(diag_start, nram_dst, span, height) + + sync_cluster() + + if id == 0: + copy_memory(d_output, sram_buffer, m, ld_output) + +``` + + + + +### 3.3 测试 + +#### 3.3.1 测试样例构造 测试用例覆盖多种类型。按照数据类型(float,complex float),矩阵维度(单batch、多batch),输出矩阵为上三角/下三角(即输入参数upper为True/False),是否将矩阵还原(是否将分解出的L和U矩阵相乘),可以得到16种类型,对每种类型分别测试,diff1,diff2,diff3_2结果均小于动态阈值。 -#### 3.2.2 性能测试 +#### 3.3.2 性能测试 float类型单batch性能测试如下,表格中数字为运行时间,单位为微秒(us),最右侧一列为mlu的运行时间与pytorch在gpu上的运行时间的比值: | 规模 | pytorch | mlu | mlu/pytorch | | ---- | ------- | ----- | ----------- | @@ -287,7 +443,7 @@ complex类型多batch性能测试: 图中红框中为调用底层的矩阵乘法,且由于没有复数类型矩阵乘法的底层实现,当前复数矩阵乘是由4个float类型矩阵乘拼接而成。可以看到矩阵乘法的时间占比总和已经达到了60%,矩阵乘法所占用时间超过了2000微秒,已经超过了pytorch运行时间的10倍。 -### 3.3 防呆检查 +### 3.4 防呆检查 算子中做了如下检查: * 所有指针不为NULL * 输入输出矩阵的维度为2或者3 diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/cholesky.h b/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/cholesky.h index 9903b9e50..ec23fa895 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/cholesky.h +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/cholesky.h @@ -29,7 +29,7 @@ class CholeskyExecutor : public Executor { size_t size_workspace_ = 0; int stride_ = 0; mluOpDataType_t type_ = MLUOP_DTYPE_FLOAT; - bool result_mul = true; + bool result_mul = false; int type_size_ = 4; bool trans_ = true; bool upper_ = false; From a432bacb71214585b2a603f6d856ea4354da1ce9 Mon Sep 17 00:00:00 2001 From: dglr Date: Fri, 26 Jul 2024 06:05:10 +0800 Subject: [PATCH 10/27] add comments --- kernels/cholesky/cholesky.cpp | 12 +++- mlu_op.h | 116 ++++++++++++++++++++++++++++++++++ 2 files changed, 126 insertions(+), 2 deletions(-) diff --git a/kernels/cholesky/cholesky.cpp b/kernels/cholesky/cholesky.cpp index 4e47a8977..41b5ddd01 100644 --- a/kernels/cholesky/cholesky.cpp +++ b/kernels/cholesky/cholesky.cpp @@ -1,5 +1,5 @@ #include "cholesky.h" - +// calculates the required workspace size for performing the Cholesky decomposition on a given matrix or batch of matrices. mluOpStatus_t MLUOP_WIN_API mluOpGetCholeskyWorkspace(mluOpTensorDescriptor_t input_desc, size_t* size, float** workspace) { PARAM_CHECK("mluOpCholesky", input_desc != NULL); @@ -50,6 +50,8 @@ mluOpStatus_t MLUOP_WIN_API mluOpGetCholeskyWorkspace(mluOpTensorDescriptor_t in return MLUOP_STATUS_SUCCESS; } +// releases the allocated workspace memory used for Cholesky decomposition calculations. +// It ensures that the workspace pointer is not only valid but also points to allocated memory before attempting to free it. 
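// A sketch of how these workspace helpers are meant to be chained around
// mluOpCholesky, assuming a valid handle, device buffers and a 2-D or 3-D
// tensor descriptor already exist; error handling is omitted and the wrapper
// name below is illustrative, not part of the library API.
static mluOpStatus_t cholesky_with_workspace(mluOpHandle_t handle,
                                             mluOpTensorDescriptor_t desc,
                                             float* d_input, float* d_output,
                                             bool upper) {
  size_t workspace_size = 0;
  float* workspace = NULL;
  // allocates device scratch memory sized for this descriptor
  mluOpGetCholeskyWorkspace(desc, &workspace_size, &workspace);
  mluOpStatus_t status =
      mluOpCholesky(handle, desc, d_input, desc, d_output, upper, workspace);
  // frees the scratch buffer and resets the pointer to NULL
  mluOpFreeCholeskyWorkspace(&workspace);
  return status;
}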
mluOpStatus_t MLUOP_WIN_API mluOpFreeCholeskyWorkspace(float** workspace) { PARAM_CHECK("mluOpCholesky", workspace != NULL); @@ -63,6 +65,8 @@ mluOpStatus_t MLUOP_WIN_API mluOpFreeCholeskyWorkspace(float** workspace) } +// performs the necessary operations to compute matrix transformations, +// potentially involving Cholesky decomposition or matrix transposition, depending on the input parameters. mluOpStatus_t MLUOP_WIN_API calculate_body(mluOpHandle_t handle,int batch_size, const mluOpTensorDescriptor_t input_desc,float* d_input, const mluOpTensorDescriptor_t output_desc, float* d_output,bool upper, float* workspace) { @@ -251,7 +255,11 @@ calculate_body(mluOpHandle_t handle,int batch_size, const mluOpTensorDescriptor_ return MLUOP_STATUS_SUCCESS; } - +// computes the Cholesky decomposition. +// This function is designed to handle both single and batch processing of matrices in either 2D or 3D formats. +// The function ensures that the input matrices are either float or complex float types and performs the decomposition +// either on the upper or lower triangular part of the matrix, based on the 'upper' boolean flag. +mluOpStatus_t MLUOP_WIN_API mluOpStatus_t MLUOP_WIN_API mluOpCholesky(mluOpHandle_t handle,const mluOpTensorDescriptor_t input_desc,float* d_input, const mluOpTensorDescriptor_t output_desc, float* d_output,bool upper, float* workspace) { diff --git a/mlu_op.h b/mlu_op.h index b5a6c89bb..72a79a19f 100644 --- a/mlu_op.h +++ b/mlu_op.h @@ -14144,6 +14144,49 @@ mluOpExecFFT(mluOpHandle_t handle, mluOpStatus_t MLUOP_WIN_API mluOpDestroyFFTPlan(mluOpFFTPlan_t fft_plan); +/*! + * @brief Computes the Cholesky decomposition of a symmetric positive-definite matrix using the input tensor descriptor \p input_desc and writes the result to the output tensor descriptor \p output_desc. + * + * @param[in] handle + * The handle to the MLU operation environment. + * @param[in] input_desc + * The descriptor for the input tensor. + * @param[in] d_input + * Pointer to the input data in device memory. + * @param[in] output_desc + * The descriptor for the output tensor. + * @param[out] d_output + * Pointer to the output data in device memory. + * @param[in] upper + * Boolean flag to indicate whether to compute the upper or lower triangular Cholesky factor. True for upper, False for lower. + * @param[in] workspace + * Pointer to workspace buffer in device memory used for intermediate computations. + * + * @par Return + * - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_EXECUTION_FAILED + * + * @par Data Type + * - None + * + * @par Data Layout + * - None + * + * @par Scale Limitation + * - None + * + * @par API Dependency + * - None + * + * @par Note + * - The function assumes the matrix is symmetric positive-definite. + * + * @par Example + * - None. + * + * @par Reference + * - None. + */ + mluOpStatus_t MLUOP_WIN_API mluOpCholesky(mluOpHandle_t handle, const mluOpTensorDescriptor_t input_desc, @@ -14151,10 +14194,83 @@ mluOpCholesky(mluOpHandle_t handle, const mluOpTensorDescriptor_t output_desc, float* d_output,bool upper, float* workspace); +/*! + * @brief Calculates the size of the workspace required for the Cholesky decomposition and initializes a workspace pointer. + * This function must be called before performing Cholesky decomposition using mluOpCholesky. + * + * @param[in] input_desc + * The descriptor for the input tensor for which the Cholesky decomposition will be performed. + * @param[out] size + * Pointer to a size_t variable where the size of the required workspace will be stored. 
+ * @param[out] workspace + * Double pointer to a float, used to allocate memory for the workspace. This pointer will be set to point to the allocated workspace. + * + * @par Return + * - ::MLUOP_STATUS_SUCCESS if the workspace size is successfully calculated and the workspace is successfully allocated, + * - ::MLUOP_STATUS_EXECUTION_FAILED if there are issues during the calculation or memory allocation. + * + * @par Data Type + * - None. + * + * @par Data Layout + * - None. + * + * @par Scale Limitation + * - None. + * + * @par API Dependency + * - None. + * + * @par Note + * - None. + * + * @par Example + * - None. + * + * @par Reference + * - None. + */ + + mluOpStatus_t MLUOP_WIN_API mluOpGetCholeskyWorkspace(mluOpTensorDescriptor_t input_desc, size_t* size, float** workspace); +/*! + * @brief Frees the memory allocated for the Cholesky decomposition workspace. + * This function should be called to release the workspace memory used by the Cholesky operations after they are no longer needed. + * + * @param[in,out] workspace + * Double pointer to the workspace memory that was allocated by mluOpGetCholeskyWorkspace or another allocation function. + * After calling this function, the pointer will be set to NULL to prevent accidental reuse. + * + * @par Return + * - ::MLUOP_STATUS_SUCCESS if the workspace is successfully freed, + * - ::MLUOP_STATUS_EXECUTION_FAILED if there is an error during the free operation, such as if the pointer is NULL. + * + * @par Data Type + * - None. + * + * @par Data Layout + * - None. + * + * @par Scale Limitation + * - None. + * + * @par API Dependency + * - None. + * + * @par Note + * - None + * + * @par Example + * - None. + * + * @par Reference + * - None. + */ + + mluOpStatus_t MLUOP_WIN_API mluOpFreeCholeskyWorkspace(float** workspace); From 23be9c4492ffcb026565b94d9aadac4232cf68eb Mon Sep 17 00:00:00 2001 From: dglr Date: Fri, 26 Jul 2024 06:33:56 +0800 Subject: [PATCH 11/27] add index.rst --- docs/user_guide/9_operators/index.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/user_guide/9_operators/index.rst b/docs/user_guide/9_operators/index.rst index bf9b37a28..e3a1e43b5 100755 --- a/docs/user_guide/9_operators/index.rst +++ b/docs/user_guide/9_operators/index.rst @@ -757,3 +757,10 @@ mluOpExecFFT - ``y`` 为输出信号。 - :math:`DFT_{N}` 为长度为N傅里叶变换的变换矩阵。 +.. cholesky:: + +mluOpCholesky +--------------- +执行 Cholesky 分解,将一个正定矩阵分解为其下三角矩阵(L)或其转置的上三角矩阵(U),具体分解为上三角或下三角取决于参数``upper``。 + +该算子包含7个输入:handle 为操作句柄,input_desc 与 d_input 分别描述并提供输入矩阵的信息;两个输出:output_desc 与 d_output 分别描述并存储输出矩阵的信息;此外,还包含一个布尔参数 upper,用于指定输出是上三角还是下三角矩阵,以及一个 workspace 用于临时存储计算过程中的数据。 From 16b4220fd4e41bb24b38f8b2f2f541060a4a19cf Mon Sep 17 00:00:00 2001 From: dglr Date: Fri, 26 Jul 2024 06:38:01 +0800 Subject: [PATCH 12/27] format code --- kernels/cholesky/cholesky.cpp | 602 ++-- kernels/cholesky/cholesky.h | 62 +- kernels/cholesky/cholesky_union1.mlu | 2434 ++++++++--------- kernels/cholesky/complex_cholesky_union1.mlu | 1667 ++++++----- .../pb_gtest/src/zoo/cholesky/cholesky.cpp | 1131 ++++---- .../pb_gtest/src/zoo/cholesky/cholesky.h | 3 +- 6 files changed, 2723 insertions(+), 3176 deletions(-) diff --git a/kernels/cholesky/cholesky.cpp b/kernels/cholesky/cholesky.cpp index 41b5ddd01..a4994578c 100644 --- a/kernels/cholesky/cholesky.cpp +++ b/kernels/cholesky/cholesky.cpp @@ -1,329 +1,317 @@ #include "cholesky.h" -// calculates the required workspace size for performing the Cholesky decomposition on a given matrix or batch of matrices. 
-mluOpStatus_t MLUOP_WIN_API mluOpGetCholeskyWorkspace(mluOpTensorDescriptor_t input_desc, size_t* size, float** workspace) -{ - PARAM_CHECK("mluOpCholesky", input_desc != NULL); - - - PARAM_CHECK("mluOpCholesky", input_desc->dim == 2||input_desc->dim == 3); - PARAM_CHECK("mluOpCholesky", input_desc->dims[0] > 0); - PARAM_CHECK("mluOpCholesky", input_desc->dims[1] > 0); - - if(input_desc->dim == 3) - { - PARAM_CHECK("mluOpCholesky", input_desc->dims[2] > 0); - } - - mluOpDataType_t dtype = input_desc->dtype; - PARAM_CHECK("mluOpCholesky", dtype == MLUOP_DTYPE_FLOAT || dtype == MLUOP_DTYPE_COMPLEX_FLOAT); - - unsigned long type_size; - MLUOP_CHECK(mluOpGetSizeOfDataType(dtype, &type_size)); - long int size_a = 0, lda = 0, size_c = 0, ldc = 0; - long int batch_size = 1; - int dim = input_desc->dim; - if(dim == 2) - { - size_a = input_desc->dims[0]; - } - else if(dim == 3) - { - batch_size = input_desc->dims[0]; - size_a = input_desc->dims[1]; - } - - if (dtype == MLUOP_DTYPE_FLOAT) - { - *size = size_a*size_a*sizeof(float)*batch_size*3; - } - else - { - *size = size_a*size_a*sizeof(float)*2*batch_size*3; - - } - printf("workspace size:%ul\n",(int)(*size)); - if(*size>0) - { - CHECK_RETURN("mluOpCholesky", - workspace_malloc(*size, workspace)); - } - return MLUOP_STATUS_SUCCESS; +// calculates the required workspace size for performing the Cholesky +// decomposition on a given matrix or batch of matrices. +mluOpStatus_t MLUOP_WIN_API mluOpGetCholeskyWorkspace( + mluOpTensorDescriptor_t input_desc, size_t* size, float** workspace) { + PARAM_CHECK("mluOpCholesky", input_desc != NULL); + + PARAM_CHECK("mluOpCholesky", input_desc->dim == 2 || input_desc->dim == 3); + PARAM_CHECK("mluOpCholesky", input_desc->dims[0] > 0); + PARAM_CHECK("mluOpCholesky", input_desc->dims[1] > 0); + + if (input_desc->dim == 3) { + PARAM_CHECK("mluOpCholesky", input_desc->dims[2] > 0); + } + + mluOpDataType_t dtype = input_desc->dtype; + PARAM_CHECK("mluOpCholesky", + dtype == MLUOP_DTYPE_FLOAT || dtype == MLUOP_DTYPE_COMPLEX_FLOAT); + + unsigned long type_size; + MLUOP_CHECK(mluOpGetSizeOfDataType(dtype, &type_size)); + long int size_a = 0, lda = 0, size_c = 0, ldc = 0; + long int batch_size = 1; + int dim = input_desc->dim; + if (dim == 2) { + size_a = input_desc->dims[0]; + } else if (dim == 3) { + batch_size = input_desc->dims[0]; + size_a = input_desc->dims[1]; + } + + if (dtype == MLUOP_DTYPE_FLOAT) { + *size = size_a * size_a * sizeof(float) * batch_size * 3; + } else { + *size = size_a * size_a * sizeof(float) * 2 * batch_size * 3; + } + printf("workspace size:%ul\n", (int)(*size)); + if (*size > 0) { + CHECK_RETURN("mluOpCholesky", workspace_malloc(*size, workspace)); + } + return MLUOP_STATUS_SUCCESS; } -// releases the allocated workspace memory used for Cholesky decomposition calculations. -// It ensures that the workspace pointer is not only valid but also points to allocated memory before attempting to free it. -mluOpStatus_t MLUOP_WIN_API mluOpFreeCholeskyWorkspace(float** workspace) -{ - PARAM_CHECK("mluOpCholesky", workspace != NULL); - if(*workspace != NULL) - { - CHECK_RETURN("mluOpCholesky", - workspace_free(workspace)); - *workspace = NULL; - } - return MLUOP_STATUS_SUCCESS; - +// releases the allocated workspace memory used for Cholesky decomposition +// calculations. It ensures that the workspace pointer is not only valid but +// also points to allocated memory before attempting to free it. 
+mluOpStatus_t MLUOP_WIN_API mluOpFreeCholeskyWorkspace(float** workspace) { + PARAM_CHECK("mluOpCholesky", workspace != NULL); + if (*workspace != NULL) { + CHECK_RETURN("mluOpCholesky", workspace_free(workspace)); + *workspace = NULL; + } + return MLUOP_STATUS_SUCCESS; } -// performs the necessary operations to compute matrix transformations, -// potentially involving Cholesky decomposition or matrix transposition, depending on the input parameters. -mluOpStatus_t MLUOP_WIN_API -calculate_body(mluOpHandle_t handle,int batch_size, const mluOpTensorDescriptor_t input_desc,float* d_input, const mluOpTensorDescriptor_t output_desc, float* d_output,bool upper, float* workspace) -{ - mluOpDataType_t dtype = input_desc->dtype; - - - int recnb = REC_NB; - int gbstep = 0; - int dim = input_desc->dim; - bool is_row_major = (input_desc->strides)[dim-1]==1; - - - unsigned long type_size; - MLUOP_CHECK(mluOpGetSizeOfDataType(dtype, &type_size)); - int size_a = 0, lda = 0, size_c = 0, ldc = 0; - if(dim == 2) - { - size_a = input_desc->dims[0]; - lda = input_desc->dims[1]; - size_c = output_desc->dims[0]; - ldc = output_desc->dims[1]; - } - else if(dim == 3) - { - size_a = input_desc->dims[1]; - lda = input_desc->dims[2]; - size_c = output_desc->dims[1]; - ldc = output_desc->dims[2]; - } - - PARAM_CHECK("mluOpCholesky", lda >= size_a); - PARAM_CHECK("mluOpCholesky", ldc >= size_c); - - cnrtQueue_t queue; - mluOpGetQueue(handle,&queue); - - - int jb; - const float s_one = 1.0; - const float s_neg_one = -1.0; - - if(dtype == MLUOP_DTYPE_FLOAT) - { - if(upper == true) - { - CHECK_RETURN("mluOpCholesky", - transpose(batch_size,size_a,size_a,d_input,d_output,handle,dtype,workspace)); - } - else - { - CNRT_CHECK(cnrtMemcpy(d_output, d_input, type_size*size_a*lda*((unsigned long)batch_size), CNRT_MEM_TRANS_DIR_DEV2DEV)); - } +// performs the necessary operations to compute matrix transformations, +// potentially involving Cholesky decomposition or matrix transposition, +// depending on the input parameters. 
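// For reference, the same left-looking blocked update order that this routine
// drives on the MLU through ssyrk, mlu_spotrf_rectile, sgemm and strsm, here
// as a simplified single-matrix CPU sketch (assumed: one row-major n x n
// positive-definite float matrix, lower factor, sqrtf available). Illustration
// only, not part of this file.
static void blocked_cholesky_lower_ref(float* a, int n, int lda, int nb) {
  for (int j = 0; j < n; j += nb) {
    const int jb = (n - j < nb) ? (n - j) : nb;
    // syrk: subtract the already factored panel A[j:, 0:j] from the diagonal block
    for (int r = j; r < j + jb; ++r)
      for (int c = j; c <= r; ++c)
        for (int k = 0; k < j; ++k)
          a[r * lda + c] -= a[r * lda + k] * a[c * lda + k];
    // potrf: factor the jb x jb diagonal block in place
    for (int r = j; r < j + jb; ++r) {
      for (int c = j; c < r; ++c) {
        for (int k = j; k < c; ++k)
          a[r * lda + c] -= a[r * lda + k] * a[c * lda + k];
        a[r * lda + c] /= a[c * lda + c];
      }
      float diag = a[r * lda + r];
      for (int k = j; k < r; ++k) diag -= a[r * lda + k] * a[r * lda + k];
      a[r * lda + r] = sqrtf(diag);
    }
    // gemm + trsm: update the rows below the panel and solve them against
    // the transposed diagonal block
    for (int r = j + jb; r < n; ++r) {
      for (int c = j; c < j + jb; ++c) {
        for (int k = 0; k < c; ++k)
          a[r * lda + c] -= a[r * lda + k] * a[c * lda + k];
        a[r * lda + c] /= a[c * lda + c];
      }
    }
  }
}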
+mluOpStatus_t MLUOP_WIN_API +calculate_body(mluOpHandle_t handle, int batch_size, + const mluOpTensorDescriptor_t input_desc, float* d_input, + const mluOpTensorDescriptor_t output_desc, float* d_output, + bool upper, float* workspace) { + mluOpDataType_t dtype = input_desc->dtype; + + int recnb = REC_NB; + int gbstep = 0; + int dim = input_desc->dim; + bool is_row_major = (input_desc->strides)[dim - 1] == 1; + + unsigned long type_size; + MLUOP_CHECK(mluOpGetSizeOfDataType(dtype, &type_size)); + int size_a = 0, lda = 0, size_c = 0, ldc = 0; + if (dim == 2) { + size_a = input_desc->dims[0]; + lda = input_desc->dims[1]; + size_c = output_desc->dims[0]; + ldc = output_desc->dims[1]; + } else if (dim == 3) { + size_a = input_desc->dims[1]; + lda = input_desc->dims[2]; + size_c = output_desc->dims[1]; + ldc = output_desc->dims[2]; + } + + PARAM_CHECK("mluOpCholesky", lda >= size_a); + PARAM_CHECK("mluOpCholesky", ldc >= size_c); + + cnrtQueue_t queue; + mluOpGetQueue(handle, &queue); + + int jb; + const float s_one = 1.0; + const float s_neg_one = -1.0; + + if (dtype == MLUOP_DTYPE_FLOAT) { + if (upper == true) { + CHECK_RETURN("mluOpCholesky", + transpose(batch_size, size_a, size_a, d_input, d_output, + handle, dtype, workspace)); + } else { + CNRT_CHECK( + cnrtMemcpy(d_output, d_input, + type_size * size_a * lda * ((unsigned long)batch_size), + CNRT_MEM_TRANS_DIR_DEV2DEV)); } - else - { - - CHECK_RETURN("mluOpCholesky", - transpose(batch_size,size_a*size_a,2,d_input,d_output,handle,MLUOP_DTYPE_FLOAT,workspace)); - } - + } else { + CHECK_RETURN("mluOpCholesky", + transpose(batch_size, size_a * size_a, 2, d_input, d_output, + handle, MLUOP_DTYPE_FLOAT, workspace)); + } + + cnrtQueueSync(queue); + int stride = size_a * lda; + + if (dtype == MLUOP_DTYPE_FLOAT) { + int row = is_row_major ? lda : size_a; + int nb = NB; + set_half_zero(batch_size, stride, d_output, lda, lda, handle); cnrtQueueSync(queue); - int stride = size_a*lda; - - - - - if(dtype == MLUOP_DTYPE_FLOAT) - { - - int row = is_row_major ? 
lda : size_a; - int nb = NB; - set_half_zero(batch_size, stride, d_output, lda, lda, handle); - cnrtQueueSync(queue); - for(int j = 0; j < row; j+=nb) - { - jb = std::min(nb, row-j); - CHECK_RETURN("mluOpCholesky", - ssyrk(batch_size,stride,false,is_row_major,jb,j,OFFSET_ROW(d_output,j,0),lda,OFFSET_ROW(d_output,j,j),lda,handle,workspace)); - cnrtQueueSync(queue); - CHECK_RETURN("mluOpCholesky", - mlu_spotrf_rectile(batch_size,stride,is_row_major,false,jb,recnb,OFFSET_ROW(d_output,j,j),lda,j, handle,workspace)); - if(j+jb < row) - { - CHECK_RETURN("mluOpCholesky", - sgemm(batch_size, !is_row_major,is_row_major,row-j-jb,jb,j,-1.0f,1.0f, - OFFSET_ROW(d_output,j+jb,0),lda,stride, - OFFSET_ROW(d_output,j,0),lda,stride, - OFFSET_ROW(d_output,j+jb,j),lda,stride, handle,workspace)); - cnrtQueueSync(queue); - } - if(j+jb < row) - { - CHECK_RETURN("mluOpCholesky", - strsm(batch_size, stride,false,is_row_major,jb,row-j-jb,OFFSET_ROW(d_output,j,j),lda,OFFSET_ROW(d_output,j+jb,j),lda, handle, workspace)); - cnrtQueueSync(queue); - } - } - - if(upper) - { - cnrtQueueSync(queue); - CHECK_RETURN("mluOpCholesky", - transpose(batch_size, size_a,size_a,d_output,workspace,handle,dtype,workspace)); - cnrtQueueSync(queue); - CNRT_CHECK(cnrtMemcpy(d_output, workspace, type_size*size_a*lda*((unsigned long)batch_size), CNRT_MEM_TRANS_DIR_DEV2DEV)); - } - } - else - { - recnb = CREC_NB; - int nb = CNB; - int row = lda; - float* r_start = d_output; - float* i_start = d_output + size_a*lda; - stride *= 2; - - set_half_zero(batch_size, stride, r_start, lda, lda, handle); - set_half_zero(batch_size, stride, i_start, lda, lda, handle); + for (int j = 0; j < row; j += nb) { + jb = std::min(nb, row - j); + CHECK_RETURN("mluOpCholesky", + ssyrk(batch_size, stride, false, is_row_major, jb, j, + OFFSET_ROW(d_output, j, 0), lda, + OFFSET_ROW(d_output, j, j), lda, handle, workspace)); + cnrtQueueSync(queue); + CHECK_RETURN("mluOpCholesky", + mlu_spotrf_rectile(batch_size, stride, is_row_major, false, + jb, recnb, OFFSET_ROW(d_output, j, j), + lda, j, handle, workspace)); + if (j + jb < row) { + CHECK_RETURN( + "mluOpCholesky", + sgemm(batch_size, !is_row_major, is_row_major, row - j - jb, jb, j, + -1.0f, 1.0f, OFFSET_ROW(d_output, j + jb, 0), lda, stride, + OFFSET_ROW(d_output, j, 0), lda, stride, + OFFSET_ROW(d_output, j + jb, j), lda, stride, handle, + workspace)); cnrtQueueSync(queue); - - for(int j = 0; j < row; j+=nb) - { - jb = std::min(nb, row-j); - CHECK_RETURN("mluOpCholesky", - cherk(batch_size,stride,jb,j,r_start+j*lda,i_start+j*lda,lda,r_start+j*lda+j,i_start+j*lda+j,lda,handle,workspace)); - cnrtQueueSync(queue); - CHECK_RETURN("mluOpCholesky", - mlu_cpotrf_rectile(batch_size,stride,jb,recnb,r_start+j*lda+j,i_start+j*lda+j,lda, handle,workspace)); - cnrtQueueSync(queue); - if(j+jb < row) - { - CHECK_RETURN("mluOpCholesky", - cgemm(batch_size, false,true,row-j-jb,jb,j,-1.0f,1.0f, - OFFSET_ROW(r_start,j+jb,0),OFFSET_ROW(i_start,j+jb,0), lda,stride, - OFFSET_ROW(r_start,j,0),OFFSET_ROW(i_start,j,0), lda,stride, - OFFSET_ROW(r_start,j+jb,j),OFFSET_ROW(i_start,j+jb,j), lda, stride, handle, workspace)); - - cnrtQueueSync(queue); - } - if(j+jb < row) - { - CHECK_RETURN("mluOpCholesky", - ctrsm(batch_size, stride,jb,row-j-jb,OFFSET_ROW(r_start,j,j),OFFSET_ROW(i_start,j,j),lda, - OFFSET_ROW(r_start,j+jb,j),OFFSET_ROW(i_start,j+jb,j),lda, handle,workspace)); - cnrtQueueSync(queue); - } - } - - - CHECK_RETURN("mluOpCholesky", - transpose(batch_size,2,size_a*size_a,d_output,workspace,handle,MLUOP_DTYPE_FLOAT,workspace)); + } 
+ if (j + jb < row) { + CHECK_RETURN( + "mluOpCholesky", + strsm(batch_size, stride, false, is_row_major, jb, row - j - jb, + OFFSET_ROW(d_output, j, j), lda, + OFFSET_ROW(d_output, j + jb, j), lda, handle, workspace)); cnrtQueueSync(queue); - - - - if(upper) - { - cnrtQueueSync(queue); - CHECK_RETURN("mluOpCholesky", - transpose(batch_size, size_a,size_a,workspace,d_output,handle,dtype,workspace)); - cnrtQueueSync(queue); - CHECK_RETURN("mluOpCholesky", - conj_complex(batch_size, size_a,size_a,d_output,d_output,handle)); - cnrtQueueSync(queue); - } - else - { - if(batch_size > 16) - { - CNRT_CHECK(cnrtMemcpy(d_output, workspace, type_size*size_a*lda*16, CNRT_MEM_TRANS_DIR_DEV2DEV)); - CNRT_CHECK(cnrtMemcpy(d_output+type_size/4*size_a*lda*16, workspace+type_size/4*size_a*lda*16, type_size*size_a*lda*((unsigned long)batch_size-16), CNRT_MEM_TRANS_DIR_DEV2DEV)); - } - else - { - CNRT_CHECK(cnrtMemcpy(d_output, workspace, type_size*size_a*lda*((unsigned long)batch_size), CNRT_MEM_TRANS_DIR_DEV2DEV)); - } - } - - + } } - - + if (upper) { + cnrtQueueSync(queue); + CHECK_RETURN("mluOpCholesky", + transpose(batch_size, size_a, size_a, d_output, workspace, + handle, dtype, workspace)); + cnrtQueueSync(queue); + CNRT_CHECK( + cnrtMemcpy(d_output, workspace, + type_size * size_a * lda * ((unsigned long)batch_size), + CNRT_MEM_TRANS_DIR_DEV2DEV)); + } + } else { + recnb = CREC_NB; + int nb = CNB; + int row = lda; + float* r_start = d_output; + float* i_start = d_output + size_a * lda; + stride *= 2; + + set_half_zero(batch_size, stride, r_start, lda, lda, handle); + set_half_zero(batch_size, stride, i_start, lda, lda, handle); cnrtQueueSync(queue); - return MLUOP_STATUS_SUCCESS; -} + for (int j = 0; j < row; j += nb) { + jb = std::min(nb, row - j); + CHECK_RETURN("mluOpCholesky", + cherk(batch_size, stride, jb, j, r_start + j * lda, + i_start + j * lda, lda, r_start + j * lda + j, + i_start + j * lda + j, lda, handle, workspace)); + cnrtQueueSync(queue); + CHECK_RETURN("mluOpCholesky", + mlu_cpotrf_rectile( + batch_size, stride, jb, recnb, r_start + j * lda + j, + i_start + j * lda + j, lda, handle, workspace)); + cnrtQueueSync(queue); + if (j + jb < row) { + CHECK_RETURN("mluOpCholesky", + cgemm(batch_size, false, true, row - j - jb, jb, j, -1.0f, + 1.0f, OFFSET_ROW(r_start, j + jb, 0), + OFFSET_ROW(i_start, j + jb, 0), lda, stride, + OFFSET_ROW(r_start, j, 0), OFFSET_ROW(i_start, j, 0), + lda, stride, OFFSET_ROW(r_start, j + jb, j), + OFFSET_ROW(i_start, j + jb, j), lda, stride, handle, + workspace)); -// computes the Cholesky decomposition. -// This function is designed to handle both single and batch processing of matrices in either 2D or 3D formats. -// The function ensures that the input matrices are either float or complex float types and performs the decomposition -// either on the upper or lower triangular part of the matrix, based on the 'upper' boolean flag. 
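// A small host-side check of the contract documented above, for a single
// row-major n x n float matrix: with upper == false the routine writes a lower
// triangular factor L such that the input A is reconstructed by L * L^T (the
// upper == true path returns U with A == U^T * U). The helper name is
// illustrative only; it is the kind of check the gtest performs via trans_mul.
static float max_reconstruction_error(const float* a, const float* l, int n,
                                      int lda) {
  float max_err = 0.0f;
  for (int i = 0; i < n; ++i) {
    for (int j = 0; j <= i; ++j) {
      float s = 0.0f;
      // (L * L^T)[i][j] only needs k <= j because L is lower triangular
      for (int k = 0; k <= j; ++k) s += l[i * lda + k] * l[j * lda + k];
      const float err = fabsf(s - a[i * lda + j]);
      if (err > max_err) max_err = err;
    }
  }
  return max_err;
}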
-mluOpStatus_t MLUOP_WIN_API -mluOpStatus_t MLUOP_WIN_API -mluOpCholesky(mluOpHandle_t handle,const mluOpTensorDescriptor_t input_desc,float* d_input, const mluOpTensorDescriptor_t output_desc, float* d_output,bool upper, float* workspace) -{ - PARAM_CHECK("mluOpCholesky", handle != NULL); - PARAM_CHECK("mluOpCholesky", input_desc != NULL); - PARAM_CHECK("mluOpCholesky", d_input != NULL); - PARAM_CHECK("mluOpCholesky", output_desc != NULL); - PARAM_CHECK("mluOpCholesky", d_output != NULL); - - PARAM_CHECK("mluOpCholesky", input_desc->dim == 2||input_desc->dim == 3); - PARAM_CHECK("mluOpCholesky", output_desc->dim == input_desc->dim); - PARAM_CHECK("mluOpCholesky", input_desc->dims[0] > 0); - PARAM_CHECK("mluOpCholesky", input_desc->dims[1] > 0); - PARAM_CHECK("mluOpCholesky", output_desc->dims[0] > 0); - PARAM_CHECK("mluOpCholesky", output_desc->dims[1] > 0); - - cnrtQueue_t queue; - mluOpGetQueue(handle,&queue); - - if(input_desc->dim == 3) - { - PARAM_CHECK("mluOpCholesky", input_desc->dims[2] > 0); - PARAM_CHECK("mluOpCholesky", output_desc->dims[2] > 0); + cnrtQueueSync(queue); + } + if (j + jb < row) { + CHECK_RETURN( + "mluOpCholesky", + ctrsm(batch_size, stride, jb, row - j - jb, + OFFSET_ROW(r_start, j, j), OFFSET_ROW(i_start, j, j), lda, + OFFSET_ROW(r_start, j + jb, j), + OFFSET_ROW(i_start, j + jb, j), lda, handle, workspace)); + cnrtQueueSync(queue); + } } - mluOpDataType_t dtype = input_desc->dtype; - PARAM_CHECK("mluOpCholesky", dtype == output_desc->dtype); - PARAM_CHECK("mluOpCholesky", dtype == MLUOP_DTYPE_FLOAT || dtype == MLUOP_DTYPE_COMPLEX_FLOAT); - - int dim = input_desc->dim; - int size_a = 0, lda = 0, size_c = 0, ldc = 0; + CHECK_RETURN("mluOpCholesky", + transpose(batch_size, 2, size_a * size_a, d_output, workspace, + handle, MLUOP_DTYPE_FLOAT, workspace)); + cnrtQueueSync(queue); - int batch_size = 1; - if(dim == 2) - { - size_a = input_desc->dims[0]; - lda = input_desc->dims[1]; - size_c = output_desc->dims[0]; - ldc = output_desc->dims[1]; - } - else if(dim == 3) - { - batch_size = input_desc->dims[0]; - size_a = input_desc->dims[1]; - lda = input_desc->dims[2]; - size_c = output_desc->dims[1]; - ldc = output_desc->dims[2]; + if (upper) { + cnrtQueueSync(queue); + CHECK_RETURN("mluOpCholesky", + transpose(batch_size, size_a, size_a, workspace, d_output, + handle, dtype, workspace)); + cnrtQueueSync(queue); + CHECK_RETURN("mluOpCholesky", conj_complex(batch_size, size_a, size_a, + d_output, d_output, handle)); + cnrtQueueSync(queue); + } else { + if (batch_size > 16) { + CNRT_CHECK(cnrtMemcpy(d_output, workspace, + type_size * size_a * lda * 16, + CNRT_MEM_TRANS_DIR_DEV2DEV)); + CNRT_CHECK(cnrtMemcpy( + d_output + type_size / 4 * size_a * lda * 16, + workspace + type_size / 4 * size_a * lda * 16, + type_size * size_a * lda * ((unsigned long)batch_size - 16), + CNRT_MEM_TRANS_DIR_DEV2DEV)); + } else { + CNRT_CHECK( + cnrtMemcpy(d_output, workspace, + type_size * size_a * lda * ((unsigned long)batch_size), + CNRT_MEM_TRANS_DIR_DEV2DEV)); + } } + } + cnrtQueueSync(queue); + return MLUOP_STATUS_SUCCESS; +} - unsigned long type_size; - MLUOP_CHECK(mluOpGetSizeOfDataType(dtype, &type_size)); - if(type_size == 8 && batch_size > 16 && size_a > 2000) - { - int stride = 2*size_a*lda; - calculate_body(handle, 16, input_desc,d_input, output_desc, d_output, upper, workspace); - cnrtQueueSync(queue); - calculate_body(handle, ((unsigned long)batch_size)-16, input_desc,d_input+16*stride, output_desc, d_output+16*stride, upper, workspace); - } - else - { - calculate_body(handle, 
batch_size, input_desc,d_input, output_desc, d_output, upper, workspace); - } - - return MLUOP_STATUS_SUCCESS; +// computes the Cholesky decomposition. +// This function is designed to handle both single and batch processing of +// matrices in either 2D or 3D formats. The function ensures that the input +// matrices are either float or complex float types and performs the +// decomposition either on the upper or lower triangular part of the matrix, +// based on the 'upper' boolean flag. +mluOpStatus_t MLUOP_WIN_API mluOpStatus_t MLUOP_WIN_API +mluOpCholesky(mluOpHandle_t handle, const mluOpTensorDescriptor_t input_desc, + float* d_input, const mluOpTensorDescriptor_t output_desc, + float* d_output, bool upper, float* workspace) { + PARAM_CHECK("mluOpCholesky", handle != NULL); + PARAM_CHECK("mluOpCholesky", input_desc != NULL); + PARAM_CHECK("mluOpCholesky", d_input != NULL); + PARAM_CHECK("mluOpCholesky", output_desc != NULL); + PARAM_CHECK("mluOpCholesky", d_output != NULL); + + PARAM_CHECK("mluOpCholesky", input_desc->dim == 2 || input_desc->dim == 3); + PARAM_CHECK("mluOpCholesky", output_desc->dim == input_desc->dim); + PARAM_CHECK("mluOpCholesky", input_desc->dims[0] > 0); + PARAM_CHECK("mluOpCholesky", input_desc->dims[1] > 0); + PARAM_CHECK("mluOpCholesky", output_desc->dims[0] > 0); + PARAM_CHECK("mluOpCholesky", output_desc->dims[1] > 0); + + cnrtQueue_t queue; + mluOpGetQueue(handle, &queue); + + if (input_desc->dim == 3) { + PARAM_CHECK("mluOpCholesky", input_desc->dims[2] > 0); + PARAM_CHECK("mluOpCholesky", output_desc->dims[2] > 0); + } + + mluOpDataType_t dtype = input_desc->dtype; + PARAM_CHECK("mluOpCholesky", dtype == output_desc->dtype); + PARAM_CHECK("mluOpCholesky", + dtype == MLUOP_DTYPE_FLOAT || dtype == MLUOP_DTYPE_COMPLEX_FLOAT); + + int dim = input_desc->dim; + int size_a = 0, lda = 0, size_c = 0, ldc = 0; + + int batch_size = 1; + if (dim == 2) { + size_a = input_desc->dims[0]; + lda = input_desc->dims[1]; + size_c = output_desc->dims[0]; + ldc = output_desc->dims[1]; + } else if (dim == 3) { + batch_size = input_desc->dims[0]; + size_a = input_desc->dims[1]; + lda = input_desc->dims[2]; + size_c = output_desc->dims[1]; + ldc = output_desc->dims[2]; + } + + unsigned long type_size; + MLUOP_CHECK(mluOpGetSizeOfDataType(dtype, &type_size)); + if (type_size == 8 && batch_size > 16 && size_a > 2000) { + int stride = 2 * size_a * lda; + calculate_body(handle, 16, input_desc, d_input, output_desc, d_output, + upper, workspace); + cnrtQueueSync(queue); + calculate_body(handle, ((unsigned long)batch_size) - 16, input_desc, + d_input + 16 * stride, output_desc, d_output + 16 * stride, + upper, workspace); + } else { + calculate_body(handle, batch_size, input_desc, d_input, output_desc, + d_output, upper, workspace); + } + + return MLUOP_STATUS_SUCCESS; } \ No newline at end of file diff --git a/kernels/cholesky/cholesky.h b/kernels/cholesky/cholesky.h index cf5f0a7b5..16f3fdf71 100644 --- a/kernels/cholesky/cholesky.h +++ b/kernels/cholesky/cholesky.h @@ -22,49 +22,73 @@ #define CNB (16) #define REC_NB (16) -#define POTF_NB ((REC_NB)/4) +#define POTF_NB ((REC_NB) / 4) #define CREC_NB (16) -#define CPOTF_NB ((CREC_NB)/4) +#define CPOTF_NB ((CREC_NB) / 4) // #define CPOTF_NB ((CREC_NB)) #define __CNRT_FUNC_TYPE__ CNRT_FUNC_TYPE_UNION1 #define TASK_NUM (4) #define NB (32) #define CLUSTER_NUM 1 -#define M (TASK_NUM * POTF_NB) //POTF边长 +#define M (TASK_NUM * POTF_NB) // POTF边长 #define ZERO 0.0 -#define SHARED_MEM_SIZE (((M*POTF_NB/TASK_NUM * 4)+(POTF_NB * POTF_NB))) +#define 
SHARED_MEM_SIZE (((M * POTF_NB / TASK_NUM * 4) + (POTF_NB * POTF_NB))) #define OFFSET_ROW(A, i, j) A + ((i) * (lda) + (j)) #define OFFSET_B_ROW(B, i, j) B + ((i) * (ldb) + (j)) +mluOpStatus_t mlu_spotrf_rectile(int batch, int stride, bool trans, bool uplo, + int n, int recnb, float* dA, int ldda, + int gbstep, mluOpHandle_t handle); +// void mluOpCholesky(bool trans, bool uplo, int n, float* dA, float* dC, int +// ldda); -mluOpStatus_t mlu_spotrf_rectile(int batch, int stride, bool trans, bool uplo, int n, int recnb, float* dA, int ldda, int gbstep, mluOpHandle_t handle); -// void mluOpCholesky(bool trans, bool uplo, int n, float* dA, float* dC, int ldda); +mluOpStatus_t ssyrk(int batch, int stride, bool upper, bool trans, int n, int k, + float* d_a, int ldda, float* d_c, int lddc, + mluOpHandle_t handle); -mluOpStatus_t ssyrk(int batch, int stride, bool upper, bool trans,int n, int k, float* d_a, int ldda, float* d_c, int lddc, mluOpHandle_t handle); +mluOpStatus_t sgemm(int batch, bool trans_a, bool trans_b, int m, int n, int k, + float alpha, float beta, float* d_a, int lda, int stride_a, + float* d_b, int ldb, int stride_b, float* d_c, int ldc, + int stride_c, mluOpHandle_t handle); -mluOpStatus_t sgemm(int batch, bool trans_a, bool trans_b, int m, int n, int k, float alpha, float beta, float* d_a,int lda, int stride_a, float* d_b, int ldb, int stride_b, float* d_c, int ldc, int stride_c, mluOpHandle_t handle); - -//side:true->right +// side:true->right // false->left -mluOpStatus_t strsm(int batch, int stride, bool upper, bool trans, int m, int n, float* d_a, int ldda, float* d_b, int lddb, mluOpHandle_t handle); +mluOpStatus_t strsm(int batch, int stride, bool upper, bool trans, int m, int n, + float* d_a, int ldda, float* d_b, int lddb, + mluOpHandle_t handle); -mluOpStatus_t transpose(int batch, int m, int n,float* d_input,float* d_output, mluOpHandle_t handle,mluOpDataType_t type, float* workspace); +mluOpStatus_t transpose(int batch, int m, int n, float* d_input, + float* d_output, mluOpHandle_t handle, + mluOpDataType_t type, float* workspace); -mluOpStatus_t conj_complex(int batch, int m, int n,float* d_input,float* d_output, mluOpHandle_t handle); +mluOpStatus_t conj_complex(int batch, int m, int n, float* d_input, + float* d_output, mluOpHandle_t handle); -mluOpStatus_t mlu_cpotrf_rectile(int batch, int stride, int n, int recnb, float* drA, float* diA, int lda, mluOpHandle_t handle); +mluOpStatus_t mlu_cpotrf_rectile(int batch, int stride, int n, int recnb, + float* drA, float* diA, int lda, + mluOpHandle_t handle); -mluOpStatus_t cgemm(int batch, bool trans_a, bool trans_b, int m, int n, int k, float alpha, float beta, float* d_ra, float* d_ia, int lda, int stride_a, float* d_rb, float* d_ib, int ldb, int stride_b, float* d_rc, float* d_ic, int ldc, int stride_c, mluOpHandle_t handle); +mluOpStatus_t cgemm(int batch, bool trans_a, bool trans_b, int m, int n, int k, + float alpha, float beta, float* d_ra, float* d_ia, int lda, + int stride_a, float* d_rb, float* d_ib, int ldb, + int stride_b, float* d_rc, float* d_ic, int ldc, + int stride_c, mluOpHandle_t handle); mluOpStatus_t workspace_malloc(size_t size, float** workspace); -// mluOpStatus_t complex_set_half_zero(int batch, int stride, float* d_a, int m, int ld); +// mluOpStatus_t complex_set_half_zero(int batch, int stride, float* d_a, int m, +// int ld); -mluOpStatus_t set_half_zero(int batch,int stride,float* d_a, int lda, int m, mluOpHandle_t handle); +mluOpStatus_t set_half_zero(int batch, int stride, float* d_a, int 
lda, int m, + mluOpHandle_t handle); -mluOpStatus_t ctrsm(int batch, int stride, int m, int n, float* rd_a, float* id_a, int lda, float* rd_b, float* id_b, int ldb, mluOpHandle_t handle); +mluOpStatus_t ctrsm(int batch, int stride, int m, int n, float* rd_a, + float* id_a, int lda, float* rd_b, float* id_b, int ldb, + mluOpHandle_t handle); -mluOpStatus_t cherk(int batch, int stride, int n,int k, float* rd_a, float* id_a, int lda, float* rd_c, float* id_c, int ldc, mluOpHandle_t handle); +mluOpStatus_t cherk(int batch, int stride, int n, int k, float* rd_a, + float* id_a, int lda, float* rd_c, float* id_c, int ldc, + mluOpHandle_t handle); #endif \ No newline at end of file diff --git a/kernels/cholesky/cholesky_union1.mlu b/kernels/cholesky/cholesky_union1.mlu index becaf6b9a..c6a7325ef 100644 --- a/kernels/cholesky/cholesky_union1.mlu +++ b/kernels/cholesky/cholesky_union1.mlu @@ -1,1028 +1,852 @@ #include "cholesky.h" -#include +#include unsigned int next_power_of_2(unsigned int n) { - if (n == 0) { - return 1; - } + if (n == 0) { + return 1; + } - n--; - n |= n >> 1; - n |= n >> 2; - n |= n >> 4; - n |= n >> 8; - n |= n >> 16; + n--; + n |= n >> 1; + n |= n >> 2; + n |= n >> 4; + n |= n >> 8; + n |= n >> 16; - return n + 1; + return n + 1; } - __nram__ uint8_t nram_buffer[MAX_NRAM_SIZE]; __mlu_shared__ uint8_t sram_buffer[MAX_SRAM_SIZE]; -__mlu_func__ -float recur_add(float* input, int length) -{ - if(length == 1) - { - return input[0]; - } - else - { - int half_length; - half_length = length / 2; - float sum1 = recur_add(input, half_length); - float sum2 = recur_add(input + half_length, length - half_length); - input[0] = sum1+sum2; - return sum1 + sum2; - } - +__mlu_func__ float recur_add(float* input, int length) { + if (length == 1) { + return input[0]; + } else { + int half_length; + half_length = length / 2; + float sum1 = recur_add(input, half_length); + float sum2 = recur_add(input + half_length, length - half_length); + input[0] = sum1 + sum2; + return sum1 + sum2; + } } -__mlu_func__ -float kahansum(float* input, int length) -{ - float sum = 0.0; - float c = 0.0; - for(int i = 0; i < length; i++) - { - float y = input[i] - c; - float t = sum + y; - c = (t - sum) - y; - sum = t; - } - input[0] = sum; - return sum; +__mlu_func__ float kahansum(float* input, int length) { + float sum = 0.0; + float c = 0.0; + for (int i = 0; i < length; i++) { + float y = input[i] - c; + float t = sum + y; + c = (t - sum) - y; + sum = t; + } + input[0] = sum; + return sum; } - -__mlu_func__ -void sgemm_fixwidth_device(int m, int k, - float* A0, const int lda, - float *sC, float *sB) -{ - int id = taskId % 4; - - int span = POTF_NB; - - - - - float* rC = (float*)nram_buffer; - float* rA= rC + M * POTF_NB/TASK_NUM; - float* rp = rA + M * POTF_NB/TASK_NUM; - - float* rB= rp + M * POTF_NB/TASK_NUM; - - float* temp_result = rB + POTF_NB * POTF_NB; - temp_result[0] = 0.0; - - - - if(id*span0) - { - float* temp_result = nram_src+span*span; - float* temp_result2 = temp_result + span * span; - float* temp_a = nram_src; - float* temp_b = diag+iter*span; - float* local_result = temp_result; - float* local_diag = temp_result2; - - for(int i = 0; i < span; i++) - { - __bang_mul(local_result, temp_a, temp_b, iter); - __bang_mul(local_diag, diag+i*span, temp_b,iter); - local_result = local_result + span; - temp_a = temp_a + span; - local_diag = local_diag + span; - } - - if(iter>1) - { - local_result = temp_result; - local_diag = temp_result2; - for(int i = 0; i < span; i++) - { - kahansum(local_result,iter); 
- kahansum(local_diag,iter); - local_result = local_result + span; - local_diag = local_diag + span; - } - } - for(int i = 0; i < span; i++) - { - nram_src[i*span+iter] -= temp_result[i*span]; - diag[i*span+iter] -= temp_result2[i*span]; - } - - } + if (id == 0) { + __memcpy(sB, rp, POTF_NB * POTF_NB * sizeof(float), NRAM2SRAM); + } - if(factor<0) - { - if(id == 0) - { - printf("factor:%.3f\n",factor); - printf("iter:%d\n",iter); - } + __sync_cluster(); + for (int iter = 0; iter < k; iter += POTF_NB) { + __bang_move(rA, rp, POTF_NB * span * sizeof(float)); + __memcpy(rB, sB, POTF_NB * POTF_NB * sizeof(float), SRAM2NRAM); + + __sync_cluster(); + if (id * span < m) + __memcpy_async(rp, OFFSET_ROW(A0, span * id, iter + POTF_NB), + POTF_NB * sizeof(float), GDRAM2NRAM, + POTF_NB * sizeof(float), lda * sizeof(float), span - 1); + + for (int i = 0; i < span; i++) { + for (int j = 0; j < POTF_NB; j++) { + for (int h = 0; h < POTF_NB; h++) { + temp_result[h] = rA[i * POTF_NB + h] * rB[j * POTF_NB + h]; } - factor = diag[iter*POTF_NB+iter]; - factor = std::sqrt(factor); - factor = (1.0/factor); - for(int i = 0; i < span; i++) - { - - nram_src[i*POTF_NB+iter] *= factor; - diag[i*POTF_NB+iter] *= factor; - - - } - __sync(); - - + rC[i * POTF_NB + j] += kahansum(temp_result, POTF_NB); + } } + __sync_cluster(); - - - if(id*span 0; - int span = (remain > POTF_NB||remain <= 0) ? POTF_NB : remain; + if (id * span < m) + __memcpy(sC + coreId * span * POTF_NB, rp, POTF_NB * span * sizeof(float), + NRAM2SRAM); + __sync_cluster(); +} - float *rA = (float*)nram_buffer + id * NB * NB * 4; +static __mlu_func__ void spotf2_sminout_fixsize_device(int m, float* A, + int lda) { + int id = coreId % 4; + int span = POTF_NB; + float* diag = (float*)nram_buffer; + float* nram_src = diag + span * span; + + __memcpy(diag, A, span * span * sizeof(float), SRAM2NRAM); + __memcpy(nram_src, A + id * span * POTF_NB, span * span * sizeof(float), + SRAM2NRAM); + + float factor; + for (int iter = 0; iter < POTF_NB; iter++) { + if (iter > 0) { + float* temp_result = nram_src + span * span; + float* temp_result2 = temp_result + span * span; + float* temp_a = nram_src; + float* temp_b = diag + iter * span; + float* local_result = temp_result; + float* local_diag = temp_result2; + + for (int i = 0; i < span; i++) { + __bang_mul(local_result, temp_a, temp_b, iter); + __bang_mul(local_diag, diag + i * span, temp_b, iter); + local_result = local_result + span; + temp_a = temp_a + span; + local_diag = local_diag + span; + } + + if (iter > 1) { + local_result = temp_result; + local_diag = temp_result2; + for (int i = 0; i < span; i++) { + kahansum(local_result, iter); + kahansum(local_diag, iter); + local_result = local_result + span; + local_diag = local_diag + span; + } + } + for (int i = 0; i < span; i++) { + nram_src[i * span + iter] -= temp_result[i * span]; + diag[i * span + iter] -= temp_result2[i * span]; + } + } - float *rB = rA + NB * NB; + if (factor < 0) { + if (id == 0) { + printf("factor:%.3f\n", factor); + printf("iter:%d\n", iter); + } + } + factor = diag[iter * POTF_NB + iter]; + factor = std::sqrt(factor); + factor = (1.0 / factor); + for (int i = 0; i < span; i++) { + nram_src[i * POTF_NB + iter] *= factor; + diag[i * POTF_NB + iter] *= factor; + } + __sync(); + } + __sync_cluster(); - float *rC = rB + NB * NB; + if (id * span < m) + __memcpy(A + id * span * POTF_NB, nram_src, span * span * sizeof(float), + NRAM2SRAM); + __sync_cluster(); +} - float* rp = rC + NB * NB; +__mlu_func__ void sgemm_anywidth_device(int m, int 
k, float* A0, const int lda, + float* sC, float* sB) { + int id = taskId % 4; - int span_b = POTF_NB > m ? m : POTF_NB; + int remain = m - id * POTF_NB; + bool if_execute = remain > 0; + int span = (remain > POTF_NB || remain <= 0) ? POTF_NB : remain; + float* rA = (float*)nram_buffer + id * NB * NB * 4; + float* rB = rA + NB * NB; - __memset_nram(rC,span_b*span,(float)ZERO); + float* rC = rB + NB * NB; - if(if_execute) - { - if(k>0) - { - __memcpy(rA,A0+id*POTF_NB*lda,k*sizeof(float),SRAM2NRAM,NB*sizeof(float),lda*sizeof(float),span-1); - } - __memcpy(rp,sC+id*POTF_NB*lda,span_b*sizeof(float),SRAM2NRAM,span_b*sizeof(float),lda*sizeof(float),span-1); + float* rp = rC + NB * NB; - } - - if(k>0) - { - __memcpy(rB,A0,k*sizeof(float),SRAM2NRAM,NB*sizeof(float),lda*sizeof(float),span_b-1); - } - + int span_b = POTF_NB > m ? m : POTF_NB; - __sync_cluster(); + __memset_nram(rC, span_b * span, (float)ZERO); - for(int i = 0; i < span; i++) - { - for(int j = 0; j < span_b; j++) - { - for(int h = 0; h < k; h++) - { - rC[i*span_b+j] += rA[i*NB+h] * rB[j*NB+h]; - } - } + if (if_execute) { + if (k > 0) { + __memcpy(rA, A0 + id * POTF_NB * lda, k * sizeof(float), SRAM2NRAM, + NB * sizeof(float), lda * sizeof(float), span - 1); } + __memcpy(rp, sC + id * POTF_NB * lda, span_b * sizeof(float), SRAM2NRAM, + span_b * sizeof(float), lda * sizeof(float), span - 1); + } - __bang_sub(rp,rp,rC,span_b * span); + if (k > 0) { + __memcpy(rB, A0, k * sizeof(float), SRAM2NRAM, NB * sizeof(float), + lda * sizeof(float), span_b - 1); + } - __sync_cluster(); + __sync_cluster(); - if(id==0) - { - for(int i = 0; i < span; i++) - { - __memcpy(sC+(i*lda),rp+i*span_b,(i+1)*sizeof(float),NRAM2SRAM); - } - + for (int i = 0; i < span; i++) { + for (int j = 0; j < span_b; j++) { + for (int h = 0; h < k; h++) { + rC[i * span_b + j] += rA[i * NB + h] * rB[j * NB + h]; + } } - else if(if_execute) - { - __memcpy(sC+(id*POTF_NB*lda),rp,span_b*sizeof(float),NRAM2SRAM,lda*sizeof(float),span_b*sizeof(float),span-1); - } - - -} + } + __bang_sub(rp, rp, rC, span_b * span); -static __mlu_func__ void spotf2_sminout_anysize_device(int m, float *A, int lda) -{ - float factor; - int id = coreId % 4; - int finish = id * POTF_NB; - int remain = m - finish; - bool if_execute = remain > 0; - int span = remain > POTF_NB ? POTF_NB : remain; - int iter_num = m > POTF_NB ? 
POTF_NB : m; - for(int iter = 0; iter < iter_num; iter++) - { - factor=sqrt(A[iter*lda+iter]); - factor = 1.0/factor; - __sync_cluster(); - for(int i = 0; i < span; i++) - { - if(if_execute) - A[i*lda+iter+id*POTF_NB*lda] *= factor; - - } - __sync_cluster(); - - if(if_execute) - { - for(int i = iter + 1; i < iter_num; i++) - { - for(int j = finish; j < finish + span; j++) - { - if(j < i) - continue; - A[j * lda + i ] -= A[i*lda+iter] * A[j * lda + iter]; - } - } - } - - __sync_cluster(); + __sync_cluster(); + if (id == 0) { + for (int i = 0; i < span; i++) { + __memcpy(sC + (i * lda), rp + i * span_b, (i + 1) * sizeof(float), + NRAM2SRAM); } - -} - -__mlu_func__ void spotf2_smlpout_fixwidth_device(const int m, float *A0, float *A, int lda, const int localstep, const int gbstep) -{ - int id = taskId % 4; - float* shared_data = (float*)sram_buffer; - float* sdata_A = shared_data; - float* sdata_B = shared_data + m *POTF_NB/TASK_NUM * 4; - - - sgemm_fixwidth_device(m, localstep, A0, lda, sdata_A, sdata_B); + } else if (if_execute) { + __memcpy(sC + (id * POTF_NB * lda), rp, span_b * sizeof(float), NRAM2SRAM, + lda * sizeof(float), span_b * sizeof(float), span - 1); + } +} +static __mlu_func__ void spotf2_sminout_anysize_device(int m, float* A, + int lda) { + float factor; + int id = coreId % 4; + int finish = id * POTF_NB; + int remain = m - finish; + bool if_execute = remain > 0; + int span = remain > POTF_NB ? POTF_NB : remain; + int iter_num = m > POTF_NB ? POTF_NB : m; + for (int iter = 0; iter < iter_num; iter++) { + factor = sqrt(A[iter * lda + iter]); + factor = 1.0 / factor; __sync_cluster(); - - - - spotf2_sminout_fixsize_device(m, sdata_A, POTF_NB); - + for (int i = 0; i < span; i++) { + if (if_execute) A[i * lda + iter + id * POTF_NB * lda] *= factor; + } __sync_cluster(); - int span = POTF_NB; - - - if(id==0) - { - for(int i = 0; i < span; i++) - { - __memcpy(A+(i*lda),sdata_A+i*POTF_NB,(i+1)*sizeof(float),SRAM2LDRAM); + if (if_execute) { + for (int i = iter + 1; i < iter_num; i++) { + for (int j = finish; j < finish + span; j++) { + if (j < i) continue; + A[j * lda + i] -= A[i * lda + iter] * A[j * lda + iter]; } - - } - else if(id*span < m) - { - __memcpy(A+(id*POTF_NB*lda),sdata_A+coreId*POTF_NB*POTF_NB,POTF_NB*sizeof(float),SRAM2LDRAM,lda*sizeof(float),POTF_NB*sizeof(float),span-1); + } } __sync_cluster(); + } +} +__mlu_func__ void spotf2_smlpout_fixwidth_device(const int m, float* A0, + float* A, int lda, + const int localstep, + const int gbstep) { + int id = taskId % 4; + float* shared_data = (float*)sram_buffer; + float* sdata_A = shared_data; + float* sdata_B = shared_data + m * POTF_NB / TASK_NUM * 4; -} + sgemm_fixwidth_device(m, localstep, A0, lda, sdata_A, sdata_B); -__mlu_func__ void spotf2_smlpout_anywidth_device(const int m, float *A0, float *A, int lda, const int localstep, const int gbstep) -{ + __sync_cluster(); - sgemm_anywidth_device(m, localstep, A0, lda, A, nullptr); + spotf2_sminout_fixsize_device(m, sdata_A, POTF_NB); - - spotf2_sminout_anysize_device(m, A, lda); + __sync_cluster(); - __sync_cluster(); + int span = POTF_NB; + if (id == 0) { + for (int i = 0; i < span; i++) { + __memcpy(A + (i * lda), sdata_A + i * POTF_NB, (i + 1) * sizeof(float), + SRAM2LDRAM); + } + } else if (id * span < m) { + __memcpy(A + (id * POTF_NB * lda), sdata_A + coreId * POTF_NB * POTF_NB, + POTF_NB * sizeof(float), SRAM2LDRAM, lda * sizeof(float), + POTF_NB * sizeof(float), span - 1); + } + __sync_cluster(); } +__mlu_func__ void spotf2_smlpout_anywidth_device(const int m, 
float* A0, + float* A, int lda, + const int localstep, + const int gbstep) { + sgemm_anywidth_device(m, localstep, A0, lda, A, nullptr); -__mlu_global__ void spotf2_smlpin_anywidth_kernel(int batch, int stride, bool trans, int m, float *dA, int lda, int localstep, int gbstep) -{ - int id = taskId; - float* orignA = dA; - int batch_id = id / 4; - if(batch_id >= batch) - return; - dA = orignA + batch_id * stride; + spotf2_sminout_anysize_device(m, A, lda); - float* shared_data = (float*)sram_buffer; + __sync_cluster(); +} - if(m%4==0) - { - for(int i = 0; i < m; i += POTF_NB) - { - spotf2_smlpout_fixwidth_device(m-i,OFFSET_ROW(dA, localstep + i,0), OFFSET_ROW(dA, localstep + i, localstep + i), lda, localstep+i, gbstep); - } +__mlu_global__ void spotf2_smlpin_anywidth_kernel(int batch, int stride, + bool trans, int m, float* dA, + int lda, int localstep, + int gbstep) { + int id = taskId; + float* orignA = dA; + int batch_id = id / 4; + if (batch_id >= batch) return; + dA = orignA + batch_id * stride; + + float* shared_data = (float*)sram_buffer; + + if (m % 4 == 0) { + for (int i = 0; i < m; i += POTF_NB) { + spotf2_smlpout_fixwidth_device( + m - i, OFFSET_ROW(dA, localstep + i, 0), + OFFSET_ROW(dA, localstep + i, localstep + i), lda, localstep + i, + gbstep); } - else - { - - if(id == 0) - { - __memcpy(shared_data,dA,m*sizeof(float),GDRAM2SRAM,NB*sizeof(float),lda*sizeof(float),m-1); - } - __sync_cluster(); - - for(int i = 0; i < m; i += POTF_NB) - { - spotf2_smlpout_anywidth_device(m-i,shared_data+i*NB, shared_data+i*NB+i, NB, localstep+i, gbstep); - } - - __sync_cluster(); - - if(id == 0) - { - __memcpy(dA,shared_data,m*sizeof(float),SRAM2GDRAM,lda*sizeof(float),NB*sizeof(float),m-1); - } - __sync_cluster(); + } else { + if (id == 0) { + __memcpy(shared_data, dA, m * sizeof(float), GDRAM2SRAM, + NB * sizeof(float), lda * sizeof(float), m - 1); } + __sync_cluster(); - - - + for (int i = 0; i < m; i += POTF_NB) { + spotf2_smlpout_anywidth_device(m - i, shared_data + i * NB, + shared_data + i * NB + i, NB, + localstep + i, gbstep); + } -} + __sync_cluster(); -__mlu_func__ -void small_sgemm_batch(int m, int k, - float* A0, const int lda,int width, - float* dst, float* nram_remain) -{ - int ldk = k; - int ldm = m; - float* src1 = nram_remain; - float* src2 = src1 + ldk * ldm; - float* dst2 = src2 + width * ldk; - - float* dA = A0 + k; - __memcpy_async(dst, dA, width*sizeof(float),GDRAM2NRAM,width*sizeof(float),lda*sizeof(float),m-1); - - if(k == 0) - { - __sync(); - return; + if (id == 0) { + __memcpy(dA, shared_data, m * sizeof(float), SRAM2GDRAM, + lda * sizeof(float), NB * sizeof(float), m - 1); } - - __memset_nram(src1,ldm*ldk,(float)ZERO); + __sync_cluster(); + } +} - __memcpy_async(src1, A0, k*sizeof(float),GDRAM2NRAM,ldk*sizeof(float),lda*sizeof(float),m-1); +__mlu_func__ void small_sgemm_batch(int m, int k, float* A0, const int lda, + int width, float* dst, float* nram_remain) { + int ldk = k; + int ldm = m; + float* src1 = nram_remain; + float* src2 = src1 + ldk * ldm; + float* dst2 = src2 + width * ldk; - __memset_nram(dst2,ldm*width,(float)ZERO); + float* dA = A0 + k; + __memcpy_async(dst, dA, width * sizeof(float), GDRAM2NRAM, + width * sizeof(float), lda * sizeof(float), m - 1); + if (k == 0) { __sync(); + return; + } - __memcpy(src2, src1, ldk*width*sizeof(float),NRAM2NRAM); + __memset_nram(src1, ldm * ldk, (float)ZERO); - for(int i = 0; i < m; i++) - { - for(int j = 0; j < width; j++) - { - for(int h = 0; h < k; h++) - { - dst2[i*width+j] += src1[i*ldk+h] * src2[j*ldk+h]; - } - 
} - } + __memcpy_async(src1, A0, k * sizeof(float), GDRAM2NRAM, ldk * sizeof(float), + lda * sizeof(float), m - 1); - __bang_sub(dst,dst,dst2,width * m); + __memset_nram(dst2, ldm * width, (float)ZERO); - __sync(); -} + __sync(); -__mlu_func__ void small_sminout_batch(int m, int width, float *dst, float *nram_remain, int lda) -{ - float factor; - float* diag = dst; - - for(int iter = 0; iter < width; iter++) - { - factor=sqrt(diag[iter*width+iter]); - factor = 1.0/factor; - for(int i = 0; i < m; i ++) - { - dst[i*width+iter] *= factor; - } - __sync(); - for(int i = iter + 1; i < width; i++) - { - for(int j = 0; j < m; j++) - { - dst[j * width + i ] -= dst[i*width+iter] * dst[j * width + iter]; - - } - } - __sync(); + __memcpy(src2, src1, ldk * width * sizeof(float), NRAM2NRAM); - - + for (int i = 0; i < m; i++) { + for (int j = 0; j < width; j++) { + for (int h = 0; h < k; h++) { + dst2[i * width + j] += src1[i * ldk + h] * src2[j * ldk + h]; + } + } + } + __bang_sub(dst, dst, dst2, width * m); - - + __sync(); +} + +__mlu_func__ void small_sminout_batch(int m, int width, float* dst, + float* nram_remain, int lda) { + float factor; + float* diag = dst; + for (int iter = 0; iter < width; iter++) { + factor = sqrt(diag[iter * width + iter]); + factor = 1.0 / factor; + for (int i = 0; i < m; i++) { + dst[i * width + iter] *= factor; } __sync(); + for (int i = iter + 1; i < width; i++) { + for (int j = 0; j < m; j++) { + dst[j * width + i] -= dst[i * width + iter] * dst[j * width + iter]; + } + } + __sync(); + } + __sync(); } -__mlu_func__ -void smlpout_batch(const int m, float *A0, float *A, int lda, const int localstep, int width) -{ - float* dst = (float*)nram_buffer; - float* nram_remain = dst + m * m; - - - small_sgemm_batch(m, localstep, A0, lda, width, dst, nram_remain); +__mlu_func__ void smlpout_batch(const int m, float* A0, float* A, int lda, + const int localstep, int width) { + float* dst = (float*)nram_buffer; + float* nram_remain = dst + m * m; - __sync(); - - small_sminout_batch(m, width, dst, nram_remain, width); + small_sgemm_batch(m, localstep, A0, lda, width, dst, nram_remain); - __sync(); + __sync(); - for(int i = 0; i < width; i++) - { - __memcpy(A+(i*lda),dst+i*width,(i+1)*sizeof(float),NRAM2GDRAM); - } + small_sminout_batch(m, width, dst, nram_remain, width); - if(m > width) - { - __memcpy(A+(width*lda),dst+width*width,width*sizeof(float),NRAM2GDRAM,lda*sizeof(float),width*sizeof(float),m-width-1); - } + __sync(); - __sync(); + for (int i = 0; i < width; i++) { + __memcpy(A + (i * lda), dst + i * width, (i + 1) * sizeof(float), + NRAM2GDRAM); + } + if (m > width) { + __memcpy(A + (width * lda), dst + width * width, width * sizeof(float), + NRAM2GDRAM, lda * sizeof(float), width * sizeof(float), + m - width - 1); + } + __sync(); } -__mlu_global__ void spotf2_batch_kernel(int batch, int stride, int m, float *dA, int lda) -{ - int id = taskId; - int batch_id = id; - if(batch_id >= batch) - return; - float* orignA = dA; - dA = orignA + batch_id * stride; - int width = POTF_NB; - int span = width; - - for(int i = 0; i < m; i += width) - { - span = std::min(width, m - i); - smlpout_batch(m-i,dA+i*lda,dA+i*lda+i,lda,i,span); - } - +__mlu_global__ void spotf2_batch_kernel(int batch, int stride, int m, float* dA, + int lda) { + int id = taskId; + int batch_id = id; + if (batch_id >= batch) return; + float* orignA = dA; + dA = orignA + batch_id * stride; + int width = POTF_NB; + int span = width; + + for (int i = 0; i < m; i += width) { + span = std::min(width, m - i); + 
smlpout_batch(m - i, dA + i * lda, dA + i * lda + i, lda, i, span); + } } -mluOpStatus_t mlu_spotf2_lpin(int batch, int stride, bool trans,bool uplo, int n, int ldda, float* dA, int gbstep, cnrtQueue_t queue) -{ - cnrtDim3_t dim; - cnrtFunctionType_t func_type = CNRT_FUNC_TYPE_BLOCK; - dim.y = 1; - dim.z = 1; - if(batch > 1) - { - dim.x = batch; - KERNEL_CHECK( - spotf2_batch_kernel<<>>(batch, stride, n, dA, ldda)); +mluOpStatus_t mlu_spotf2_lpin(int batch, int stride, bool trans, bool uplo, + int n, int ldda, float* dA, int gbstep, + cnrtQueue_t queue) { + cnrtDim3_t dim; + cnrtFunctionType_t func_type = CNRT_FUNC_TYPE_BLOCK; + dim.y = 1; + dim.z = 1; + if (batch > 1) { + dim.x = batch; + KERNEL_CHECK(spotf2_batch_kernel<<>>(batch, stride, + n, dA, ldda)); + } else { + int carry_batch = batch; + if (batch == 1) { + func_type = CNRT_FUNC_TYPE_UNION1; + } else if (batch == 2) { + func_type = CNRT_FUNC_TYPE_UNION2; + } else if (batch <= 4) { + func_type = CNRT_FUNC_TYPE_UNION4; + carry_batch = 4; + } else { + func_type = CNRT_FUNC_TYPE_UNION8; + carry_batch = batch < 8 ? 8 : batch; } - else - { - int carry_batch = batch; - if(batch == 1) - { - func_type = CNRT_FUNC_TYPE_UNION1; - } - else if(batch == 2) - { - func_type = CNRT_FUNC_TYPE_UNION2; - } - else if(batch <= 4) - { - func_type = CNRT_FUNC_TYPE_UNION4; - carry_batch = 4; - } - else - { - func_type = CNRT_FUNC_TYPE_UNION8; - carry_batch = batch < 8 ? 8 : batch; - } - dim.x = carry_batch * 4; - KERNEL_CHECK( - spotf2_smlpin_anywidth_kernel<<>>(batch, stride, trans, n, dA, ldda, 0,gbstep)); - - } - return MLUOP_STATUS_SUCCESS; + dim.x = carry_batch * 4; + KERNEL_CHECK(spotf2_smlpin_anywidth_kernel<<>>( + batch, stride, trans, n, dA, ldda, 0, gbstep)); + } + return MLUOP_STATUS_SUCCESS; } -__mlu_entry__ void mlu_strsm_rectile_batch_kernel( - int batch, int stride, - int m,int n, bool trans, - float *dA, int32_t lda, - float *dB, int32_t ldb) -{ - int id = taskId; - int batch_id = id; - if(batch_id >= batch) - return; - float* orignA = dA; - float* orignB = dB; - dA = orignA + batch_id * stride; - dB = orignB + batch_id * stride; - int span = n; - int start = 0; - - - - float *sA = (float*)nram_buffer; - float *rB = sA + 8*POTF_NB; - float *rC = rB + 4*POTF_NB * 8*POTF_NB; - float *rBp = rC + 4*POTF_NB * 8*POTF_NB; - float *rA = rBp + 4*POTF_NB; - int calc_length = (8 * POTF_NB) > m ? m : (8 * POTF_NB); - __memset_nram(rB,POTF_NB*calc_length,(float)ZERO); - __memset_nram(sA,calc_length*calc_length,(float)ZERO); - - - float temp_b = 0, factor = 0; - - - - __memcpy_async(sA,dA,sizeof(float),GDRAM2NRAM); +__mlu_entry__ void mlu_strsm_rectile_batch_kernel(int batch, int stride, int m, + int n, bool trans, float* dA, + int32_t lda, float* dB, + int32_t ldb) { + int id = taskId; + int batch_id = id; + if (batch_id >= batch) return; + float* orignA = dA; + float* orignB = dB; + dA = orignA + batch_id * stride; + dB = orignB + batch_id * stride; + int span = n; + int start = 0; + + float* sA = (float*)nram_buffer; + float* rB = sA + 8 * POTF_NB; + float* rC = rB + 4 * POTF_NB * 8 * POTF_NB; + float* rBp = rC + 4 * POTF_NB * 8 * POTF_NB; + float* rA = rBp + 4 * POTF_NB; + int calc_length = (8 * POTF_NB) > m ? 
m : (8 * POTF_NB); + __memset_nram(rB, POTF_NB * calc_length, (float)ZERO); + __memset_nram(sA, calc_length * calc_length, (float)ZERO); + + float temp_b = 0, factor = 0; + + __memcpy_async(sA, dA, sizeof(float), GDRAM2NRAM); + + __memcpy(rBp, OFFSET_B_ROW(dB, start, 0), sizeof(float), GDRAM2NRAM, + sizeof(float), ldb * sizeof(float), span - 1); + __sync(); - __memcpy(rBp,OFFSET_B_ROW(dB,start,0),sizeof(float),GDRAM2NRAM,sizeof(float), ldb * sizeof(float), span - 1); + if (trans) { + __memcpy_async(rA, sA, (1) * sizeof(float), NRAM2NRAM); + __memcpy_async(rB, rBp, sizeof(float), NRAM2NRAM, + calc_length * sizeof(float), sizeof(float), span - 1); __sync(); - - if(trans) - { - __memcpy_async(rA,sA,(1)*sizeof(float),NRAM2NRAM); - __memcpy_async(rB,rBp,sizeof(float),NRAM2NRAM,calc_length * sizeof(float), sizeof(float), span - 1); - __sync(); - - __memcpy_async(sA,OFFSET_ROW(dA,1,0),2*sizeof(float),GDRAM2NRAM); - __memcpy_async(rBp,OFFSET_B_ROW(dB,start,1),sizeof(float),GDRAM2NRAM,sizeof(float), ldb * sizeof(float), span - 1); - factor = 1.0 / rA[0]; - for(int i = 0; i < span; i++) - { - rB[i*calc_length] *= factor; - } + __memcpy_async(sA, OFFSET_ROW(dA, 1, 0), 2 * sizeof(float), GDRAM2NRAM); + __memcpy_async(rBp, OFFSET_B_ROW(dB, start, 1), sizeof(float), GDRAM2NRAM, + sizeof(float), ldb * sizeof(float), span - 1); + factor = 1.0 / rA[0]; + for (int i = 0; i < span; i++) { + rB[i * calc_length] *= factor; + } - __sync(); - - for(int iter = 1; iter < m - 1; iter++) - { - __memcpy_async(rA,sA,(iter+1)*sizeof(float),NRAM2NRAM); - __memcpy_async(rB+iter,rBp,sizeof(float),NRAM2NRAM,calc_length * sizeof(float), sizeof(float), span - 1); - __sync(); - - __memcpy_async(sA,OFFSET_ROW(dA,iter+1,0),(iter+2)*sizeof(float),GDRAM2NRAM); - __memcpy_async(rBp,OFFSET_B_ROW(dB,start,iter+1),sizeof(float),GDRAM2NRAM,sizeof(float), ldb * sizeof(float), span - 1); - factor = 1.0 / rA[iter]; - for(int i = 0; i < span; i++) - { - __bang_mul(rC+i*calc_length,rA,rB+i*calc_length,iter); - temp_b = 0; - for(int j = 0; j < iter; j++) - { - temp_b += rC[i*calc_length+j]; - } - temp_b = rB[i*calc_length+iter] - temp_b; - rB[i*calc_length+iter] = temp_b * factor; - } - - __sync(); - } + __sync(); - __memcpy_async(rA,sA,(m)*sizeof(float),NRAM2NRAM); - __memcpy_async(rB+m-1,rBp,sizeof(float),NRAM2NRAM,calc_length * sizeof(float), sizeof(float), span - 1); - __sync(); - factor = 1.0 / rA[m-1]; - for(int i = 0; i < span; i++) - { - __bang_mul(rC+i*calc_length,rA,rB+i*calc_length,m-1); - - temp_b = 0; - for(int j = 0; j < m-1; j++) - { - temp_b += rC[i*calc_length+j]; - } - temp_b = rB[i*calc_length+m-1] - temp_b; - - rB[i*calc_length+m-1] = temp_b * factor; + for (int iter = 1; iter < m - 1; iter++) { + __memcpy_async(rA, sA, (iter + 1) * sizeof(float), NRAM2NRAM); + __memcpy_async(rB + iter, rBp, sizeof(float), NRAM2NRAM, + calc_length * sizeof(float), sizeof(float), span - 1); + __sync(); + + __memcpy_async(sA, OFFSET_ROW(dA, iter + 1, 0), + (iter + 2) * sizeof(float), GDRAM2NRAM); + __memcpy_async(rBp, OFFSET_B_ROW(dB, start, iter + 1), sizeof(float), + GDRAM2NRAM, sizeof(float), ldb * sizeof(float), span - 1); + factor = 1.0 / rA[iter]; + for (int i = 0; i < span; i++) { + __bang_mul(rC + i * calc_length, rA, rB + i * calc_length, iter); + temp_b = 0; + for (int j = 0; j < iter; j++) { + temp_b += rC[i * calc_length + j]; } - __sync(); - - - __memcpy(OFFSET_B_ROW(dB,start,0),rB,calc_length*sizeof(float),NRAM2GDRAM,ldb * sizeof(float), calc_length * sizeof(float), span - 1); - __sync(); + temp_b = rB[i * calc_length + iter] 
- temp_b; + rB[i * calc_length + iter] = temp_b * factor; + } + __sync(); } -} + __memcpy_async(rA, sA, (m) * sizeof(float), NRAM2NRAM); + __memcpy_async(rB + m - 1, rBp, sizeof(float), NRAM2NRAM, + calc_length * sizeof(float), sizeof(float), span - 1); + __sync(); + factor = 1.0 / rA[m - 1]; + for (int i = 0; i < span; i++) { + __bang_mul(rC + i * calc_length, rA, rB + i * calc_length, m - 1); + temp_b = 0; + for (int j = 0; j < m - 1; j++) { + temp_b += rC[i * calc_length + j]; + } + temp_b = rB[i * calc_length + m - 1] - temp_b; -__mlu_entry__ void mlu_strsm_rectile_kernel( - int batch, int stride, - int m,int n, bool trans, - float *dA, int32_t lda, - float *dB, int32_t ldb) -{ - int id = taskId; - int batch_id = id / 4; - if(batch_id >= batch) - return; - id = id % 4; - float* orignA = dA; - float* orignB = dB; - dA = orignA + batch_id * stride; - dB = orignB + batch_id * stride; - - - - int span = n / 4; - int start = id * span; - if(id == 3) - { - span = n - 3 * span; + rB[i * calc_length + m - 1] = temp_b * factor; } + __sync(); - bool if_execute = span > 0; - float* sA = (float*)sram_buffer; - - float* rB = (float*)nram_buffer; - float* rC = rB + 4*POTF_NB * 8*POTF_NB; - float* rBp = rC + 4*POTF_NB * 8*POTF_NB; - float* rA = rBp + 4*POTF_NB; - int calc_length = (8 * POTF_NB) > m ? m : (8 * POTF_NB); - __memset_nram(rB,POTF_NB*calc_length,(float)ZERO); + __memcpy(OFFSET_B_ROW(dB, start, 0), rB, calc_length * sizeof(float), + NRAM2GDRAM, ldb * sizeof(float), calc_length * sizeof(float), + span - 1); + __sync(); + } +} +__mlu_entry__ void mlu_strsm_rectile_kernel(int batch, int stride, int m, int n, + bool trans, float* dA, int32_t lda, + float* dB, int32_t ldb) { + int id = taskId; + int batch_id = id / 4; + if (batch_id >= batch) return; + id = id % 4; + float* orignA = dA; + float* orignB = dB; + dA = orignA + batch_id * stride; + dB = orignB + batch_id * stride; + + int span = n / 4; + int start = id * span; + if (id == 3) { + span = n - 3 * span; + } + bool if_execute = span > 0; + float* sA = (float*)sram_buffer; + + float* rB = (float*)nram_buffer; + float* rC = rB + 4 * POTF_NB * 8 * POTF_NB; + float* rBp = rC + 4 * POTF_NB * 8 * POTF_NB; + float* rA = rBp + 4 * POTF_NB; + int calc_length = (8 * POTF_NB) > m ? 
m : (8 * POTF_NB); + __memset_nram(rB, POTF_NB * calc_length, (float)ZERO); + + float temp_b = 0, factor = 0; + float sum = 0.0; + float c = 0.0; + float t = 0.0; + sum = 0.0; + c = 0.0; + t = 0.0; + temp_b = 0; + factor = 0; - float temp_b = 0, factor = 0; - float sum = 0.0; - float c = 0.0; - float t = 0.0; - sum = 0.0; - c = 0.0; - t =0.0; - temp_b = 0; - factor = 0; - + if (id == 0) { + __memcpy(sA, dA, sizeof(float), LDRAM2SRAM); + } + if (if_execute) + __memcpy(rBp, OFFSET_B_ROW(dB, start, 0), sizeof(float), LDRAM2NRAM, + sizeof(float), ldb * sizeof(float), span - 1); + __sync_cluster(); - if(id == 0) - { - __memcpy(sA,dA,sizeof(float),LDRAM2SRAM); - } - if(if_execute) - __memcpy(rBp,OFFSET_B_ROW(dB,start,0),sizeof(float),LDRAM2NRAM,sizeof(float), ldb * sizeof(float), span - 1); + if (trans) { + __memcpy_async(rA, sA, (1) * sizeof(float), SRAM2NRAM); + if (if_execute) + __memcpy_async(rB, rBp, sizeof(float), NRAM2NRAM, + calc_length * sizeof(float), sizeof(float), span - 1); __sync_cluster(); + if (id == 0) { + __memcpy_async(sA, OFFSET_ROW(dA, 1, 0), 2 * sizeof(float), LDRAM2SRAM); + } + if (if_execute) + __memcpy_async(rBp, OFFSET_B_ROW(dB, start, 1), sizeof(float), LDRAM2NRAM, + sizeof(float), ldb * sizeof(float), span - 1); + factor = 1.0 / rA[0]; + for (int i = 0; i < span; i++) { + rB[i * calc_length] *= factor; + } - if(trans) - { - __memcpy_async(rA,sA,(1)*sizeof(float),SRAM2NRAM); - if(if_execute) - __memcpy_async(rB,rBp,sizeof(float),NRAM2NRAM,calc_length * sizeof(float), sizeof(float), span - 1); - __sync_cluster(); - if(id == 0) - { - __memcpy_async(sA,OFFSET_ROW(dA,1,0),2*sizeof(float),LDRAM2SRAM); - } - if(if_execute) - __memcpy_async(rBp,OFFSET_B_ROW(dB,start,1),sizeof(float),LDRAM2NRAM,sizeof(float), ldb * sizeof(float), span - 1); - factor = 1.0 / rA[0]; - for(int i = 0; i < span; i++) - { - rB[i*calc_length] *= factor; - } - - __sync_cluster(); - - for(int iter = 1; iter < m - 1; iter++) - { - __memcpy_async(rA,sA,(iter+1)*sizeof(float),SRAM2NRAM); - if(if_execute) - __memcpy_async(rB+iter,rBp,sizeof(float),NRAM2NRAM,calc_length * sizeof(float), sizeof(float), span - 1); - __sync_cluster(); - if(id == 0) - { - __memcpy_async(sA,OFFSET_ROW(dA,iter+1,0),(iter+2)*sizeof(float),LDRAM2SRAM); - } - if(if_execute) - __memcpy_async(rBp,OFFSET_B_ROW(dB,start,iter+1),sizeof(float),LDRAM2NRAM,sizeof(float), ldb * sizeof(float), span - 1); - factor = 1.0 / rA[iter]; - for(int i = 0; i < span; i++) - { - __bang_mul(rC+i*calc_length,rA,rB+i*calc_length,iter); - temp_b = 0; - sum = 0.0; - c = 0.0; - t = 0.0; - - for(int j = 0; j < iter; j++) - { - temp_b = rC[i*calc_length+j] - c; - t = sum + temp_b; - c = (t - sum) - temp_b; - sum = t; - } - temp_b = sum; - temp_b = rB[i*calc_length+iter] - temp_b; - rB[i*calc_length+iter] = temp_b * factor; - } - - __sync_cluster(); - } + __sync_cluster(); - __memcpy_async(rA,sA,(m)*sizeof(float),SRAM2NRAM); - if(if_execute) - __memcpy_async(rB+m-1,rBp,sizeof(float),NRAM2NRAM,calc_length * sizeof(float), sizeof(float), span - 1); - __sync_cluster(); - factor = 1.0 / rA[m-1]; - for(int i = 0; i < span; i++) - { - __bang_mul(rC+i*calc_length,rA,rB+i*calc_length,m-1); - - sum = 0.0; - c = 0.0; - t = 0.0; - temp_b = 0; - - for(int j = 0; j < m-1; j++) - { - temp_b = rC[i*calc_length+j] - c; - t = sum + temp_b; - c = (t - sum) - temp_b; - sum = t; - } - temp_b = sum; - temp_b = rB[i*calc_length+m-1] - temp_b; - rB[i*calc_length+m-1] = temp_b * factor; + for (int iter = 1; iter < m - 1; iter++) { + __memcpy_async(rA, sA, (iter + 1) * sizeof(float), 
SRAM2NRAM); + if (if_execute) + __memcpy_async(rB + iter, rBp, sizeof(float), NRAM2NRAM, + calc_length * sizeof(float), sizeof(float), span - 1); + __sync_cluster(); + if (id == 0) { + __memcpy_async(sA, OFFSET_ROW(dA, iter + 1, 0), + (iter + 2) * sizeof(float), LDRAM2SRAM); + } + if (if_execute) + __memcpy_async(rBp, OFFSET_B_ROW(dB, start, iter + 1), sizeof(float), + LDRAM2NRAM, sizeof(float), ldb * sizeof(float), + span - 1); + factor = 1.0 / rA[iter]; + for (int i = 0; i < span; i++) { + __bang_mul(rC + i * calc_length, rA, rB + i * calc_length, iter); + temp_b = 0; + sum = 0.0; + c = 0.0; + t = 0.0; + + for (int j = 0; j < iter; j++) { + temp_b = rC[i * calc_length + j] - c; + t = sum + temp_b; + c = (t - sum) - temp_b; + sum = t; } - __sync_cluster(); - + temp_b = sum; + temp_b = rB[i * calc_length + iter] - temp_b; + rB[i * calc_length + iter] = temp_b * factor; + } - if(if_execute) - { - __memcpy(OFFSET_B_ROW(dB,start,0),rB,calc_length*sizeof(float),NRAM2LDRAM,ldb * sizeof(float), calc_length * sizeof(float), span - 1); - } - __sync_cluster(); + __sync_cluster(); + } + __memcpy_async(rA, sA, (m) * sizeof(float), SRAM2NRAM); + if (if_execute) + __memcpy_async(rB + m - 1, rBp, sizeof(float), NRAM2NRAM, + calc_length * sizeof(float), sizeof(float), span - 1); + __sync_cluster(); + factor = 1.0 / rA[m - 1]; + for (int i = 0; i < span; i++) { + __bang_mul(rC + i * calc_length, rA, rB + i * calc_length, m - 1); + + sum = 0.0; + c = 0.0; + t = 0.0; + temp_b = 0; + + for (int j = 0; j < m - 1; j++) { + temp_b = rC[i * calc_length + j] - c; + t = sum + temp_b; + c = (t - sum) - temp_b; + sum = t; + } + temp_b = sum; + temp_b = rB[i * calc_length + m - 1] - temp_b; + rB[i * calc_length + m - 1] = temp_b * factor; } + __sync_cluster(); + if (if_execute) { + __memcpy(OFFSET_B_ROW(dB, start, 0), rB, calc_length * sizeof(float), + NRAM2LDRAM, ldb * sizeof(float), calc_length * sizeof(float), + span - 1); + } + __sync_cluster(); + } } -mluOpStatus_t strsm_rectile(int batch, int stride, bool upper, bool trans, int m, int n, float *d_a, int lda, float *d_b, int lddb, cnrtQueue_t queue) -{ - cnrtDim3_t dim; - - cnrtFunctionType_t func_type = CNRT_FUNC_TYPE_BLOCK; - - dim.y = 1; - dim.z = 1; - - if(batch>16) - { - dim.x = batch; - KERNEL_CHECK( - mlu_strsm_rectile_batch_kernel<<>>(batch,stride,m,n,trans,d_a,lda,d_b,lddb)); +mluOpStatus_t strsm_rectile(int batch, int stride, bool upper, bool trans, + int m, int n, float* d_a, int lda, float* d_b, + int lddb, cnrtQueue_t queue) { + cnrtDim3_t dim; + + cnrtFunctionType_t func_type = CNRT_FUNC_TYPE_BLOCK; + + dim.y = 1; + dim.z = 1; + + if (batch > 16) { + dim.x = batch; + KERNEL_CHECK(mlu_strsm_rectile_batch_kernel<<>>( + batch, stride, m, n, trans, d_a, lda, d_b, lddb)); + } else { + int carry_batch = batch; + if (batch == 1) { + func_type = CNRT_FUNC_TYPE_UNION1; + } else if (batch == 2) { + func_type = CNRT_FUNC_TYPE_UNION2; + } else if (batch <= 4) { + func_type = CNRT_FUNC_TYPE_UNION4; + carry_batch = 4; + } else { + func_type = CNRT_FUNC_TYPE_UNION8; + carry_batch = batch < 8 ? 8 : batch; + if (batch <= 8) { + carry_batch = 8; + } else if (batch <= 16) { + carry_batch = 16; + } else { + carry_batch = 32; + } } - else - { - int carry_batch = batch; - if(batch == 1) - { - func_type = CNRT_FUNC_TYPE_UNION1; - } - else if(batch == 2) - { - func_type = CNRT_FUNC_TYPE_UNION2; - } - else if(batch <= 4) - { - func_type = CNRT_FUNC_TYPE_UNION4; - carry_batch = 4; - } - else - { - func_type = CNRT_FUNC_TYPE_UNION8; - carry_batch = batch < 8 ? 
8 : batch; - if(batch <= 8) - { - carry_batch = 8; - } - else if(batch <= 16) - { - carry_batch = 16; - } - else - { - carry_batch = 32; - - } - } - dim.x = carry_batch * 4; + dim.x = carry_batch * 4; - if(!upper && trans) - { - KERNEL_CHECK( - mlu_strsm_rectile_kernel<<>>(batch,stride,m,n,trans,d_a,lda,d_b,lddb)); - } + if (!upper && trans) { + KERNEL_CHECK(mlu_strsm_rectile_kernel<<>>( + batch, stride, m, n, trans, d_a, lda, d_b, lddb)); } + } - - return MLUOP_STATUS_SUCCESS; + return MLUOP_STATUS_SUCCESS; } +__mlu_global__ void add_c_batch(int batch, int stride, float beta, float* d_c, + float* src, int ldc, int ldsrc, int m, int n) { + int id = taskId; + int batch_id = id; + if (batch_id >= batch) return; + float* orignC = d_c; + float* orignSrc = src; + d_c = orignC + batch_id * stride; + src = orignSrc + batch_id * m * n; + + if (beta == 0.0f) { + __memcpy(d_c, src, n * sizeof(float), GDRAM2GDRAM, ldc * sizeof(float), + ldsrc * sizeof(float), m - 1); + return; + } + float* a_sram = (float*)nram_buffer + m * n; - -__mlu_global__ -void add_c_batch(int batch, int stride, float beta, float *d_c, float* src,int ldc, int ldsrc, int m, int n) -{ - - int id = taskId; - int batch_id = id; - if(batch_id >= batch) - return; - float* orignC = d_c; - float* orignSrc = src; - d_c = orignC + batch_id * stride; - src = orignSrc + batch_id * m*n; - - - if (beta == 0.0f) - { - - - __memcpy(d_c,src,n*sizeof(float),GDRAM2GDRAM,ldc*sizeof(float),ldsrc*sizeof(float),m-1); - return; - } - - float* a_sram = (float*)nram_buffer + m * n; - - __memcpy(nram_buffer,d_c,n*sizeof(float),LDRAM2NRAM,n*sizeof(float),ldc*sizeof(float),m-1); - __memcpy(a_sram,src,n*m*sizeof(float),LDRAM2NRAM); + __memcpy(nram_buffer, d_c, n * sizeof(float), LDRAM2NRAM, n * sizeof(float), + ldc * sizeof(float), m - 1); + __memcpy(a_sram, src, n * m * sizeof(float), LDRAM2NRAM); __sync(); + int32_t data_num = m * n; + const float* a_offset = a_sram; + const float* b_offset = (float*)nram_buffer; - int32_t data_num = m*n; - const float *a_offset = a_sram; - const float *b_offset = (float*)nram_buffer; - - float *a_nram = (float *)a_offset; - float *b_nram = (float *)b_offset; + float* a_nram = (float*)a_offset; + float* b_nram = (float*)b_offset; __bang_add(b_nram, a_nram, b_nram, data_num); - __memcpy(d_c,b_nram,n*sizeof(float),NRAM2LDRAM,ldc*sizeof(float),n*sizeof(float),m-1); - + __memcpy(d_c, b_nram, n * sizeof(float), NRAM2LDRAM, ldc * sizeof(float), + n * sizeof(float), m - 1); __sync(); - } -__mlu_global__ -void add_c(int batch, int stride, float beta, float *d_c, float* src,int ldc, int ldsrc, int m, int n) -{ - - int id = taskId; - int ipu_per_cluster = 4; - int batch_id = id / ipu_per_cluster; - if(batch_id >= batch) - return; - id = taskId % ipu_per_cluster; - float* orignC = d_c; - float* orignSrc = src; - d_c = orignC + batch_id * stride; - src = orignSrc + batch_id * m*n; - - if (beta == 0.0f) - { - if(id == 0) - { - __memcpy(sram_buffer,src,n*sizeof(float),GDRAM2SRAM,n*sizeof(float),ldsrc*sizeof(float),m-1); - - } - __sync_cluster(); - if(id == 0) - { - __memcpy(d_c,sram_buffer,n*sizeof(float),SRAM2LDRAM,ldc*sizeof(float),n*sizeof(float),m-1); - } - __sync_cluster(); - return; +__mlu_global__ void add_c(int batch, int stride, float beta, float* d_c, + float* src, int ldc, int ldsrc, int m, int n) { + int id = taskId; + int ipu_per_cluster = 4; + int batch_id = id / ipu_per_cluster; + if (batch_id >= batch) return; + id = taskId % ipu_per_cluster; + float* orignC = d_c; + float* orignSrc = src; + d_c = orignC + batch_id * 
stride; + src = orignSrc + batch_id * m * n; + + if (beta == 0.0f) { + if (id == 0) { + __memcpy(sram_buffer, src, n * sizeof(float), GDRAM2SRAM, + n * sizeof(float), ldsrc * sizeof(float), m - 1); } - - float* a_sram = (float*)sram_buffer + 3* m * n; - + __sync_cluster(); if (id == 0) { - __memcpy(sram_buffer,d_c,n*sizeof(float),GDRAM2SRAM,n*sizeof(float),ldc*sizeof(float),m-1); - __memcpy(a_sram,src,n*m*sizeof(float),GDRAM2SRAM); + __memcpy(d_c, sram_buffer, n * sizeof(float), SRAM2LDRAM, + ldc * sizeof(float), n * sizeof(float), m - 1); } + __sync_cluster(); + return; + } - __sync_cluster(); + float* a_sram = (float*)sram_buffer + 3 * m * n; + if (id == 0) { + __memcpy(sram_buffer, d_c, n * sizeof(float), GDRAM2SRAM, n * sizeof(float), + ldc * sizeof(float), m - 1); + __memcpy(a_sram, src, n * m * sizeof(float), GDRAM2SRAM); + } + + __sync_cluster(); - int32_t data_num = m*n; + int32_t data_num = m * n; int32_t data_per_core = data_num / ipu_per_cluster; int32_t data_last_core = data_per_core + data_num % ipu_per_cluster; - const float *a_offset = a_sram + id * data_per_core; - const float *b_offset = (float*)sram_buffer + id * data_per_core; - float *output_offset = (float*)sram_buffer + id * data_per_core; + const float* a_offset = a_sram + id * data_per_core; + const float* b_offset = (float*)sram_buffer + id * data_per_core; + float* output_offset = (float*)sram_buffer + id * data_per_core; if (id == ipu_per_cluster - 1) { data_per_core = data_last_core; @@ -1030,9 +854,9 @@ void add_c(int batch, int stride, float beta, float *d_c, float* src,int ldc, in int32_t align_num = NFU_ALIGN_SIZE / sizeof(float); int32_t data_nram_num = - MAX_NRAM_SIZE / sizeof(float) / 2 / align_num * align_num; - float *a_nram = (float *)nram_buffer; - float *b_nram = (float *)a_nram + data_nram_num; + MAX_NRAM_SIZE / sizeof(float) / 2 / align_num * align_num; + float* a_nram = (float*)nram_buffer; + float* b_nram = (float*)a_nram + data_nram_num; int32_t loop_num = data_per_core / data_nram_num; int32_t rem_nram_num = data_per_core % data_nram_num; @@ -1047,7 +871,7 @@ void add_c(int batch, int stride, float beta, float *d_c, float* src,int ldc, in } if (rem_nram_num != 0) { int32_t rem_align_num = - (rem_nram_num + align_num - 1) / align_num * align_num; + (rem_nram_num + align_num - 1) / align_num * align_num; __memcpy(a_nram, a_offset + loop_num * data_nram_num, rem_nram_num * sizeof(float), SRAM2NRAM); __memcpy(b_nram, b_offset + loop_num * data_nram_num, @@ -1059,526 +883,470 @@ void add_c(int batch, int stride, float beta, float *d_c, float* src,int ldc, in __sync_cluster(); if (id == 0) { - __memcpy(d_c,sram_buffer,n*sizeof(float),SRAM2GDRAM,ldc*sizeof(float),n*sizeof(float),m-1); - + __memcpy(d_c, sram_buffer, n * sizeof(float), SRAM2GDRAM, + ldc * sizeof(float), n * sizeof(float), m - 1); } __sync_cluster(); - } - - -mluOpStatus_t sgemm(int batch, bool trans_a, bool trans_b, int m, int n, int k, float alpha, float beta, float* d_a,int lda, int stride_a, float* d_b, int ldb, int stride_b, float* d_c, int ldc, int stride_c, mluOpHandle_t handle, float* workspace) -{ - if(k==0) - return MLUOP_STATUS_SUCCESS; - - int32_t batch_size_arr[1] = {batch}; - int64_t stride_a_arr[1] = {stride_a}; - int64_t stride_b_arr[1] = {stride_b}; - int64_t stride_c_arr[1] = {stride_c}; - - std::string api_name = "Cholesky"; - - - cnrtQueue_t queue; - mluOpGetQueue(handle,&queue); - - cnnlStrideBatchMatMulAlgo_t algo; - CALL_CNNL(cnnlStrideBatchMatMulAlgoCreate(&algo)); - - cnnlStrideBatchMatMulHeuristicResult_t 
heuristic_result; - CALL_CNNL(cnnlCreateStrideBatchMatMulHeuristicResult(&heuristic_result)); - - - - cnnlStrideBatchMatMulDescriptor_t stride_bmm_desc; - CALL_CNNL(cnnlStrideBatchMatMulDescCreate(&stride_bmm_desc)); - int32_t allow_tf32 = 0, max_batch_dim = 1; - CALL_CNNL(cnnlSetStrideBatchMatMulDescAttr(stride_bmm_desc, CNNL_STRIDE_BMM_ALLOW_TF32, - &(allow_tf32), sizeof(int32_t))); - CALL_CNNL(cnnlSetStrideBatchMatMulDescAttr(stride_bmm_desc, CNNL_STRIDE_BMM_MAX_BATCH_DIM, - &(max_batch_dim), sizeof(int32_t))); - - - - mluOpTensorDescriptor_t matmul_a_desc, matmul_b_desc, matmul_c_desc; - - - - CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&matmul_a_desc)); - CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&matmul_b_desc));; - CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&matmul_c_desc)); - - int32_t matmul_a_shape[2] = {batch, stride_a}; - int32_t matmul_b_shape[2] = {batch, stride_b}; - int32_t matmul_c_shape[2] = {batch, stride_c}; - - CHECK_RETURN(api_name, mluOpSetTensorDescriptor( - matmul_a_desc, MLUOP_LAYOUT_ARRAY, - MLUOP_DTYPE_FLOAT, 2, matmul_a_shape)); - CHECK_RETURN(api_name, mluOpSetTensorDescriptor( - matmul_b_desc, MLUOP_LAYOUT_ARRAY, - MLUOP_DTYPE_FLOAT, 2, matmul_b_shape)); - CHECK_RETURN(api_name, mluOpSetTensorDescriptor( - matmul_c_desc, MLUOP_LAYOUT_ARRAY, - MLUOP_DTYPE_FLOAT, 2, matmul_c_shape)); - - - - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_a_desc, cnnl_a_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_b_desc, cnnl_b_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_c_desc, cnnl_c_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_c_desc, cnnl_d_desc); - - - int requested_algo_count = 1, return_algo_count = 0; - size_t workspace_size; - - - cnnlGetStrideBatchMatMulAlgoHeuristic( - cnnl_handle, stride_bmm_desc, cnnl_a_desc, cnnl_b_desc, cnnl_c_desc, cnnl_d_desc, trans_a, trans_b, false, - &(alpha), &(beta), m, n, k, lda, ldb, ldc, batch_size_arr, stride_a_arr, stride_b_arr, - stride_c_arr, nullptr, requested_algo_count, &heuristic_result, &return_algo_count); - - cnnlGetStrideBatchMatMulHeuristicResult(heuristic_result, &algo, &workspace_size); - - - if(workspace_size > 0) - { - printf("sgemm workspace size:%zu\n",workspace_size); - } - - - - CALL_CNNL(cnnlStrideBatchMatMul_v2( - cnnl_handle, stride_bmm_desc, algo, trans_a, trans_b, false, m, n, k, batch_size_arr, &(alpha), - cnnl_a_desc, d_a, lda, stride_a_arr, - cnnl_b_desc, d_b, ldb, stride_b_arr, &(beta), cnnl_c_desc, d_c, ldc, - stride_c_arr, cnnl_d_desc, d_c, workspace, workspace_size)); - +mluOpStatus_t sgemm(int batch, bool trans_a, bool trans_b, int m, int n, int k, + float alpha, float beta, float* d_a, int lda, int stride_a, + float* d_b, int ldb, int stride_b, float* d_c, int ldc, + int stride_c, mluOpHandle_t handle, float* workspace) { + if (k == 0) return MLUOP_STATUS_SUCCESS; + + int32_t batch_size_arr[1] = {batch}; + int64_t stride_a_arr[1] = {stride_a}; + int64_t stride_b_arr[1] = {stride_b}; + int64_t stride_c_arr[1] = {stride_c}; + + std::string api_name = "Cholesky"; + + cnrtQueue_t queue; + mluOpGetQueue(handle, &queue); + + cnnlStrideBatchMatMulAlgo_t algo; + CALL_CNNL(cnnlStrideBatchMatMulAlgoCreate(&algo)); + + cnnlStrideBatchMatMulHeuristicResult_t heuristic_result; + CALL_CNNL(cnnlCreateStrideBatchMatMulHeuristicResult(&heuristic_result)); + + cnnlStrideBatchMatMulDescriptor_t stride_bmm_desc; + CALL_CNNL(cnnlStrideBatchMatMulDescCreate(&stride_bmm_desc)); + int32_t 
allow_tf32 = 0, max_batch_dim = 1; + CALL_CNNL(cnnlSetStrideBatchMatMulDescAttr(stride_bmm_desc, + CNNL_STRIDE_BMM_ALLOW_TF32, + &(allow_tf32), sizeof(int32_t))); + CALL_CNNL(cnnlSetStrideBatchMatMulDescAttr( + stride_bmm_desc, CNNL_STRIDE_BMM_MAX_BATCH_DIM, &(max_batch_dim), + sizeof(int32_t))); + + mluOpTensorDescriptor_t matmul_a_desc, matmul_b_desc, matmul_c_desc; + + CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&matmul_a_desc)); + CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&matmul_b_desc)); + ; + CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&matmul_c_desc)); + + int32_t matmul_a_shape[2] = {batch, stride_a}; + int32_t matmul_b_shape[2] = {batch, stride_b}; + int32_t matmul_c_shape[2] = {batch, stride_c}; + + CHECK_RETURN(api_name, + mluOpSetTensorDescriptor(matmul_a_desc, MLUOP_LAYOUT_ARRAY, + MLUOP_DTYPE_FLOAT, 2, matmul_a_shape)); + CHECK_RETURN(api_name, + mluOpSetTensorDescriptor(matmul_b_desc, MLUOP_LAYOUT_ARRAY, + MLUOP_DTYPE_FLOAT, 2, matmul_b_shape)); + CHECK_RETURN(api_name, + mluOpSetTensorDescriptor(matmul_c_desc, MLUOP_LAYOUT_ARRAY, + MLUOP_DTYPE_FLOAT, 2, matmul_c_shape)); + + DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); + DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_a_desc, cnnl_a_desc); + DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_b_desc, cnnl_b_desc); + DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_c_desc, cnnl_c_desc); + DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_c_desc, cnnl_d_desc); + + int requested_algo_count = 1, return_algo_count = 0; + size_t workspace_size; + + cnnlGetStrideBatchMatMulAlgoHeuristic( + cnnl_handle, stride_bmm_desc, cnnl_a_desc, cnnl_b_desc, cnnl_c_desc, + cnnl_d_desc, trans_a, trans_b, false, &(alpha), &(beta), m, n, k, lda, + ldb, ldc, batch_size_arr, stride_a_arr, stride_b_arr, stride_c_arr, + nullptr, requested_algo_count, &heuristic_result, &return_algo_count); + + cnnlGetStrideBatchMatMulHeuristicResult(heuristic_result, &algo, + &workspace_size); + + if (workspace_size > 0) { + printf("sgemm workspace size:%zu\n", workspace_size); + } - + CALL_CNNL(cnnlStrideBatchMatMul_v2( + cnnl_handle, stride_bmm_desc, algo, trans_a, trans_b, false, m, n, k, + batch_size_arr, &(alpha), cnnl_a_desc, d_a, lda, stride_a_arr, + cnnl_b_desc, d_b, ldb, stride_b_arr, &(beta), cnnl_c_desc, d_c, ldc, + stride_c_arr, cnnl_d_desc, d_c, workspace, workspace_size)); - return MLUOP_STATUS_SUCCESS; + return MLUOP_STATUS_SUCCESS; } -__mlu_global__ -void batch_inverse_kernel(int batch, float *d_input, int ld_input, int stride_input, float* d_output, int ld_output, int stride_output, int m) -{ - int id = taskId; - int batch_id = id; - if(batch_id >= batch) - return; - - float* orign_input = d_input; - float* orign_output = d_output; - d_input = orign_input + batch_id * stride_input; - d_output = orign_output + batch_id * stride_output; - - - float* nram_offset = (float*)nram_buffer; - float* nram_src0 = nram_offset; - float* nram_src1 = nram_src0 + m * m; - float* nram_src2 = nram_src1 + m * m; - float* mul_result = nram_src2 + m; - float* nram_dst = nram_src2 + m * m; - float* diag_start = nram_dst; - int height = m, span = m; - - __memset_nram(nram_offset, 4 * m * m, (float)ZERO); - - __memcpy(nram_dst,d_input,m*sizeof(float),GDRAM2NRAM,m*sizeof(float),ld_input*sizeof(float),m-1); - - float result = 0.0; - for(int i = 0; i < m; i++) - { - int off = i * m + i; - result = nram_dst[off]; - result = 1.0 / result; - nram_src1[i*height+i] = result; - nram_dst[i*span + i] = result; - diag_start[off] = result; - } 
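/* [Editorial sketch, not part of this patch] The batch_inverse_kernel added
 * below builds the inverse of a lower-triangular tile by forward substitution;
 * strsm() launches it on the two diagonal half-blocks (workspace1/workspace2)
 * so the triangular solve can then be finished with matrix multiplies. A
 * minimal host-side reference of the same recurrence, assuming a dense
 * row-major lower-triangular m x m input L (names here are illustrative only),
 * could read:
 */
#include <vector>

// X = L^{-1}:  X[i][i] = 1 / L[i][i]
//              X[i][j] = -X[i][i] * sum_{k=j}^{i-1} L[i][k] * X[k][j]   (i > j)
static std::vector<float> lower_triangular_inverse_ref(
    const std::vector<float>& L, int m) {
  std::vector<float> X(m * m, 0.0f);
  for (int i = 0; i < m; ++i) {
    X[i * m + i] = 1.0f / L[i * m + i];    // reciprocal of the diagonal entry
    for (int j = 0; j < i; ++j) {
      float acc = 0.0f;
      for (int k = j; k < i; ++k) {        // rows k < i are already inverted
        acc += L[i * m + k] * X[k * m + j];
      }
      X[i * m + j] = -X[i * m + i] * acc;  // scale by -1 / L[i][i]
    }
  }
  return X;
}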
+__mlu_global__ void batch_inverse_kernel(int batch, float* d_input, + int ld_input, int stride_input, + float* d_output, int ld_output, + int stride_output, int m) { + int id = taskId; + int batch_id = id; + if (batch_id >= batch) return; + + float* orign_input = d_input; + float* orign_output = d_output; + d_input = orign_input + batch_id * stride_input; + d_output = orign_output + batch_id * stride_output; + + float* nram_offset = (float*)nram_buffer; + float* nram_src0 = nram_offset; + float* nram_src1 = nram_src0 + m * m; + float* nram_src2 = nram_src1 + m * m; + float* mul_result = nram_src2 + m; + float* nram_dst = nram_src2 + m * m; + float* diag_start = nram_dst; + int height = m, span = m; + + __memset_nram(nram_offset, 4 * m * m, (float)ZERO); + + __memcpy(nram_dst, d_input, m * sizeof(float), GDRAM2NRAM, m * sizeof(float), + ld_input * sizeof(float), m - 1); + + float result = 0.0; + for (int i = 0; i < m; i++) { + int off = i * m + i; + result = nram_dst[off]; + result = 1.0 / result; + nram_src1[i * height + i] = result; + nram_dst[i * span + i] = result; + diag_start[off] = result; + } - for(int i = 1; i < height; i++) - { - __memcpy(nram_src2,diag_start+i*m,i*sizeof(float),NRAM2NRAM); - int num = std::min(i, span); - float diag_element = diag_start[i*m+i]; - for(int j = 0; j < num; j++) - { - float temp = 0.0; - __bang_mul(mul_result,nram_src2,nram_src1+j*height,i); - for(int k = 0; k< i; k++) - { - temp += mul_result[k]; - } - temp = temp * -1.0 * diag_element; - nram_dst[i*span+j] = temp; - nram_src1[j*height+i] = temp; - } - __sync(); - + for (int i = 1; i < height; i++) { + __memcpy(nram_src2, diag_start + i * m, i * sizeof(float), NRAM2NRAM); + int num = std::min(i, span); + float diag_element = diag_start[i * m + i]; + for (int j = 0; j < num; j++) { + float temp = 0.0; + __bang_mul(mul_result, nram_src2, nram_src1 + j * height, i); + for (int k = 0; k < i; k++) { + temp += mul_result[k]; + } + temp = temp * -1.0 * diag_element; + nram_dst[i * span + j] = temp; + nram_src1[j * height + i] = temp; } + __sync(); + } - __memcpy(d_output,nram_dst,m*sizeof(float),NRAM2GDRAM,ld_output*sizeof(float), m*sizeof(float),m-1); - - + __memcpy(d_output, nram_dst, m * sizeof(float), NRAM2GDRAM, + ld_output * sizeof(float), m * sizeof(float), m - 1); } -__mlu_global__ -void inverse_kernel(int batch, float *d_input, int ld_input, int stride_input, float* d_output, int ld_output, int stride_output, int m) -{ - int id = taskId; - int batch_id = id / 4; - if(batch_id >= batch) - return; - id = taskId % 4; - float* orignInput = d_input; - float* orignOutput = d_output; - d_input = orignInput + batch_id * stride_input; - d_output = orignOutput + batch_id * stride_output; - - - if (id == 0) { - __memcpy(sram_buffer,d_input,m*sizeof(float),GDRAM2SRAM,m*sizeof(float),ld_input*sizeof(float),m-1); - } - __sync_cluster(); - - - int span = m/taskDim; - int start = id * span; - if (id == 3) - { - span = m - 3 * span; - } - float* nram_offset = (float*)nram_buffer + id * 3 * m * m; - - float* nram_src1 = nram_offset; - float* nram_src2 = nram_src1 + m * m; - float* mul_result = nram_src2 + m; - float* nram_dst = nram_src2 + m * m; - float* diag_start = ((float*)sram_buffer) + m * start + start; - int height = m - start; - - __memset_nram(nram_offset, 3 * m * m, (float)ZERO); - - float result = 0.0; - for(int i = 0; i < span; i++) - { - int off = i * m + i; - result = diag_start[off]; - result = 1.0 / result; - nram_src1[i*height+i] = result; - nram_dst[i*span + i] = result; - diag_start[off] = 
result; +__mlu_global__ void inverse_kernel(int batch, float* d_input, int ld_input, + int stride_input, float* d_output, + int ld_output, int stride_output, int m) { + int id = taskId; + int batch_id = id / 4; + if (batch_id >= batch) return; + id = taskId % 4; + float* orignInput = d_input; + float* orignOutput = d_output; + d_input = orignInput + batch_id * stride_input; + d_output = orignOutput + batch_id * stride_output; - } - __sync_cluster(); + if (id == 0) { + __memcpy(sram_buffer, d_input, m * sizeof(float), GDRAM2SRAM, + m * sizeof(float), ld_input * sizeof(float), m - 1); + } + __sync_cluster(); + int span = m / taskDim; + int start = id * span; + if (id == 3) { + span = m - 3 * span; + } + float* nram_offset = (float*)nram_buffer + id * 3 * m * m; + + float* nram_src1 = nram_offset; + float* nram_src2 = nram_src1 + m * m; + float* mul_result = nram_src2 + m; + float* nram_dst = nram_src2 + m * m; + float* diag_start = ((float*)sram_buffer) + m * start + start; + int height = m - start; + + __memset_nram(nram_offset, 3 * m * m, (float)ZERO); + + float result = 0.0; + for (int i = 0; i < span; i++) { + int off = i * m + i; + result = diag_start[off]; + result = 1.0 / result; + nram_src1[i * height + i] = result; + nram_dst[i * span + i] = result; + diag_start[off] = result; + } + __sync_cluster(); - for(int i = 1; i < height; i++) - { - __memcpy(nram_src2,diag_start+i*m,i*sizeof(float),SRAM2NRAM); - int num = std::min(i, span); - float diag_element = diag_start[i*m+i]; - for(int j = 0; j < num; j++) - { - float temp = 0.0; - - __bang_mul(mul_result,nram_src2,nram_src1+j*height,i); - for(int k = 0; k< i; k++) - { - temp += mul_result[k]; - } - temp = temp * -1.0 * diag_element; - nram_dst[i*span+j] = temp; - nram_src1[j*height+i] = temp; - } - __sync(); - + for (int i = 1; i < height; i++) { + __memcpy(nram_src2, diag_start + i * m, i * sizeof(float), SRAM2NRAM); + int num = std::min(i, span); + float diag_element = diag_start[i * m + i]; + for (int j = 0; j < num; j++) { + float temp = 0.0; + + __bang_mul(mul_result, nram_src2, nram_src1 + j * height, i); + for (int k = 0; k < i; k++) { + temp += mul_result[k]; + } + temp = temp * -1.0 * diag_element; + nram_dst[i * span + j] = temp; + nram_src1[j * height + i] = temp; } + __sync(); + } - __sync_cluster(); - - if (span > 0) - __memcpy(diag_start,nram_dst,span*sizeof(float),NRAM2SRAM,m*sizeof(float),span*sizeof(float),height-1); + __sync_cluster(); - __sync_cluster(); - - if (id == 0) { - __memcpy(d_output,sram_buffer,m*sizeof(float),SRAM2GDRAM,ld_output*sizeof(float), m*sizeof(float),m-1); - } - + if (span > 0) + __memcpy(diag_start, nram_dst, span * sizeof(float), NRAM2SRAM, + m * sizeof(float), span * sizeof(float), height - 1); + __sync_cluster(); + if (id == 0) { + __memcpy(d_output, sram_buffer, m * sizeof(float), SRAM2GDRAM, + ld_output * sizeof(float), m * sizeof(float), m - 1); + } } -__mlu_global__ void set_zero(int batch, int stride, bool upper, int m, float* d_c, int lddc) -{ - int id = taskId; - int batch_id = id / 4; - if(batch_id >= batch) - return; - float* orignC = d_c; - d_c = orignC + batch_id * stride; - id = taskId % 4; - int span = m/4; - int pre = id * span; - float* start_c = d_c + pre * lddc + pre; - float* temp_c = start_c; - if (id == 3) - { - span = m - 3 * span; - - } - for(int i = 0; i < span - 1; i++) - { - temp_c = start_c + i * lddc + i; - int num = m - pre - i; - __ldramset(temp_c+1, num - 1, 0); - } - if (id != 3&&span > 0) - { - temp_c = start_c + (span - 1) * lddc + span - 1; - int num = m - 
pre - span + 1; - __ldramset(temp_c+1, num - 1, 0); - - } +__mlu_global__ void set_zero(int batch, int stride, bool upper, int m, + float* d_c, int lddc) { + int id = taskId; + int batch_id = id / 4; + if (batch_id >= batch) return; + float* orignC = d_c; + d_c = orignC + batch_id * stride; + id = taskId % 4; + int span = m / 4; + int pre = id * span; + float* start_c = d_c + pre * lddc + pre; + float* temp_c = start_c; + if (id == 3) { + span = m - 3 * span; + } + for (int i = 0; i < span - 1; i++) { + temp_c = start_c + i * lddc + i; + int num = m - pre - i; + __ldramset(temp_c + 1, num - 1, 0); + } + if (id != 3 && span > 0) { + temp_c = start_c + (span - 1) * lddc + span - 1; + int num = m - pre - span + 1; + __ldramset(temp_c + 1, num - 1, 0); + } } - - -mluOpStatus_t strsm(int batch, int stride, bool upper, bool trans, int m, int n, float* d_a, int lda, float* d_b, int ldb, mluOpHandle_t handle,float* workspace) -{ - if(n==0) - return MLUOP_STATUS_SUCCESS; - mluOpTensorDescriptor_t matmul_a_desc, matmul_b_desc, info_desc; - std::string api_name = "Cholesky"; - - cnrtQueue_t queue; - mluOpGetQueue(handle,&queue); - - - CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&matmul_a_desc)); - CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&matmul_b_desc)); - CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&info_desc)); - int32_t matmul_a_shape[2] = {batch, m*m}; - int32_t matmul_b_shape[2] = {batch, stride}; - int32_t info_shape[1] = {1}; - - CHECK_RETURN(api_name, mluOpSetTensorDescriptor( - matmul_a_desc, MLUOP_LAYOUT_ARRAY, - MLUOP_DTYPE_FLOAT, 2, matmul_a_shape)); - CHECK_RETURN(api_name, mluOpSetTensorDescriptor( - matmul_b_desc, MLUOP_LAYOUT_ARRAY, - MLUOP_DTYPE_FLOAT, 2, matmul_b_shape)); - CHECK_RETURN(api_name, mluOpSetTensorDescriptor( - info_desc, MLUOP_LAYOUT_ARRAY, - MLUOP_DTYPE_INT32, 1, info_shape)); - - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_a_desc, cnnl_a_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_b_desc, cnnl_b_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(info_desc, cnnl_info_desc); - - float* sgemm_workspace = workspace + batch * m * m; - CNRT_CHECK(cnrtMemset(workspace, 0.0, batch*m*m*sizeof(float))); - - float* h_i; - h_i = (float*)malloc(m*m*sizeof(float)); - - - int m1 = m/2; - int m2 = m - m1; - - float* workspace1 = workspace; - float* workspace2 = workspace1 + m1*m+m1; - - cnrtDim3_t dim; - dim.y = 1; - dim.z = 1; - cnrtFunctionType_t func_type = CNRT_FUNC_TYPE_BLOCK; - if(batch > 1) - { - dim.x = batch; - KERNEL_CHECK(batch_inverse_kernel<<>>(batch, d_a,lda,stride, workspace1,m,m*m,m1)); - KERNEL_CHECK(batch_inverse_kernel<<>>(batch, d_a+m1*lda+m1,lda,stride, workspace2,m,m*m,m2)); +mluOpStatus_t strsm(int batch, int stride, bool upper, bool trans, int m, int n, + float* d_a, int lda, float* d_b, int ldb, + mluOpHandle_t handle, float* workspace) { + if (n == 0) return MLUOP_STATUS_SUCCESS; + mluOpTensorDescriptor_t matmul_a_desc, matmul_b_desc, info_desc; + std::string api_name = "Cholesky"; + + cnrtQueue_t queue; + mluOpGetQueue(handle, &queue); + + CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&matmul_a_desc)); + CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&matmul_b_desc)); + CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&info_desc)); + int32_t matmul_a_shape[2] = {batch, m * m}; + int32_t matmul_b_shape[2] = {batch, stride}; + int32_t info_shape[1] = {1}; + + CHECK_RETURN(api_name, + mluOpSetTensorDescriptor(matmul_a_desc, 
MLUOP_LAYOUT_ARRAY, + MLUOP_DTYPE_FLOAT, 2, matmul_a_shape)); + CHECK_RETURN(api_name, + mluOpSetTensorDescriptor(matmul_b_desc, MLUOP_LAYOUT_ARRAY, + MLUOP_DTYPE_FLOAT, 2, matmul_b_shape)); + CHECK_RETURN(api_name, + mluOpSetTensorDescriptor(info_desc, MLUOP_LAYOUT_ARRAY, + MLUOP_DTYPE_INT32, 1, info_shape)); + + DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); + DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_a_desc, cnnl_a_desc); + DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_b_desc, cnnl_b_desc); + DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(info_desc, cnnl_info_desc); + + float* sgemm_workspace = workspace + batch * m * m; + CNRT_CHECK(cnrtMemset(workspace, 0.0, batch * m * m * sizeof(float))); + + float* h_i; + h_i = (float*)malloc(m * m * sizeof(float)); + + int m1 = m / 2; + int m2 = m - m1; + + float* workspace1 = workspace; + float* workspace2 = workspace1 + m1 * m + m1; + + cnrtDim3_t dim; + dim.y = 1; + dim.z = 1; + cnrtFunctionType_t func_type = CNRT_FUNC_TYPE_BLOCK; + if (batch > 1) { + dim.x = batch; + KERNEL_CHECK(batch_inverse_kernel<<>>( + batch, d_a, lda, stride, workspace1, m, m * m, m1)); + KERNEL_CHECK(batch_inverse_kernel<<>>( + batch, d_a + m1 * lda + m1, lda, stride, workspace2, m, m * m, m2)); + } else { + int carry_batch = batch; + if (batch == 1) { + func_type = CNRT_FUNC_TYPE_UNION1; + } else if (batch == 2) { + func_type = CNRT_FUNC_TYPE_UNION2; + } else if (batch <= 4) { + func_type = CNRT_FUNC_TYPE_UNION4; + carry_batch = 4; + } else { + func_type = CNRT_FUNC_TYPE_UNION8; + carry_batch = batch < 8 ? 8 : batch; } - else - { - int carry_batch = batch; - if(batch == 1) - { - func_type = CNRT_FUNC_TYPE_UNION1; - } - else if(batch == 2) - { - func_type = CNRT_FUNC_TYPE_UNION2; - } - else if(batch <= 4) - { - func_type = CNRT_FUNC_TYPE_UNION4; - carry_batch = 4; - } - else - { - func_type = CNRT_FUNC_TYPE_UNION8; - carry_batch = batch < 8 ? 
8 : batch; - } - dim.x = carry_batch * 4; - - - KERNEL_CHECK(inverse_kernel<<>>(batch, d_a,lda,stride, workspace1,m,m*m,m1)); - KERNEL_CHECK(inverse_kernel<<>>(batch, d_a+m1*lda+m1,lda,stride, workspace2,m,m*m,m2)); - - } - + dim.x = carry_batch * 4; - sgemm(batch, false,false,m2,m1,m1,1.0f,0.0f,d_a+m1*lda,lda,stride,workspace1,m,m*m,workspace1+m1*m,m,m*m,handle,sgemm_workspace); - sgemm(batch, false,false,m2,m2,m1,-1.0f,0.0f,workspace2,m,m*m,workspace1+m1*m,m,m*m,workspace1+m1*m,m,m*m,handle,sgemm_workspace); - cnrtQueueSync(queue); + KERNEL_CHECK(inverse_kernel<<>>( + batch, d_a, lda, stride, workspace1, m, m * m, m1)); + KERNEL_CHECK(inverse_kernel<<>>( + batch, d_a + m1 * lda + m1, lda, stride, workspace2, m, m * m, m2)); + } - cnnlStrideBatchMatMul(cnnl_handle, false, true, n,m, m, batch, 1.0, cnnl_b_desc, d_b, ldb, stride, cnnl_a_desc, workspace, m, m*m, 0.0f, cnnl_b_desc, d_b, ldb, stride); + sgemm(batch, false, false, m2, m1, m1, 1.0f, 0.0f, d_a + m1 * lda, lda, + stride, workspace1, m, m * m, workspace1 + m1 * m, m, m * m, handle, + sgemm_workspace); + sgemm(batch, false, false, m2, m2, m1, -1.0f, 0.0f, workspace2, m, m * m, + workspace1 + m1 * m, m, m * m, workspace1 + m1 * m, m, m * m, handle, + sgemm_workspace); + cnrtQueueSync(queue); + cnnlStrideBatchMatMul(cnnl_handle, false, true, n, m, m, batch, 1.0, + cnnl_b_desc, d_b, ldb, stride, cnnl_a_desc, workspace, + m, m * m, 0.0f, cnnl_b_desc, d_b, ldb, stride); - return MLUOP_STATUS_SUCCESS; + return MLUOP_STATUS_SUCCESS; } -mluOpStatus_t set_half_zero(int batch,int stride,float* d_a, int lda, int m, mluOpHandle_t handle) -{ - cnrtQueue_t queue; - mluOpGetQueue(handle,&queue); - cnrtDim3_t dim; - cnrtFunctionType_t func_type = CNRT_FUNC_TYPE_UNION1; - int carry_batch = 0; - if(batch == 1) - { - carry_batch = 1; - } - else if(batch == 2) - { - carry_batch = 2; - } - else if(batch <= 4) - { - carry_batch = 4; - } - else if(batch <= 8) - { - carry_batch = 8; - } - dim.x = 4 * batch; - dim.y = 1; - dim.z = 1; - KERNEL_CHECK(set_zero<<>>(batch, stride, false, m, d_a,lda)); - return MLUOP_STATUS_SUCCESS; +mluOpStatus_t set_half_zero(int batch, int stride, float* d_a, int lda, int m, + mluOpHandle_t handle) { + cnrtQueue_t queue; + mluOpGetQueue(handle, &queue); + cnrtDim3_t dim; + cnrtFunctionType_t func_type = CNRT_FUNC_TYPE_UNION1; + int carry_batch = 0; + if (batch == 1) { + carry_batch = 1; + } else if (batch == 2) { + carry_batch = 2; + } else if (batch <= 4) { + carry_batch = 4; + } else if (batch <= 8) { + carry_batch = 8; + } + dim.x = 4 * batch; + dim.y = 1; + dim.z = 1; + KERNEL_CHECK( + set_zero<<>>(batch, stride, false, m, d_a, lda)); + return MLUOP_STATUS_SUCCESS; } - -mluOpStatus_t ssyrk(int batch, int stride, bool upper, bool trans,int n, int k, float* d_a, int ldda, float* d_c, int lddc, mluOpHandle_t handle,float* workspace) -{ - if(k==0) - return MLUOP_STATUS_SUCCESS; - - sgemm(batch, false,true,n,n,k,-1.0f,1.0f,d_a,ldda,stride,d_a,ldda,stride,d_c,lddc,stride,handle,workspace); - cnrtQueue_t queue; - mluOpGetQueue(handle,&queue); - cnrtDim3_t dim; - cnrtFunctionType_t func_type = CNRT_FUNC_TYPE_UNION1; - int carry_batch = next_power_of_2(batch); - dim.x = carry_batch * 4; - dim.y = 1; - dim.z = 1; - KERNEL_CHECK(set_zero<<>>(batch, stride, upper, n, d_c,lddc)); - - - return MLUOP_STATUS_SUCCESS; +mluOpStatus_t ssyrk(int batch, int stride, bool upper, bool trans, int n, int k, + float* d_a, int ldda, float* d_c, int lddc, + mluOpHandle_t handle, float* workspace) { + if (k == 0) return MLUOP_STATUS_SUCCESS; + + 
sgemm(batch, false, true, n, n, k, -1.0f, 1.0f, d_a, ldda, stride, d_a, ldda, + stride, d_c, lddc, stride, handle, workspace); + cnrtQueue_t queue; + mluOpGetQueue(handle, &queue); + cnrtDim3_t dim; + cnrtFunctionType_t func_type = CNRT_FUNC_TYPE_UNION1; + int carry_batch = next_power_of_2(batch); + dim.x = carry_batch * 4; + dim.y = 1; + dim.z = 1; + KERNEL_CHECK( + set_zero<<>>(batch, stride, upper, n, d_c, lddc)); + + return MLUOP_STATUS_SUCCESS; } -mluOpStatus_t mlu_spotrf_rectile(int batch, int stride, bool trans, bool uplo, int n, int recnb, float* d_A, int lda, int gbstep, mluOpHandle_t handle, float* workspace) -{ - cnrtQueue_t queue; - mluOpGetQueue(handle,&queue); - if(n==0) - return MLUOP_STATUS_SUCCESS; - - if(n <=recnb) - { - mlu_spotf2_lpin(batch, stride, trans, uplo,n,lda,d_A,gbstep,queue); - } - else - { - int n1 = n/2; - int n2 = n-n1; - mlu_spotrf_rectile(batch,stride,trans,uplo,n1,recnb,OFFSET_ROW(d_A,0,0),lda,gbstep, handle,workspace); - strsm_rectile(batch, stride, uplo,trans,n1,n2,OFFSET_ROW(d_A,0,0),lda,OFFSET_ROW(d_A,n1,0),lda,queue); - ssyrk(batch,stride,uplo,trans,n2,n1,d_A+n1*lda,lda,OFFSET_ROW(d_A,n1,n1),lda,handle,workspace); - mlu_spotrf_rectile(batch,stride,trans,uplo,n2,recnb,OFFSET_ROW(d_A,n1,n1),lda,gbstep+n1,handle,workspace); - - - - } - return MLUOP_STATUS_SUCCESS; +mluOpStatus_t mlu_spotrf_rectile(int batch, int stride, bool trans, bool uplo, + int n, int recnb, float* d_A, int lda, + int gbstep, mluOpHandle_t handle, + float* workspace) { + cnrtQueue_t queue; + mluOpGetQueue(handle, &queue); + if (n == 0) return MLUOP_STATUS_SUCCESS; + + if (n <= recnb) { + mlu_spotf2_lpin(batch, stride, trans, uplo, n, lda, d_A, gbstep, queue); + } else { + int n1 = n / 2; + int n2 = n - n1; + mlu_spotrf_rectile(batch, stride, trans, uplo, n1, recnb, + OFFSET_ROW(d_A, 0, 0), lda, gbstep, handle, workspace); + strsm_rectile(batch, stride, uplo, trans, n1, n2, OFFSET_ROW(d_A, 0, 0), + lda, OFFSET_ROW(d_A, n1, 0), lda, queue); + ssyrk(batch, stride, uplo, trans, n2, n1, d_A + n1 * lda, lda, + OFFSET_ROW(d_A, n1, n1), lda, handle, workspace); + mlu_spotrf_rectile(batch, stride, trans, uplo, n2, recnb, + OFFSET_ROW(d_A, n1, n1), lda, gbstep + n1, handle, + workspace); + } + return MLUOP_STATUS_SUCCESS; } // m * n -mluOpStatus_t transpose(int batch, int m, int n, float* d_input,float* d_output, mluOpHandle_t handle,mluOpDataType_t type, float* workspace) -{ - if(m==0) - return MLUOP_STATUS_SUCCESS; - cnrtQueue_t queue; - mluOpGetQueue(handle,&queue); - - mluOpTensorDescriptor_t trans_input_desc, trans_output_desc; - std::string api_name = "Cholesky"; - const int input_dim = 3; - - CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&trans_input_desc)); - CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&trans_output_desc)); +mluOpStatus_t transpose(int batch, int m, int n, float* d_input, + float* d_output, mluOpHandle_t handle, + mluOpDataType_t type, float* workspace) { + if (m == 0) return MLUOP_STATUS_SUCCESS; + cnrtQueue_t queue; + mluOpGetQueue(handle, &queue); - int32_t transpose_input_shape[3] = {batch, m, n}; - int32_t transpose_output_shape[3] = {batch, n, m}; + mluOpTensorDescriptor_t trans_input_desc, trans_output_desc; + std::string api_name = "Cholesky"; + const int input_dim = 3; - CHECK_RETURN(api_name, mluOpSetTensorDescriptor( - trans_input_desc, MLUOP_LAYOUT_ARRAY, - type, 3, transpose_input_shape)); + CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&trans_input_desc)); + CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&trans_output_desc)); - 
CHECK_RETURN(api_name, mluOpSetTensorDescriptor( - trans_output_desc, MLUOP_LAYOUT_ARRAY, - type, 3, transpose_output_shape)); + int32_t transpose_input_shape[3] = {batch, m, n}; + int32_t transpose_output_shape[3] = {batch, n, m}; - int permute[3] = {0, 2, 1}; + CHECK_RETURN(api_name, + mluOpSetTensorDescriptor(trans_input_desc, MLUOP_LAYOUT_ARRAY, + type, 3, transpose_input_shape)); - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(trans_input_desc, cnnl_in_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(trans_output_desc, cnnl_out_desc); + CHECK_RETURN(api_name, + mluOpSetTensorDescriptor(trans_output_desc, MLUOP_LAYOUT_ARRAY, + type, 3, transpose_output_shape)); - cnnlTransposeDescriptor_t cnnl_trans_desc = NULL; + int permute[3] = {0, 2, 1}; - CALL_CNNL(cnnlCreateTransposeDescriptor(&cnnl_trans_desc)); + DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); + DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(trans_input_desc, cnnl_in_desc); + DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(trans_output_desc, + cnnl_out_desc); - CALL_CNNL(cnnlSetTransposeDescriptor(cnnl_trans_desc, input_dim, permute)); + cnnlTransposeDescriptor_t cnnl_trans_desc = NULL; - size_t size=0; + CALL_CNNL(cnnlCreateTransposeDescriptor(&cnnl_trans_desc)); - + CALL_CNNL(cnnlSetTransposeDescriptor(cnnl_trans_desc, input_dim, permute)); - CALL_CNNL(cnnlGetTransposeWorkspaceSize(cnnl_handle, cnnl_in_desc, cnnl_trans_desc, &size)); + size_t size = 0; + CALL_CNNL(cnnlGetTransposeWorkspaceSize(cnnl_handle, cnnl_in_desc, + cnnl_trans_desc, &size)); - if(size > 0ul) - { - printf("transpose2 need size: %zu\n",size); - } - - CALL_CNNL(cnnlTranspose_v2(cnnl_handle, cnnl_trans_desc, cnnl_in_desc, - d_input, cnnl_out_desc, d_output, - workspace, size)); - return MLUOP_STATUS_SUCCESS; + if (size > 0ul) { + printf("transpose2 need size: %zu\n", size); + } + CALL_CNNL(cnnlTranspose_v2(cnnl_handle, cnnl_trans_desc, cnnl_in_desc, + d_input, cnnl_out_desc, d_output, workspace, + size)); + return MLUOP_STATUS_SUCCESS; } - diff --git a/kernels/cholesky/complex_cholesky_union1.mlu b/kernels/cholesky/complex_cholesky_union1.mlu index d6b36498a..6d2b56349 100644 --- a/kernels/cholesky/complex_cholesky_union1.mlu +++ b/kernels/cholesky/complex_cholesky_union1.mlu @@ -1,478 +1,424 @@ #include "cholesky.h" -#define COMPLEX_OFFSET(A,off) (((float*)A) + (2 * (off))) +#define COMPLEX_OFFSET(A, off) (((float*)A) + (2 * (off))) #define COMPLEX_TYPE_SIZE ((2) * sizeof(float)) __nram__ uint8_t nram_buffer[MAX_NRAM_SIZE]; __mlu_shared__ uint8_t sram_buffer[MAX_SRAM_SIZE]; -__mlu_func__ -void small_cgemm(int m,int k, - float* rA0, float* iA0, const int lda, - int width, float* sram_buffer, float* dst) -{ - int id = taskId % 4; - int span = CPOTF_NB; - int finish = id * span; - int remain = m - finish; - bool if_execute = remain > 0; - span = (remain > CPOTF_NB||remain <= 0) ? 
CPOTF_NB : remain; - - - float* rC = dst + CPOTF_NB *CREC_NB; - float* iC = rC + CPOTF_NB *CREC_NB; - float* rA = iC + CPOTF_NB *CREC_NB; - float* iA = rA + CPOTF_NB *CREC_NB; - float* rp = iA + CPOTF_NB *CREC_NB; - float* ip = rp + CPOTF_NB *CREC_NB; - float* rB = ip + CPOTF_NB *CREC_NB; - float* iB = rB + CPOTF_NB *CREC_NB; - - float *srB = sram_buffer; //srB:shared_real_B - float *siB = srB + CPOTF_NB * CREC_NB; //siB:shared_imag_B - - float* rdst = dst; - float* idst = rdst + CPOTF_NB*CPOTF_NB; - - int total_length = k + width; - int loop_width = CPOTF_NB; - int b_height = std::min(width, CPOTF_NB); - - - if(if_execute) - { - int prefetch_width = std::min(loop_width, total_length); - __memcpy(rp,(rA0+finish*lda),prefetch_width*sizeof(float),GDRAM2NRAM,CPOTF_NB*sizeof(float),lda*sizeof(float),span-1); - __memcpy(ip,(iA0+finish*lda),prefetch_width*sizeof(float),GDRAM2NRAM,CPOTF_NB*sizeof(float),lda*sizeof(float),span-1); - } - __memset_nram(rC,CPOTF_NB*CREC_NB*2,(float)ZERO); - __sync_cluster(); - if(id == 0) - { - __memcpy(srB,rp,CPOTF_NB*CPOTF_NB*sizeof(float),NRAM2SRAM); - __memcpy(siB,ip,CPOTF_NB*CPOTF_NB*sizeof(float),NRAM2SRAM); - } +__mlu_func__ void small_cgemm(int m, int k, float* rA0, float* iA0, + const int lda, int width, float* sram_buffer, + float* dst) { + int id = taskId % 4; + int span = CPOTF_NB; + int finish = id * span; + int remain = m - finish; + bool if_execute = remain > 0; + span = (remain > CPOTF_NB || remain <= 0) ? CPOTF_NB : remain; + + float* rC = dst + CPOTF_NB * CREC_NB; + float* iC = rC + CPOTF_NB * CREC_NB; + float* rA = iC + CPOTF_NB * CREC_NB; + float* iA = rA + CPOTF_NB * CREC_NB; + float* rp = iA + CPOTF_NB * CREC_NB; + float* ip = rp + CPOTF_NB * CREC_NB; + float* rB = ip + CPOTF_NB * CREC_NB; + float* iB = rB + CPOTF_NB * CREC_NB; + + float* srB = sram_buffer; // srB:shared_real_B + float* siB = srB + CPOTF_NB * CREC_NB; // siB:shared_imag_B + + float* rdst = dst; + float* idst = rdst + CPOTF_NB * CPOTF_NB; + + int total_length = k + width; + int loop_width = CPOTF_NB; + int b_height = std::min(width, CPOTF_NB); + + if (if_execute) { + int prefetch_width = std::min(loop_width, total_length); + __memcpy(rp, (rA0 + finish * lda), prefetch_width * sizeof(float), + GDRAM2NRAM, CPOTF_NB * sizeof(float), lda * sizeof(float), + span - 1); + __memcpy(ip, (iA0 + finish * lda), prefetch_width * sizeof(float), + GDRAM2NRAM, CPOTF_NB * sizeof(float), lda * sizeof(float), + span - 1); + } + __memset_nram(rC, CPOTF_NB * CREC_NB * 2, (float)ZERO); + __sync_cluster(); + if (id == 0) { + __memcpy(srB, rp, CPOTF_NB * CPOTF_NB * sizeof(float), NRAM2SRAM); + __memcpy(siB, ip, CPOTF_NB * CPOTF_NB * sizeof(float), NRAM2SRAM); + } + __sync_cluster(); + float a1, a2, b1, b2; + for (int iter = 0; iter < k; iter += loop_width) { + __bang_move(rA, rp, CPOTF_NB * span * sizeof(float)); + __bang_move(iA, ip, CPOTF_NB * span * sizeof(float)); + __memcpy(rB, srB, CPOTF_NB * b_height * sizeof(float), SRAM2NRAM); + __memcpy(iB, siB, CPOTF_NB * b_height * sizeof(float), SRAM2NRAM); __sync_cluster(); - float a1,a2,b1,b2; - for(int iter = 0; iter < k; iter += loop_width) - { - __bang_move(rA,rp,CPOTF_NB * span*sizeof(float)); - __bang_move(iA,ip,CPOTF_NB * span*sizeof(float)); - __memcpy(rB,srB,CPOTF_NB*b_height*sizeof(float),SRAM2NRAM); - __memcpy(iB,siB,CPOTF_NB*b_height*sizeof(float),SRAM2NRAM); - __sync_cluster(); - if(if_execute) - { - int prefetch_width = std::min(loop_width, total_length-iter-loop_width); - 
__memcpy_async(rp,(rA0+finish*lda+iter+loop_width),prefetch_width*sizeof(float),GDRAM2NRAM,CPOTF_NB*sizeof(float),lda*sizeof(float),span-1); - __memcpy_async(ip,(iA0+finish*lda+iter+loop_width),prefetch_width*sizeof(float),GDRAM2NRAM,CPOTF_NB*sizeof(float),lda*sizeof(float),span-1); - } - for(int i = 0; i < span; i++) - { - for(int j = 0; j < b_height; j++) - { - for(int h = 0; h < loop_width; h++) - { - a1 = rA[(i*CPOTF_NB+h)]; - b1 = iA[(i*CPOTF_NB+h)]; - a2 = rB[(j*CPOTF_NB+h)]; - b2 = iB[(j*CPOTF_NB+h)]; - rC[(i*CPOTF_NB+j)] += (a1*a2+b1*b2); - iC[(i*CPOTF_NB+j)] += (a2*b1-a1*b2); - } - } - } - __sync_cluster(); - if(id == 0) - { - __memcpy(srB,rp,CPOTF_NB*b_height*sizeof(float),NRAM2SRAM); - __memcpy(siB,ip,CPOTF_NB*b_height*sizeof(float),NRAM2SRAM); + if (if_execute) { + int prefetch_width = + std::min(loop_width, total_length - iter - loop_width); + __memcpy_async(rp, (rA0 + finish * lda + iter + loop_width), + prefetch_width * sizeof(float), GDRAM2NRAM, + CPOTF_NB * sizeof(float), lda * sizeof(float), span - 1); + __memcpy_async(ip, (iA0 + finish * lda + iter + loop_width), + prefetch_width * sizeof(float), GDRAM2NRAM, + CPOTF_NB * sizeof(float), lda * sizeof(float), span - 1); + } + for (int i = 0; i < span; i++) { + for (int j = 0; j < b_height; j++) { + for (int h = 0; h < loop_width; h++) { + a1 = rA[(i * CPOTF_NB + h)]; + b1 = iA[(i * CPOTF_NB + h)]; + a2 = rB[(j * CPOTF_NB + h)]; + b2 = iB[(j * CPOTF_NB + h)]; + rC[(i * CPOTF_NB + j)] += (a1 * a2 + b1 * b2); + iC[(i * CPOTF_NB + j)] += (a2 * b1 - a1 * b2); } - __sync_cluster(); + } } - - __bang_sub(rp,rp,rC,CPOTF_NB * span); - __bang_sub(ip,ip,iC,CPOTF_NB * span); - - if(if_execute) - { - __memcpy(rdst,rp,span*CPOTF_NB*sizeof(float),NRAM2NRAM); - __memcpy(idst,ip,span*CPOTF_NB*sizeof(float),NRAM2NRAM); - } - if(id == 0) - { - __memcpy(sram_buffer,rp,span*CPOTF_NB*sizeof(float),NRAM2SRAM); - __memcpy(sram_buffer+CPOTF_NB*CPOTF_NB,ip,span*CPOTF_NB*sizeof(float),NRAM2SRAM); + __sync_cluster(); + if (id == 0) { + __memcpy(srB, rp, CPOTF_NB * b_height * sizeof(float), NRAM2SRAM); + __memcpy(siB, ip, CPOTF_NB * b_height * sizeof(float), NRAM2SRAM); } __sync_cluster(); -} + } + __bang_sub(rp, rp, rC, CPOTF_NB * span); + __bang_sub(ip, ip, iC, CPOTF_NB * span); + if (if_execute) { + __memcpy(rdst, rp, span * CPOTF_NB * sizeof(float), NRAM2NRAM); + __memcpy(idst, ip, span * CPOTF_NB * sizeof(float), NRAM2NRAM); + } + if (id == 0) { + __memcpy(sram_buffer, rp, span * CPOTF_NB * sizeof(float), NRAM2SRAM); + __memcpy(sram_buffer + CPOTF_NB * CPOTF_NB, ip, + span * CPOTF_NB * sizeof(float), NRAM2SRAM); + } + __sync_cluster(); +} -__mlu_func__ -void small_cminout(int m, int width, - float *dst, float *sram_buffer, int lda) -{ - float factor; - int id = taskId % 4; - int finish = id * CPOTF_NB; - int remain = m - finish; - bool if_execute = remain > 0; - int span = 2; - span = (remain > CPOTF_NB||remain <= 0) ? CPOTF_NB : remain; - float *rdst = dst; - float *idst = dst + CPOTF_NB*CPOTF_NB; - float *rdiag = idst + CPOTF_NB *CREC_NB; - float *idiag = rdiag + CPOTF_NB*CPOTF_NB; - - if(if_execute) - { - - __memcpy(rdiag,sram_buffer,width*CPOTF_NB*sizeof(float),SRAM2NRAM); - __memcpy(idiag,sram_buffer+CPOTF_NB*CPOTF_NB,width*CPOTF_NB*sizeof(float),SRAM2NRAM); - for(int iter = 0; iter < width; iter++) - { - factor = sqrt(rdiag[(iter * CPOTF_NB+iter)]); - factor = 1.0/factor; - for(int i = 0; i 0; + int span = 2; + span = (remain > CPOTF_NB || remain <= 0) ? 
CPOTF_NB : remain; + float* rdst = dst; + float* idst = dst + CPOTF_NB * CPOTF_NB; + float* rdiag = idst + CPOTF_NB * CREC_NB; + float* idiag = rdiag + CPOTF_NB * CPOTF_NB; + + if (if_execute) { + __memcpy(rdiag, sram_buffer, width * CPOTF_NB * sizeof(float), SRAM2NRAM); + __memcpy(idiag, sram_buffer + CPOTF_NB * CPOTF_NB, + width * CPOTF_NB * sizeof(float), SRAM2NRAM); + for (int iter = 0; iter < width; iter++) { + factor = sqrt(rdiag[(iter * CPOTF_NB + iter)]); + factor = 1.0 / factor; + for (int i = 0; i < width; i++) { + rdiag[(i * CPOTF_NB + iter)] *= factor; + idiag[(i * CPOTF_NB + iter)] *= factor; + + rdst[(i * CPOTF_NB + iter)] *= factor; + idst[(i * CPOTF_NB + iter)] *= factor; + } + + __sync(); + for (int i = iter + 1; i < width; i++) { + for (int j = 0; j < width; j++) { + float a1, b1, a2, b2, a3, b3; + a1 = rdst[(j * CPOTF_NB + iter)]; + b1 = idst[(j * CPOTF_NB + iter)]; + a2 = rdiag[(i * CPOTF_NB + iter)]; + b2 = idiag[(i * CPOTF_NB + iter)]; + a3 = rdiag[(j * CPOTF_NB + iter)]; + b3 = idiag[(j * CPOTF_NB + iter)]; + + rdst[(j * CPOTF_NB + i)] -= (a1 * a2 + b1 * b2); // a4 + idst[(j * CPOTF_NB + i)] -= (a2 * b1 - a1 * b2); // b4 + rdiag[(j * CPOTF_NB + i)] -= (a3 * a2 + b3 * b2); // a5 + idiag[(j * CPOTF_NB + i)] -= (a2 * b3 - a3 * b2); // b5 } + } } - __sync_cluster(); - + } + __sync_cluster(); } +__mlu_func__ void cmplout(int batch, const int m, float* rA0, float* rA, + float* iA0, float* iA, int lda, int localstep, + int width) { + int id = taskId % 4; + int finish = id * CPOTF_NB; + int remain = m - finish; + bool if_execute = remain > 0; + int span = (remain > CPOTF_NB || remain <= 0) ? CPOTF_NB : remain; + float* dst = (float*)nram_buffer; + small_cgemm(m, localstep, rA0, iA0, lda, width, (float*)sram_buffer, dst); + __sync_cluster(); + small_cminout(m, width, dst, (float*)sram_buffer, CPOTF_NB); -__mlu_func__ void cmplout(int batch, const int m, float *rA0, float *rA,float *iA0, float *iA, int lda, int localstep, int width) -{ - int id = taskId%4; - int finish = id * CPOTF_NB; - int remain = m - finish; - bool if_execute = remain > 0; - int span = (remain > CPOTF_NB||remain <= 0) ? 
CPOTF_NB : remain; - float* dst = (float*)nram_buffer; - small_cgemm(m, localstep, rA0, iA0, lda, width, (float*)sram_buffer, dst); + __sync_cluster(); - __sync_cluster(); + float* rdst = dst; + float* idst = dst + CPOTF_NB * CPOTF_NB; - + if (id == 0) { + for (int i = 0; i < width; i++) { + __memcpy((rA + (i * lda)), (rdst + (i * CPOTF_NB)), + (i + 1) * sizeof(float), NRAM2LDRAM); + __memcpy((iA + (i * lda)), (idst + (i * CPOTF_NB)), + (i + 1) * sizeof(float), NRAM2LDRAM); + } + + } else if (if_execute) { + __memcpy((rA + (finish * lda)), rdst, width * sizeof(float), NRAM2LDRAM, + lda * sizeof(float), CPOTF_NB * sizeof(float), span - 1); + __memcpy((iA + (finish * lda)), idst, width * sizeof(float), NRAM2LDRAM, + lda * sizeof(float), CPOTF_NB * sizeof(float), span - 1); + span = 0; + } + __sync_cluster(); +} - small_cminout(m, width, dst, (float*)sram_buffer, CPOTF_NB); +__mlu_func__ void small_cgemm_batch(int batch, int m, int k, float* rA0, + float* iA0, const int lda, int width, + float* r_dst, float* i_dst) { + int ldk = k; + int ldm = m; + + float* r_dst2 = i_dst + m * width; + float* i_dst2 = r_dst2 + m * width; + float* r_src1 = i_dst2 + m * width; + float* i_src1 = r_src1 + ldk * ldm; + float* r_src2 = i_src1 + ldk * ldm; + float* i_src2 = r_src2 + width * ldk; + + float* r_dA = rA0 + k; + float* i_dA = iA0 + k; + __memcpy_async(r_dst, r_dA, width * sizeof(float), GDRAM2NRAM, + width * sizeof(float), lda * sizeof(float), m - 1); + __memcpy_async(i_dst, i_dA, width * sizeof(float), GDRAM2NRAM, + width * sizeof(float), lda * sizeof(float), m - 1); + + if (k == 0) { + __sync(); - __sync_cluster(); + return; + } - float *rdst = dst; - float *idst = dst + CPOTF_NB*CPOTF_NB; - - if(id == 0) - { - for(int i = 0; i < width; i++) - { - __memcpy((rA+(i*lda)),(rdst+(i*CPOTF_NB)),(i+1)*sizeof(float),NRAM2LDRAM); - __memcpy((iA+(i*lda)),(idst+(i*CPOTF_NB)),(i+1)*sizeof(float),NRAM2LDRAM); - } - - } - else if(if_execute) - { - __memcpy((rA+(finish*lda)),rdst,width*sizeof(float),NRAM2LDRAM,lda*sizeof(float),CPOTF_NB*sizeof(float),span-1); - __memcpy((iA+(finish*lda)),idst,width*sizeof(float),NRAM2LDRAM,lda*sizeof(float),CPOTF_NB*sizeof(float),span-1); - span = 0; - } - __sync_cluster(); - - -} + __memset_nram(r_src1, 2 * ldm * ldk, (float)ZERO); -__mlu_func__ -void small_cgemm_batch(int batch, int m,int k, - float* rA0, float* iA0, const int lda, - int width, float* r_dst,float* i_dst) -{ - int ldk = k; - int ldm = m; - - float* r_dst2 = i_dst + m * width; - float* i_dst2 = r_dst2 + m * width; - float* r_src1 = i_dst2 + m * width; - float* i_src1 = r_src1 + ldk * ldm; - float* r_src2 = i_src1 + ldk * ldm; - float* i_src2 = r_src2 + width * ldk; - - float* r_dA = rA0 + k; - float* i_dA = iA0 + k; - __memcpy_async(r_dst, r_dA, width*sizeof(float),GDRAM2NRAM,width*sizeof(float),lda*sizeof(float),m-1); - __memcpy_async(i_dst, i_dA, width*sizeof(float),GDRAM2NRAM,width*sizeof(float),lda*sizeof(float),m-1); - - if(k == 0) - { - __sync(); - - return; - } + __memcpy_async(r_src1, rA0, k * sizeof(float), GDRAM2NRAM, + ldk * sizeof(float), lda * sizeof(float), m - 1); + __memcpy_async(i_src1, iA0, k * sizeof(float), GDRAM2NRAM, + ldk * sizeof(float), lda * sizeof(float), m - 1); - __memset_nram(r_src1,2*ldm*ldk,(float)ZERO); + __memset_nram(r_dst2, 2 * ldm * width, (float)ZERO); - __memcpy_async(r_src1, rA0, k*sizeof(float),GDRAM2NRAM,ldk*sizeof(float),lda*sizeof(float),m-1); - __memcpy_async(i_src1, iA0, k*sizeof(float),GDRAM2NRAM,ldk*sizeof(float),lda*sizeof(float),m-1); + __sync(); - 
__memset_nram(r_dst2,2*ldm*width,(float)ZERO); + __memcpy(r_src2, r_src1, ldk * width * sizeof(float), NRAM2NRAM); - __sync(); + __memcpy(i_src2, i_src1, ldk * width * sizeof(float), NRAM2NRAM); - __memcpy(r_src2, r_src1, ldk*width*sizeof(float),NRAM2NRAM); - - __memcpy(i_src2, i_src1, ldk*width*sizeof(float),NRAM2NRAM); - - - float a1,a2,b1,b2; - for(int i = 0; i < m; i++) - { - for(int j = 0; j < width; j++) - { - for(int h = 0; h < k; h++) - { - a1 = r_src1[i*ldk+h]; - b1 = i_src1[i*ldk+h]; - a2 = r_src2[j*ldk+h]; - b2 = i_src2[j*ldk+h]; - r_dst2[i*width+j] += (a1*a2+b1*b2); - i_dst2[i*width+j] += (a2*b1-a1*b2); - } - } + float a1, a2, b1, b2; + for (int i = 0; i < m; i++) { + for (int j = 0; j < width; j++) { + for (int h = 0; h < k; h++) { + a1 = r_src1[i * ldk + h]; + b1 = i_src1[i * ldk + h]; + a2 = r_src2[j * ldk + h]; + b2 = i_src2[j * ldk + h]; + r_dst2[i * width + j] += (a1 * a2 + b1 * b2); + i_dst2[i * width + j] += (a2 * b1 - a1 * b2); + } } - + } - __bang_sub(r_dst,r_dst,r_dst2,width * m); - __bang_sub(i_dst,i_dst,i_dst2,width * m); + __bang_sub(r_dst, r_dst, r_dst2, width * m); + __bang_sub(i_dst, i_dst, i_dst2, width * m); - __sync(); + __sync(); } -__mlu_func__ -void small_cminout_batch(int m, int width, - float *r_dst, float* i_dst, int lda) -{ - float factor; - float* r_diag = r_dst; - float* i_diag = i_dst; - - float a1,a2,b1,b2; +__mlu_func__ void small_cminout_batch(int m, int width, float* r_dst, + float* i_dst, int lda) { + float factor; + float* r_diag = r_dst; + float* i_diag = i_dst; - - + float a1, a2, b1, b2; - for(int iter = 0; iter < width; iter++) - { - - - if (r_diag[iter*width+iter]<0) - { - printf("iter:%d,taskId:%d\n",iter,taskId); - } - factor = sqrt(r_diag[iter*width+iter]); - factor = 1.0/factor; - for(int i = 0; i < m; i++) - { - r_dst[i*width+iter] *= factor; - i_dst[i*width+iter] *= factor; - } - __sync(); - for(int i = iter+1; i < width; i++) - { - for(int j = 0; j < m; j++) - { - a1 = r_dst[(j*width+iter)]; - b1 = i_dst[(j*width+iter)]; - a2 = r_diag[(i*width+iter)]; - b2 = i_diag[(i*width+iter)]; - - r_dst[(j*width+i)] -= (a1*a2+b1*b2); - i_dst[(j*width+i)] -= (a2*b1-a1*b2); - } - } - __sync(); + for (int iter = 0; iter < width; iter++) { + if (r_diag[iter * width + iter] < 0) { + printf("iter:%d,taskId:%d\n", iter, taskId); + } + factor = sqrt(r_diag[iter * width + iter]); + factor = 1.0 / factor; + for (int i = 0; i < m; i++) { + r_dst[i * width + iter] *= factor; + i_dst[i * width + iter] *= factor; } __sync(); + for (int i = iter + 1; i < width; i++) { + for (int j = 0; j < m; j++) { + a1 = r_dst[(j * width + iter)]; + b1 = i_dst[(j * width + iter)]; + a2 = r_diag[(i * width + iter)]; + b2 = i_diag[(i * width + iter)]; - + r_dst[(j * width + i)] -= (a1 * a2 + b1 * b2); + i_dst[(j * width + i)] -= (a2 * b1 - a1 * b2); + } + } + __sync(); + } + __sync(); } -__mlu_func__ -void smlpout_batch(const int m, float *rA0, float* iA0, - float *rA, float* iA, int lda, const int localstep, int width) -{ - float* r_dst = (float*)nram_buffer; - float* i_dst = r_dst + m * width; +__mlu_func__ void smlpout_batch(const int m, float* rA0, float* iA0, float* rA, + float* iA, int lda, const int localstep, + int width) { + float* r_dst = (float*)nram_buffer; + float* i_dst = r_dst + m * width; - small_cgemm_batch(1,m,localstep,rA0,iA0,lda,width,r_dst,i_dst); + small_cgemm_batch(1, m, localstep, rA0, iA0, lda, width, r_dst, i_dst); - __sync(); + __sync(); - small_cminout_batch(m,width,r_dst,i_dst,lda); + small_cminout_batch(m, width, r_dst, i_dst, lda); - 
__sync(); + __sync(); - for(int i = 0;i < width; i++) - { - __memcpy((rA+(i*lda)),(r_dst+(i*width)),(i+1)*sizeof(float),NRAM2GDRAM); - __memcpy((iA+(i*lda)),(i_dst+(i*width)),(i+1)*sizeof(float),NRAM2GDRAM); - } + for (int i = 0; i < width; i++) { + __memcpy((rA + (i * lda)), (r_dst + (i * width)), (i + 1) * sizeof(float), + NRAM2GDRAM); + __memcpy((iA + (i * lda)), (i_dst + (i * width)), (i + 1) * sizeof(float), + NRAM2GDRAM); + } - if(m > width) - { - __memcpy(rA+(width*lda),r_dst+width*width,width*sizeof(float),NRAM2GDRAM,lda*sizeof(float),width*sizeof(float),m-width-1); - __memcpy(iA+(width*lda),i_dst+width*width,width*sizeof(float),NRAM2GDRAM,lda*sizeof(float),width*sizeof(float),m-width-1); - } + if (m > width) { + __memcpy(rA + (width * lda), r_dst + width * width, width * sizeof(float), + NRAM2GDRAM, lda * sizeof(float), width * sizeof(float), + m - width - 1); + __memcpy(iA + (width * lda), i_dst + width * width, width * sizeof(float), + NRAM2GDRAM, lda * sizeof(float), width * sizeof(float), + m - width - 1); + } - __sync(); + __sync(); } -__mlu_global__ void cpotf_kernel(int batch, int stride, int m, float *drA, float *diA, int lda) -{ - int width = CPOTF_NB; - int span = width; - float* origin_rA, *origin_iA; - origin_rA = drA; - origin_iA = diA; - int id = taskId; - int batch_id = id / 4; - if(batch_id >= batch) - return; - drA = origin_rA + batch_id * stride; - diA = origin_iA + batch_id * stride; - for(int i = 0; i < m; i += width) - { - span = std::min(width, m - i); - cmplout(batch, m-i, (drA+i*lda), (drA+i*lda+i), (diA+i*lda), (diA+i*lda+i), lda, i, span); - } +__mlu_global__ void cpotf_kernel(int batch, int stride, int m, float* drA, + float* diA, int lda) { + int width = CPOTF_NB; + int span = width; + float *origin_rA, *origin_iA; + origin_rA = drA; + origin_iA = diA; + int id = taskId; + int batch_id = id / 4; + if (batch_id >= batch) return; + drA = origin_rA + batch_id * stride; + diA = origin_iA + batch_id * stride; + for (int i = 0; i < m; i += width) { + span = std::min(width, m - i); + cmplout(batch, m - i, (drA + i * lda), (drA + i * lda + i), (diA + i * lda), + (diA + i * lda + i), lda, i, span); + } } -__mlu_global__ -void cpotf_batch_kernel(int batch, int stride, int m, float *r_dA, float* i_dA, int lda) -{ - int id = taskId; - int batch_id = id; - if(batch_id >= batch) - return; - float* r_orignA = r_dA; - float* i_orignA = i_dA; - r_dA = r_orignA + batch_id * stride; - i_dA = i_orignA + batch_id * stride; - int width = CPOTF_NB; - int span = width; - - for(int i = 0; i < m; i += width) - { - span = std::min(width, m - i); - smlpout_batch(m-i, r_dA+i*lda, i_dA+i*lda, r_dA+i*lda+i, i_dA+i*lda+i, lda, i, span); - - } - +__mlu_global__ void cpotf_batch_kernel(int batch, int stride, int m, + float* r_dA, float* i_dA, int lda) { + int id = taskId; + int batch_id = id; + if (batch_id >= batch) return; + float* r_orignA = r_dA; + float* i_orignA = i_dA; + r_dA = r_orignA + batch_id * stride; + i_dA = i_orignA + batch_id * stride; + int width = CPOTF_NB; + int span = width; + + for (int i = 0; i < m; i += width) { + span = std::min(width, m - i); + smlpout_batch(m - i, r_dA + i * lda, i_dA + i * lda, r_dA + i * lda + i, + i_dA + i * lda + i, lda, i, span); + } } -mluOpStatus_t mlu_cpotf_lpin(int batch, int stride, int n, int lda, float* drA, float* diA, cnrtQueue_t queue) -{ - cnrtDim3_t dim; - cnrtFunctionType_t func_type = CNRT_FUNC_TYPE_UNION1; - dim.y = 1; - dim.z = 1; - if (batch < 8) - { - dim.x = 4*batch; - KERNEL_CHECK(cpotf_kernel<<>>(batch, stride, n, 
drA,diA, lda)); - } - else - { - func_type = CNRT_FUNC_TYPE_BLOCK; - dim.x = batch; - KERNEL_CHECK(cpotf_batch_kernel<<>>(batch, stride, n, drA,diA, lda)); - } - - return MLUOP_STATUS_SUCCESS; +mluOpStatus_t mlu_cpotf_lpin(int batch, int stride, int n, int lda, float* drA, + float* diA, cnrtQueue_t queue) { + cnrtDim3_t dim; + cnrtFunctionType_t func_type = CNRT_FUNC_TYPE_UNION1; + dim.y = 1; + dim.z = 1; + if (batch < 8) { + dim.x = 4 * batch; + KERNEL_CHECK(cpotf_kernel<<>>(batch, stride, n, drA, + diA, lda)); + } else { + func_type = CNRT_FUNC_TYPE_BLOCK; + dim.x = batch; + KERNEL_CHECK(cpotf_batch_kernel<<>>(batch, stride, n, + drA, diA, lda)); + } + + return MLUOP_STATUS_SUCCESS; } -__mlu_global__ -void add_c1(int batch, int stride, float beta, float *d_c, float* src,int ldc, int ldsrc, int m, int n) -{ - - int id = taskId; - int ipu_per_cluster = 4; - int batch_id = id / ipu_per_cluster; - if(batch_id >= batch) - return; - id = taskId % ipu_per_cluster; - float* orignC = d_c; - float* orignSrc = src; - d_c = orignC + batch_id * stride; - src = orignSrc + batch_id * m*n; - - - if (beta == 0.0f) - { - if(id == 0) - { - __memcpy(sram_buffer,src,n*sizeof(float),GDRAM2SRAM,n*sizeof(float),ldsrc*sizeof(float),m-1); - - } - __sync_cluster(); - if(id == 0) - { - __memcpy(d_c,sram_buffer,n*sizeof(float),SRAM2LDRAM,ldc*sizeof(float),n*sizeof(float),m-1); - } - __sync_cluster(); - return; +__mlu_global__ void add_c1(int batch, int stride, float beta, float* d_c, + float* src, int ldc, int ldsrc, int m, int n) { + int id = taskId; + int ipu_per_cluster = 4; + int batch_id = id / ipu_per_cluster; + if (batch_id >= batch) return; + id = taskId % ipu_per_cluster; + float* orignC = d_c; + float* orignSrc = src; + d_c = orignC + batch_id * stride; + src = orignSrc + batch_id * m * n; + + if (beta == 0.0f) { + if (id == 0) { + __memcpy(sram_buffer, src, n * sizeof(float), GDRAM2SRAM, + n * sizeof(float), ldsrc * sizeof(float), m - 1); } - - float* a_sram = (float*)sram_buffer + 3* m * n; - + __sync_cluster(); if (id == 0) { - __memcpy(sram_buffer,d_c,n*sizeof(float),GDRAM2SRAM,n*sizeof(float),ldc*sizeof(float),m-1); - __memcpy(a_sram,src,n*m*sizeof(float),GDRAM2SRAM); + __memcpy(d_c, sram_buffer, n * sizeof(float), SRAM2LDRAM, + ldc * sizeof(float), n * sizeof(float), m - 1); } + __sync_cluster(); + return; + } - __sync_cluster(); + float* a_sram = (float*)sram_buffer + 3 * m * n; + + if (id == 0) { + __memcpy(sram_buffer, d_c, n * sizeof(float), GDRAM2SRAM, n * sizeof(float), + ldc * sizeof(float), m - 1); + __memcpy(a_sram, src, n * m * sizeof(float), GDRAM2SRAM); + } + __sync_cluster(); - int32_t data_num = m*n; + int32_t data_num = m * n; int32_t data_per_core = data_num / ipu_per_cluster; int32_t data_last_core = data_per_core + data_num % ipu_per_cluster; - const float *a_offset = a_sram + id * data_per_core; - const float *b_offset = (float*)sram_buffer + id * data_per_core; - float *output_offset = (float*)sram_buffer + id * data_per_core; + const float* a_offset = a_sram + id * data_per_core; + const float* b_offset = (float*)sram_buffer + id * data_per_core; + float* output_offset = (float*)sram_buffer + id * data_per_core; if (id == ipu_per_cluster - 1) { data_per_core = data_last_core; @@ -481,9 +427,9 @@ void add_c1(int batch, int stride, float beta, float *d_c, float* src,int ldc, i int32_t align_num = NFU_ALIGN_SIZE / sizeof(float); int32_t data_nram_num = - MAX_NRAM_SIZE / sizeof(float) / 2 / align_num * align_num; - float *a_nram = (float *)nram_buffer; - float *b_nram = (float 
*)a_nram + data_nram_num; + MAX_NRAM_SIZE / sizeof(float) / 2 / align_num * align_num; + float* a_nram = (float*)nram_buffer; + float* b_nram = (float*)a_nram + data_nram_num; int32_t loop_num = data_per_core / data_nram_num; int32_t rem_nram_num = data_per_core % data_nram_num; @@ -498,7 +444,7 @@ void add_c1(int batch, int stride, float beta, float *d_c, float* src,int ldc, i } if (rem_nram_num != 0) { int32_t rem_align_num = - (rem_nram_num + align_num - 1) / align_num * align_num; + (rem_nram_num + align_num - 1) / align_num * align_num; __memcpy(a_nram, a_offset + loop_num * data_nram_num, rem_nram_num * sizeof(float), SRAM2NRAM); __memcpy(b_nram, b_offset + loop_num * data_nram_num, @@ -510,77 +456,66 @@ void add_c1(int batch, int stride, float beta, float *d_c, float* src,int ldc, i __sync_cluster(); if (id == 0) { - __memcpy(d_c,sram_buffer,n*sizeof(float),SRAM2GDRAM,ldc*sizeof(float),n*sizeof(float),m-1); - + __memcpy(d_c, sram_buffer, n * sizeof(float), SRAM2GDRAM, + ldc * sizeof(float), n * sizeof(float), m - 1); } __sync_cluster(); - } +__mlu_global__ void complex_add_c(int batch, int stride, float beta, float* d_c, + float* src, int ldc, int ldsrc, int m, + int n) { + int id = taskId; + int ipu_per_cluster = 4; + id = taskId; + + int span = m / 4; + int finish = id * span; + if (id == 3) { + span = m - 3 * span; + } -__mlu_global__ -void complex_add_c(int batch, int stride, float beta, float *d_c, float* src,int ldc, int ldsrc, int m, int n) -{ - - int id = taskId; - int ipu_per_cluster = 4; - id = taskId; - - int span = m/4; - int finish = id * span; - if(id == 3) - { - span = m - 3 * span; + float* sram_buffer = (float*)nram_buffer; + if (beta == 0.0f) { + if (id == 0) { + __memcpy(sram_buffer, src, n * sizeof(float), GDRAM2NRAM, + n * sizeof(float), ldsrc * sizeof(float), m - 1); } - - - - float* sram_buffer = (float*)nram_buffer; - if (beta == 0.0f) - { - if(id == 0) - { - __memcpy(sram_buffer,src,n*sizeof(float),GDRAM2NRAM,n*sizeof(float),ldsrc*sizeof(float),m-1); - - } - __sync_cluster(); - if(id == 0) - { - __memcpy(d_c,sram_buffer,n*sizeof(float),NRAM2LDRAM,ldc*sizeof(float),n*sizeof(float),m-1); - } - __sync_cluster(); - return; + __sync_cluster(); + if (id == 0) { + __memcpy(d_c, sram_buffer, n * sizeof(float), NRAM2LDRAM, + ldc * sizeof(float), n * sizeof(float), m - 1); } + __sync_cluster(); + return; + } - float* a_sram = (float*)sram_buffer + 3* m * n; - - int d_c_offset = ldc*finish; - int src_offset = ldsrc*finish; - - __memcpy(sram_buffer,d_c+d_c_offset,n*sizeof(float),LDRAM2NRAM,n*sizeof(float),ldc*sizeof(float),span-1); - __memcpy(a_sram,src+src_offset,n*span*sizeof(float),LDRAM2NRAM); + float* a_sram = (float*)sram_buffer + 3 * m * n; + int d_c_offset = ldc * finish; + int src_offset = ldsrc * finish; + __memcpy(sram_buffer, d_c + d_c_offset, n * sizeof(float), LDRAM2NRAM, + n * sizeof(float), ldc * sizeof(float), span - 1); + __memcpy(a_sram, src + src_offset, n * span * sizeof(float), LDRAM2NRAM); - int32_t data_per_core = span*n; + int32_t data_per_core = span * n; int32_t data_last_core = data_per_core; - const float *a_offset = a_sram; - const float *b_offset = (float*)sram_buffer; - float *output_offset = (float*)sram_buffer; + const float* a_offset = a_sram; + const float* b_offset = (float*)sram_buffer; + float* output_offset = (float*)sram_buffer; if (id == ipu_per_cluster - 1) { data_per_core = data_last_core; } - - int32_t align_num = NFU_ALIGN_SIZE / sizeof(float); int32_t data_nram_num = - MAX_NRAM_SIZE / sizeof(float) / 2 / align_num * 
align_num; - float *a_nram = (float *)a_sram + m*n; - float *b_nram = (float *)a_nram + data_nram_num; + MAX_NRAM_SIZE / sizeof(float) / 2 / align_num * align_num; + float* a_nram = (float*)a_sram + m * n; + float* b_nram = (float*)a_nram + data_nram_num; int32_t loop_num = data_per_core / data_nram_num; int32_t rem_nram_num = data_per_core % data_nram_num; @@ -595,7 +530,7 @@ void complex_add_c(int batch, int stride, float beta, float *d_c, float* src,int } if (rem_nram_num != 0) { int32_t rem_align_num = - (rem_nram_num + align_num - 1) / align_num * align_num; + (rem_nram_num + align_num - 1) / align_num * align_num; __memcpy(a_nram, a_offset + loop_num * data_nram_num, rem_nram_num * sizeof(float), NRAM2NRAM); __memcpy(b_nram, b_offset + loop_num * data_nram_num, @@ -603,467 +538,457 @@ void complex_add_c(int batch, int stride, float beta, float *d_c, float* src,int __bang_add(a_nram, a_nram, b_nram, rem_align_num); __memcpy(output_offset + loop_num * data_nram_num, a_nram, rem_nram_num * sizeof(float), NRAM2NRAM); - } -__memcpy(d_c+d_c_offset,sram_buffer,n*sizeof(float),NRAM2LDRAM,ldc*sizeof(float),n*sizeof(float),span-1); - - - + __memcpy(d_c + d_c_offset, sram_buffer, n * sizeof(float), NRAM2LDRAM, + ldc * sizeof(float), n * sizeof(float), span - 1); } +mluOpStatus_t workspace_malloc(size_t size, float** workspace) { + CNRT_CHECK(cnrtMalloc((void**)workspace, size)); -mluOpStatus_t workspace_malloc(size_t size, float** workspace) -{ - CNRT_CHECK(cnrtMalloc((void **)workspace, size)); - - return MLUOP_STATUS_SUCCESS; + return MLUOP_STATUS_SUCCESS; } -mluOpStatus_t workspace_free(float** workspace) -{ - CNRT_CHECK(cnrtFree((void *)(*workspace))); +mluOpStatus_t workspace_free(float** workspace) { + CNRT_CHECK(cnrtFree((void*)(*workspace))); - return MLUOP_STATUS_SUCCESS; + return MLUOP_STATUS_SUCCESS; } +__mlu_global__ void complex_inverse_kernel(int batch, float* rd_input, + float* id_input, int ld_input, + int stride_input, float* rd_output, + float* id_output, int ld_output, + int stride_output, int m) { + int id = taskId; + id = taskId % 4; + int batch_id = taskId / 4; + if (batch_id >= batch) return; + float* origin_r_input = rd_input; + float* origin_i_input = id_input; + float* origin_r_output = rd_output; + float* origin_i_output = id_output; + rd_input = origin_r_input + batch_id * stride_input; + id_input = origin_i_input + batch_id * stride_input; + rd_output = origin_r_output + batch_id * stride_output; + id_output = origin_i_output + batch_id * stride_output; + + int span = m / 4; + int start = id * span; + if (id == 3) { + span = m - 3 * span; + } + float* nram_offset = (float*)nram_buffer; + float* rdiag_start = (float*)nram_offset; + float* idiag_start = rdiag_start + m * m; + float* r_nram_src1 = idiag_start + m * m; + float* i_nram_src1 = r_nram_src1 + m * m; + float* r_nram_src2 = i_nram_src1 + m * m; + float* i_nram_src2 = r_nram_src2 + m; + float* r_mul_result = i_nram_src2 + m; + float* i_mul_result = r_mul_result + m; + float* r_nram_dst = i_mul_result + m; + float* i_nram_dst = r_nram_dst + m * m; + + int height = m - start; + + __memset_nram(nram_offset, 4 * m * m * 2 + 2, (float)ZERO); + + if (span > 0) { + __memcpy(rdiag_start, rd_input + ld_input * start + start, + height * sizeof(float), LDRAM2NRAM, m * sizeof(float), + ld_input * sizeof(float), height - 1); + __memcpy(idiag_start, id_input + ld_input * start + start, + height * sizeof(float), LDRAM2NRAM, m * sizeof(float), + ld_input * sizeof(float), height - 1); + } -__mlu_global__ -void 
complex_inverse_kernel(int batch, float *rd_input, float *id_input, int ld_input, int stride_input, float* rd_output, float* id_output, int ld_output, int stride_output, int m) -{ - int id = taskId; - id = taskId % 4; - int batch_id = taskId/4; - if(batch_id >= batch) - return; - float* origin_r_input = rd_input; - float* origin_i_input = id_input; - float* origin_r_output = rd_output; - float* origin_i_output = id_output; - rd_input = origin_r_input + batch_id * stride_input; - id_input = origin_i_input + batch_id * stride_input; - rd_output = origin_r_output + batch_id * stride_output; - id_output = origin_i_output + batch_id * stride_output; - - - int span = m/4; - int start = id * span; - if (id == 3) - { - span = m - 3 * span; - } - float* nram_offset = (float*)nram_buffer; - float* rdiag_start = (float*)nram_offset; - float* idiag_start = rdiag_start + m * m; - float* r_nram_src1 = idiag_start + m * m; - float* i_nram_src1 = r_nram_src1 + m * m; - float* r_nram_src2 = i_nram_src1 + m * m; - float* i_nram_src2 = r_nram_src2 + m; - float* r_mul_result = i_nram_src2 + m; - float* i_mul_result = r_mul_result + m; - float* r_nram_dst = i_mul_result + m; - float* i_nram_dst = r_nram_dst + m * m; - - int height = m - start; - - __memset_nram(nram_offset, 4 * m * m * 2+2, (float)ZERO); - - if(span > 0) - { - __memcpy(rdiag_start,rd_input + ld_input * start + start,height*sizeof(float),LDRAM2NRAM,m*sizeof(float),ld_input*sizeof(float),height-1); - __memcpy(idiag_start,id_input + ld_input * start + start,height*sizeof(float),LDRAM2NRAM,m*sizeof(float),ld_input*sizeof(float),height-1); - } - - float result = 0.0; - for(int i = 0; i < height; i++) - { - int off = i * m + i; - result = rdiag_start[off]; - result = 1.0 / result; - r_nram_src1[i*height+i] = result; - r_nram_dst[i*span + i] = result; - rdiag_start[off] = result; - - } - + float result = 0.0; + for (int i = 0; i < height; i++) { + int off = i * m + i; + result = rdiag_start[off]; + result = 1.0 / result; + r_nram_src1[i * height + i] = result; + r_nram_dst[i * span + i] = result; + rdiag_start[off] = result; + } - for(int i = 1; i < height; i++) - { - __memcpy(r_nram_src2,rdiag_start+i*m,i*sizeof(float),NRAM2NRAM); - __memcpy(i_nram_src2,idiag_start+i*m,i*sizeof(float),NRAM2NRAM); - int num = std::min(i, span); - float diag_element = rdiag_start[i*m+i]; - for(int j = 0; j < num; j++) - { - float r_temp = 0.0; - float i_temp = 0.0; - - __bang_mul(r_mul_result,r_nram_src2,r_nram_src1+j*height,i); - __bang_mul(i_mul_result,r_nram_src2,i_nram_src1+j*height,i); - for(int k = 0; k< i; k++) - { - r_temp += r_mul_result[k]; - i_temp += i_mul_result[k]; - - } - __bang_mul(r_mul_result,i_nram_src2,i_nram_src1+j*height,i); - __bang_mul(i_mul_result,i_nram_src2,r_nram_src1+j*height,i); - for(int k = 0; k< i; k++) - { - r_temp += r_mul_result[k]; - i_temp -= i_mul_result[k]; - } - r_temp = r_temp * -1.0 * diag_element; - i_temp = i_temp * -1.0 * diag_element; - r_nram_dst[i*span+j] = r_temp; - i_nram_dst[i*span+j] = i_temp; - r_nram_src1[j*height+i] = r_temp; - i_nram_src1[j*height+i] = i_temp; - } - __sync(); - + for (int i = 1; i < height; i++) { + __memcpy(r_nram_src2, rdiag_start + i * m, i * sizeof(float), NRAM2NRAM); + __memcpy(i_nram_src2, idiag_start + i * m, i * sizeof(float), NRAM2NRAM); + int num = std::min(i, span); + float diag_element = rdiag_start[i * m + i]; + for (int j = 0; j < num; j++) { + float r_temp = 0.0; + float i_temp = 0.0; + + __bang_mul(r_mul_result, r_nram_src2, r_nram_src1 + j * height, i); + 
__bang_mul(i_mul_result, r_nram_src2, i_nram_src1 + j * height, i); + for (int k = 0; k < i; k++) { + r_temp += r_mul_result[k]; + i_temp += i_mul_result[k]; + } + __bang_mul(r_mul_result, i_nram_src2, i_nram_src1 + j * height, i); + __bang_mul(i_mul_result, i_nram_src2, r_nram_src1 + j * height, i); + for (int k = 0; k < i; k++) { + r_temp += r_mul_result[k]; + i_temp -= i_mul_result[k]; + } + r_temp = r_temp * -1.0 * diag_element; + i_temp = i_temp * -1.0 * diag_element; + r_nram_dst[i * span + j] = r_temp; + i_nram_dst[i * span + j] = i_temp; + r_nram_src1[j * height + i] = r_temp; + i_nram_src1[j * height + i] = i_temp; } - - __sync(); - - - __sync(); + } + __sync(); - if(span > 0) - { - __memcpy(rd_output + ld_output * start + start,r_nram_dst,span*sizeof(float),NRAM2LDRAM,ld_output*sizeof(float),span*sizeof(float),height-1); - __memcpy(id_output + ld_output * start + start,i_nram_dst,span*sizeof(float),NRAM2LDRAM,ld_output*sizeof(float),span*sizeof(float),height-1); - } - - - + __sync(); + if (span > 0) { + __memcpy(rd_output + ld_output * start + start, r_nram_dst, + span * sizeof(float), NRAM2LDRAM, ld_output * sizeof(float), + span * sizeof(float), height - 1); + __memcpy(id_output + ld_output * start + start, i_nram_dst, + span * sizeof(float), NRAM2LDRAM, ld_output * sizeof(float), + span * sizeof(float), height - 1); + } } -__mlu_global__ -void complex_batch_inverse_kernel(int batch, float *rd_input, float* id_input, int ld_input, int stride_input, float* rd_output, float* id_output, int ld_output, int stride_output, int m) -{ - int id = taskId; - int batch_id = id; - if(batch_id >= batch) - return; - - float* r_orign_input = rd_input; - float* i_orign_input = id_input; - float* r_orign_output = rd_output; - float* i_orign_output = id_output; - rd_input = r_orign_input + batch_id * stride_input; - id_input = i_orign_input + batch_id * stride_input; - rd_output = r_orign_output + batch_id * stride_output; - id_output = i_orign_output + batch_id * stride_output; - - - float* nram_offset = (float*)nram_buffer; - float* r_nram_src0 = nram_offset; - float* i_nram_src0 = r_nram_src0 + m * m; - float* r_nram_src1 = i_nram_src0 + m * m; - float* i_nram_src1 = r_nram_src1 + m * m; - float* r_nram_src2 = i_nram_src1 + m * m; - float* i_nram_src2 = r_nram_src2 + m ; - float* r_mul_result = i_nram_src2 + m; - float* i_mul_result = r_mul_result + m; - float* r_nram_dst = i_mul_result + m; - float* i_nram_dst = r_nram_dst + m * m; - float* r_diag_start = r_nram_dst; - float* i_diag_start = i_nram_dst; - int height = m, span = m; - - __memset_nram(nram_offset, 10 * m * m, (float)ZERO); - - - __memcpy(r_nram_dst,rd_input,m*sizeof(float),GDRAM2NRAM,m*sizeof(float),ld_input*sizeof(float),m-1); - __memcpy(i_nram_dst,id_input,m*sizeof(float),GDRAM2NRAM,m*sizeof(float),ld_input*sizeof(float),m-1); - float result = 0.0; - for(int i = 0; i < m; i++) - { - int off = i * m + i; - result = r_nram_dst[off]; - result = 1.0 / result; - r_nram_src1[i*height+i] = result; - r_nram_dst[i*span + i] = result; - r_diag_start[off] = result; - } +__mlu_global__ void complex_batch_inverse_kernel( + int batch, float* rd_input, float* id_input, int ld_input, int stride_input, + float* rd_output, float* id_output, int ld_output, int stride_output, + int m) { + int id = taskId; + int batch_id = id; + if (batch_id >= batch) return; + + float* r_orign_input = rd_input; + float* i_orign_input = id_input; + float* r_orign_output = rd_output; + float* i_orign_output = id_output; + rd_input = r_orign_input + batch_id * 
stride_input; + id_input = i_orign_input + batch_id * stride_input; + rd_output = r_orign_output + batch_id * stride_output; + id_output = i_orign_output + batch_id * stride_output; + + float* nram_offset = (float*)nram_buffer; + float* r_nram_src0 = nram_offset; + float* i_nram_src0 = r_nram_src0 + m * m; + float* r_nram_src1 = i_nram_src0 + m * m; + float* i_nram_src1 = r_nram_src1 + m * m; + float* r_nram_src2 = i_nram_src1 + m * m; + float* i_nram_src2 = r_nram_src2 + m; + float* r_mul_result = i_nram_src2 + m; + float* i_mul_result = r_mul_result + m; + float* r_nram_dst = i_mul_result + m; + float* i_nram_dst = r_nram_dst + m * m; + float* r_diag_start = r_nram_dst; + float* i_diag_start = i_nram_dst; + int height = m, span = m; + + __memset_nram(nram_offset, 10 * m * m, (float)ZERO); + + __memcpy(r_nram_dst, rd_input, m * sizeof(float), GDRAM2NRAM, + m * sizeof(float), ld_input * sizeof(float), m - 1); + __memcpy(i_nram_dst, id_input, m * sizeof(float), GDRAM2NRAM, + m * sizeof(float), ld_input * sizeof(float), m - 1); + float result = 0.0; + for (int i = 0; i < m; i++) { + int off = i * m + i; + result = r_nram_dst[off]; + result = 1.0 / result; + r_nram_src1[i * height + i] = result; + r_nram_dst[i * span + i] = result; + r_diag_start[off] = result; + } - for(int i = 1; i < height; i++) - { - __memcpy(r_nram_src2,r_diag_start+i*m,i*sizeof(float),NRAM2NRAM); - __memcpy(i_nram_src2,i_diag_start+i*m,i*sizeof(float),NRAM2NRAM); - int num = std::min(i, span); - float diag_element = r_diag_start[i*m+i]; - for(int j = 0; j < num; j++) - { - float r_temp = 0.0; - float i_temp = 0.0; - __bang_mul(r_mul_result,r_nram_src2,r_nram_src1+j*height,i); - __bang_mul(i_mul_result,r_nram_src2,i_nram_src1+j*height,i); - for(int k = 0; k< i; k++) - { - r_temp += r_mul_result[k]; - i_temp += i_mul_result[k]; - } - __bang_mul(r_mul_result,i_nram_src2,i_nram_src1+j*height,i); - __bang_mul(i_mul_result,i_nram_src2,r_nram_src1+j*height,i); - for(int k = 0; k< i; k++) - { - r_temp += r_mul_result[k]; - i_temp -= i_mul_result[k]; - } - r_temp = r_temp * -1.0 * diag_element; - i_temp = i_temp * -1.0 * diag_element; - r_nram_dst[i*span+j] = r_temp; - i_nram_dst[i*span+j] = i_temp; - r_nram_src1[j*height+i] = r_temp; - i_nram_src1[j*height+i] = i_temp; - } - __sync(); - + for (int i = 1; i < height; i++) { + __memcpy(r_nram_src2, r_diag_start + i * m, i * sizeof(float), NRAM2NRAM); + __memcpy(i_nram_src2, i_diag_start + i * m, i * sizeof(float), NRAM2NRAM); + int num = std::min(i, span); + float diag_element = r_diag_start[i * m + i]; + for (int j = 0; j < num; j++) { + float r_temp = 0.0; + float i_temp = 0.0; + __bang_mul(r_mul_result, r_nram_src2, r_nram_src1 + j * height, i); + __bang_mul(i_mul_result, r_nram_src2, i_nram_src1 + j * height, i); + for (int k = 0; k < i; k++) { + r_temp += r_mul_result[k]; + i_temp += i_mul_result[k]; + } + __bang_mul(r_mul_result, i_nram_src2, i_nram_src1 + j * height, i); + __bang_mul(i_mul_result, i_nram_src2, r_nram_src1 + j * height, i); + for (int k = 0; k < i; k++) { + r_temp += r_mul_result[k]; + i_temp -= i_mul_result[k]; + } + r_temp = r_temp * -1.0 * diag_element; + i_temp = i_temp * -1.0 * diag_element; + r_nram_dst[i * span + j] = r_temp; + i_nram_dst[i * span + j] = i_temp; + r_nram_src1[j * height + i] = r_temp; + i_nram_src1[j * height + i] = i_temp; } __sync(); + } + __sync(); - __memcpy(rd_output,r_nram_dst,m*sizeof(float),NRAM2GDRAM,ld_output*sizeof(float), m*sizeof(float),m-1); - 
__memcpy(id_output,i_nram_dst,m*sizeof(float),NRAM2GDRAM,ld_output*sizeof(float), m*sizeof(float),m-1); - + __memcpy(rd_output, r_nram_dst, m * sizeof(float), NRAM2GDRAM, + ld_output * sizeof(float), m * sizeof(float), m - 1); + __memcpy(id_output, i_nram_dst, m * sizeof(float), NRAM2GDRAM, + ld_output * sizeof(float), m * sizeof(float), m - 1); } +mluOpStatus_t cgemm(int batch, bool trans_a, bool trans_b, int m, int n, int k, + float alpha, float beta, float* d_ra, float* d_ia, int lda, + int stride_a, float* d_rb, float* d_ib, int ldb, + int stride_b, float* d_rc, float* d_ic, int ldc, + int stride_c, mluOpHandle_t handle, float* workspace) { + if (k == 0) return MLUOP_STATUS_SUCCESS; + cnrtQueue_t queue; + mluOpGetQueue(handle, &queue); -mluOpStatus_t cgemm(int batch, bool trans_a, bool trans_b, int m, int n, int k, float alpha, float beta, float* d_ra, float* d_ia, int lda, int stride_a, float* d_rb, float* d_ib, int ldb, int stride_b, float* d_rc, float* d_ic, int ldc, int stride_c, mluOpHandle_t handle,float* workspace) -{ - if(k==0) - return MLUOP_STATUS_SUCCESS; - - cnrtQueue_t queue; - mluOpGetQueue(handle,&queue); + float *r_c, *i_c; + r_c = d_rc; + i_c = d_ic; - - - float *r_c, *i_c; - r_c = d_rc; - i_c = d_ic; - - int s_stride_a = stride_a; - int s_stride_b = stride_b; - int s_stride_c = stride_c; + int s_stride_a = stride_a; + int s_stride_b = stride_b; + int s_stride_c = stride_c; - sgemm(batch,trans_a,trans_b,m,n,k,alpha,beta,d_ra,lda,s_stride_a,d_rb,ldb,s_stride_b,r_c,ldc,s_stride_c,handle,workspace); - cnrtQueueSync(queue); - - sgemm(batch,trans_a,trans_b,m,n,k,alpha,1,d_ia,lda,s_stride_a,d_ib,ldb,s_stride_b,r_c,ldc,s_stride_c,handle,workspace); - cnrtQueueSync(queue); - - sgemm(batch,trans_a,trans_b,m,n,k,-alpha,beta,d_ra,lda,s_stride_a,d_ib,ldb,s_stride_b,i_c,ldc,s_stride_c,handle,workspace); - cnrtQueueSync(queue); - sgemm(batch,trans_a,trans_b,m,n,k,alpha,1,d_ia,lda,s_stride_a,d_rb,ldb,s_stride_b,i_c,ldc,s_stride_c,handle,workspace); - cnrtQueueSync(queue); + sgemm(batch, trans_a, trans_b, m, n, k, alpha, beta, d_ra, lda, s_stride_a, + d_rb, ldb, s_stride_b, r_c, ldc, s_stride_c, handle, workspace); + cnrtQueueSync(queue); - return MLUOP_STATUS_SUCCESS; -} + sgemm(batch, trans_a, trans_b, m, n, k, alpha, 1, d_ia, lda, s_stride_a, d_ib, + ldb, s_stride_b, r_c, ldc, s_stride_c, handle, workspace); + cnrtQueueSync(queue); + sgemm(batch, trans_a, trans_b, m, n, k, -alpha, beta, d_ra, lda, s_stride_a, + d_ib, ldb, s_stride_b, i_c, ldc, s_stride_c, handle, workspace); + cnrtQueueSync(queue); + sgemm(batch, trans_a, trans_b, m, n, k, alpha, 1, d_ia, lda, s_stride_a, d_rb, + ldb, s_stride_b, i_c, ldc, s_stride_c, handle, workspace); + cnrtQueueSync(queue); -mluOpStatus_t cgemm_real(int batch, bool trans_a, bool trans_b, int m, int n, int k, float alpha, float beta, float* d_ra, float* d_ia, int lda, int stride_a, float* d_rb, float* d_ib, int ldb, int stride_b, float* d_rc, float* d_ic, int ldc, int stride_c, mluOpHandle_t handle, float* cgemm_workspace) -{ - if(k==0) - return MLUOP_STATUS_SUCCESS; - - cnrtQueue_t queue; - mluOpGetQueue(handle,&queue); - - float *workspace = cgemm_workspace; - float* sgemm_workspace = cgemm_workspace + ((unsigned long)batch)*2*(m*k); - float* copy_ra = workspace; - float* copy_ia = copy_ra + ((unsigned long)batch)*m*k; - int copy_lda = k; - int copy_stride_a = m*k; - - for(int i = 0; i < batch; i++) - { - CNRT_CHECK(cnrtMemcpy2D(copy_ra+i*m*k, k*sizeof(float), d_ra+i*stride_a, lda*sizeof(float), - k*sizeof(float), m, 
CNRT_MEM_TRANS_DIR_DEV2DEV)); - CNRT_CHECK(cnrtMemcpy2D(copy_ia+i*m*k, k*sizeof(float), d_ia+i*stride_a, lda*sizeof(float), - k*sizeof(float), m, CNRT_MEM_TRANS_DIR_DEV2DEV)); - } - + return MLUOP_STATUS_SUCCESS; +} - float *r_c, *i_c; - r_c = d_rc; - i_c = d_ic; +mluOpStatus_t cgemm_real(int batch, bool trans_a, bool trans_b, int m, int n, + int k, float alpha, float beta, float* d_ra, + float* d_ia, int lda, int stride_a, float* d_rb, + float* d_ib, int ldb, int stride_b, float* d_rc, + float* d_ic, int ldc, int stride_c, + mluOpHandle_t handle, float* cgemm_workspace) { + if (k == 0) return MLUOP_STATUS_SUCCESS; + + cnrtQueue_t queue; + mluOpGetQueue(handle, &queue); + + float* workspace = cgemm_workspace; + float* sgemm_workspace = + cgemm_workspace + ((unsigned long)batch) * 2 * (m * k); + float* copy_ra = workspace; + float* copy_ia = copy_ra + ((unsigned long)batch) * m * k; + int copy_lda = k; + int copy_stride_a = m * k; + + for (int i = 0; i < batch; i++) { + CNRT_CHECK(cnrtMemcpy2D(copy_ra + i * m * k, k * sizeof(float), + d_ra + i * stride_a, lda * sizeof(float), + k * sizeof(float), m, CNRT_MEM_TRANS_DIR_DEV2DEV)); + CNRT_CHECK(cnrtMemcpy2D(copy_ia + i * m * k, k * sizeof(float), + d_ia + i * stride_a, lda * sizeof(float), + k * sizeof(float), m, CNRT_MEM_TRANS_DIR_DEV2DEV)); + } - int s_stride_b = stride_b; - int s_stride_c = stride_c; + float *r_c, *i_c; + r_c = d_rc; + i_c = d_ic; + + int s_stride_b = stride_b; + int s_stride_c = stride_c; + + sgemm(batch, trans_a, trans_b, m, n, k, alpha, beta, copy_ra, copy_lda, + copy_stride_a, d_rb, ldb, s_stride_b, r_c, ldc, s_stride_c, handle, + sgemm_workspace); + cnrtQueueSync(queue); + + sgemm(batch, trans_a, trans_b, m, n, k, -alpha, 1, copy_ia, copy_lda, + copy_stride_a, d_ib, ldb, s_stride_b, r_c, ldc, s_stride_c, handle, + sgemm_workspace); + cnrtQueueSync(queue); + + sgemm(batch, trans_a, trans_b, m, n, k, alpha, beta, copy_ra, copy_lda, + copy_stride_a, d_ib, ldb, s_stride_b, i_c, ldc, s_stride_c, handle, + sgemm_workspace); + cnrtQueueSync(queue); + sgemm(batch, trans_a, trans_b, m, n, k, alpha, 1, copy_ia, copy_lda, + copy_stride_a, d_rb, ldb, s_stride_b, i_c, ldc, s_stride_c, handle, + sgemm_workspace); + cnrtQueueSync(queue); + + return MLUOP_STATUS_SUCCESS; +} - - - sgemm(batch,trans_a,trans_b,m,n,k,alpha,beta,copy_ra,copy_lda,copy_stride_a,d_rb,ldb,s_stride_b,r_c,ldc,s_stride_c,handle,sgemm_workspace); - cnrtQueueSync(queue); - - sgemm(batch,trans_a,trans_b,m,n,k,-alpha,1,copy_ia,copy_lda,copy_stride_a,d_ib,ldb,s_stride_b,r_c,ldc,s_stride_c,handle,sgemm_workspace); +mluOpStatus_t complex_inverse(int batch, float* rd_input, float* id_input, + int ld_input, int stride_input, float* rd_output, + float* id_output, int ld_output, + int stride_output, int m, mluOpHandle_t handle, + float* workspace) { + int inverse_rec = 16; + cnrtQueue_t queue; + mluOpGetQueue(handle, &queue); + if (m <= inverse_rec) { + cnrtDim3_t dim; + cnrtFunctionType_t func_type = CNRT_FUNC_TYPE_BLOCK; + dim.y = 1; + dim.z = 1; + if (batch < 8) { + dim.x = 4 * batch; + KERNEL_CHECK(complex_inverse_kernel<<>>( + batch, rd_input, id_input, ld_input, stride_input, rd_output, + id_output, ld_output, stride_output, m)); + } else { + dim.x = batch; + KERNEL_CHECK(complex_batch_inverse_kernel<<>>( + batch, rd_input, id_input, ld_input, stride_input, rd_output, + id_output, ld_output, stride_output, m)); + } + + } else { + int m1 = m / 2; + int m2 = m - m1; + + float* output1_r = rd_output; + float* output2_r = rd_output + m1 * m + m1; + float* output1_i = 
id_output; + float* output2_i = id_output + m1 * m + m1; + + complex_inverse(batch, rd_input, id_input, ld_input, stride_input, + rd_output, id_output, ld_output, stride_output, m1, handle, + workspace); + complex_inverse(batch, rd_input + m1 * ld_input + m1, + id_input + m1 * ld_input + m1, ld_input, stride_input, + output2_r, output2_i, ld_output, stride_output, m2, handle, + workspace); cnrtQueueSync(queue); + float* cgemm_workspace = workspace + batch * 2 * (m2 * m1); + float* temp_r = workspace; + float* temp_i = temp_r + batch * m2 * m1; + int temp_ld = m1; + int temp_stride = m2 * m1; - sgemm(batch,trans_a,trans_b,m,n,k,alpha,beta,copy_ra,copy_lda,copy_stride_a,d_ib,ldb,s_stride_b,i_c,ldc,s_stride_c,handle,sgemm_workspace); + cgemm(batch, false, false, m2, m1, m1, 1.0f, 0.0f, rd_input + m1 * ld_input, + id_input + m1 * ld_input, ld_input, stride_input, output1_r, + output1_i, ld_output, stride_output, temp_r, temp_i, temp_ld, + temp_stride, handle, cgemm_workspace); cnrtQueueSync(queue); - sgemm(batch,trans_a,trans_b,m,n,k,alpha,1,copy_ia,copy_lda,copy_stride_a,d_rb,ldb,s_stride_b,i_c,ldc,s_stride_c,handle,sgemm_workspace); + cgemm(batch, false, false, m2, m2, m1, -1.0f, 0.0f, output2_r, output2_i, + ld_output, stride_output, temp_r, temp_i, temp_ld, temp_stride, + rd_output + m1 * ld_output, id_output + m1 * ld_output, ld_output, + stride_output, handle, cgemm_workspace); cnrtQueueSync(queue); + } - - return MLUOP_STATUS_SUCCESS; + return MLUOP_STATUS_SUCCESS; } - -mluOpStatus_t complex_inverse(int batch, float *rd_input, float *id_input, int ld_input, int stride_input, float* rd_output, float* id_output, int ld_output, int stride_output, int m, mluOpHandle_t handle, float* workspace) -{ - int inverse_rec = 16; - cnrtQueue_t queue; - mluOpGetQueue(handle,&queue); - if(m <= inverse_rec) - { - - - cnrtDim3_t dim; - cnrtFunctionType_t func_type = CNRT_FUNC_TYPE_BLOCK; - dim.y = 1; - dim.z = 1; - if(batch < 8) - { - dim.x = 4 * batch; - KERNEL_CHECK(complex_inverse_kernel<<>>(batch, rd_input, id_input, ld_input, stride_input, rd_output, id_output, ld_output, stride_output, m)); - } - else - { - dim.x = batch; - KERNEL_CHECK(complex_batch_inverse_kernel<<>>(batch, rd_input, id_input, ld_input, stride_input, rd_output, id_output, ld_output, stride_output, m)); - } - - } - else - { - int m1 = m/2; - int m2 = m - m1; - - float* output1_r = rd_output; - float* output2_r = rd_output + m1*m+m1; - float* output1_i = id_output; - float* output2_i = id_output + m1*m+m1; - - - complex_inverse(batch, rd_input, id_input, ld_input, stride_input, rd_output, id_output, ld_output, stride_output, m1, handle, workspace); - complex_inverse(batch, rd_input+m1*ld_input+m1, id_input+m1*ld_input+m1, ld_input, stride_input, output2_r, output2_i, ld_output, stride_output, m2, handle, workspace); - cnrtQueueSync(queue); - - - - float* cgemm_workspace = workspace + batch*2*(m2*m1); - float* temp_r = workspace; - float* temp_i = temp_r + batch*m2*m1; - int temp_ld = m1; - int temp_stride = m2*m1; - - cgemm(batch, false,false,m2,m1,m1,1.0f,0.0f,rd_input+m1*ld_input,id_input+m1*ld_input,ld_input,stride_input,output1_r,output1_i,ld_output,stride_output,temp_r,temp_i,temp_ld,temp_stride,handle,cgemm_workspace); - cnrtQueueSync(queue); - cgemm(batch, false,false,m2,m2,m1,-1.0f,0.0f,output2_r,output2_i,ld_output,stride_output,temp_r,temp_i,temp_ld,temp_stride,rd_output+m1*ld_output,id_output + m1*ld_output,ld_output,stride_output,handle,cgemm_workspace); - cnrtQueueSync(queue); - - } - - - return MLUOP_STATUS_SUCCESS; 
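+// ctrsm below is the triangular-solve step of the blocked complex Cholesky
+// (mlu_cpotrf_rectile). Rather than substitution, it forms an explicit
+// inverse of the m x m triangular panel with complex_inverse() in workspace,
+// then applies it to the n x m right-hand side with one batched complex
+// GEMM (cgemm_real, trans_b = true), overwriting rd_b / id_b in place.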
+mluOpStatus_t ctrsm(int batch, int stride, int m, int n, float* rd_a, + float* id_a, int lda, float* rd_b, float* id_b, int ldb, + mluOpHandle_t handle, float* ctrsm_workspace) { + if (n == 0) return MLUOP_STATUS_SUCCESS; + cnrtQueue_t queue; + mluOpGetQueue(handle, &queue); + float* workspace = ctrsm_workspace + batch * m * m * 2; + CNRT_CHECK( + cnrtMemset(ctrsm_workspace, 0.0, batch * m * m * 2 * sizeof(float))); + float *r_inverse_result, *i_inverse_result; + r_inverse_result = ctrsm_workspace; + i_inverse_result = r_inverse_result + batch * m * m; + + complex_inverse(batch, rd_a, id_a, lda, stride, r_inverse_result, + i_inverse_result, m, m * m, m, handle, workspace); + cnrtQueueSync(queue); + + cgemm_real(batch, false, true, n, m, m, 1.0, 0.0f, rd_b, id_b, ldb, stride, + r_inverse_result, i_inverse_result, m, m * m, rd_b, id_b, ldb, + stride, handle, workspace); + + return MLUOP_STATUS_SUCCESS; } - -mluOpStatus_t ctrsm(int batch, int stride, int m, int n, float* rd_a, float* id_a, int lda, float* rd_b, float* id_b, int ldb, mluOpHandle_t handle, float* ctrsm_workspace) -{ - if(n==0) - return MLUOP_STATUS_SUCCESS; - cnrtQueue_t queue; - mluOpGetQueue(handle,&queue); - float* workspace = ctrsm_workspace + batch*m*m*2; - CNRT_CHECK(cnrtMemset(ctrsm_workspace, 0.0, batch*m*m*2*sizeof(float))); - float *r_inverse_result, *i_inverse_result; - r_inverse_result = ctrsm_workspace; - i_inverse_result = r_inverse_result + batch*m*m; - - - complex_inverse(batch,rd_a,id_a,lda,stride,r_inverse_result,i_inverse_result,m,m*m,m,handle,workspace); - cnrtQueueSync(queue); - - cgemm_real(batch,false,true,n,m,m,1.0,0.0f,rd_b,id_b,ldb,stride,r_inverse_result,i_inverse_result,m,m*m,rd_b,id_b,ldb,stride,handle,workspace); - - return MLUOP_STATUS_SUCCESS; +mluOpStatus_t cherk(int batch, int stride, int n, int k, float* rd_a, + float* id_a, int lda, float* rd_c, float* id_c, int ldc, + mluOpHandle_t handle, float* workspace) { + if (k == 0) return MLUOP_STATUS_SUCCESS; + cgemm(batch, false, true, n, n, k, -1.0f, 1.0f, rd_a, id_a, lda, stride, rd_a, + id_a, lda, stride, rd_c, id_c, ldc, stride, handle, workspace); + cnrtQueue_t queue; + mluOpGetQueue(handle, &queue); + cnrtQueueSync(queue); + set_half_zero(batch, stride, rd_c, ldc, n, handle); + set_half_zero(batch, stride, id_c, ldc, n, handle); + return MLUOP_STATUS_SUCCESS; } -mluOpStatus_t cherk(int batch, int stride, int n,int k, float* rd_a, float* id_a, int lda, float* rd_c, float* id_c, int ldc, mluOpHandle_t handle,float* workspace) -{ - if(k==0) - return MLUOP_STATUS_SUCCESS; - cgemm(batch,false,true,n,n,k,-1.0f,1.0f,rd_a,id_a,lda,stride,rd_a,id_a,lda,stride,rd_c,id_c,ldc,stride,handle,workspace); - cnrtQueue_t queue; - mluOpGetQueue(handle,&queue); - cnrtQueueSync(queue); - set_half_zero(batch,stride,rd_c,ldc,n,handle); - set_half_zero(batch,stride,id_c,ldc,n,handle); - return MLUOP_STATUS_SUCCESS; -} - - -mluOpStatus_t mlu_cpotrf_rectile(int batch, int stride, int n, int recnb, float* drA, float* diA, int lda, mluOpHandle_t handle, float* workspace) -{ - cnrtQueue_t queue; - mluOpGetQueue(handle,&queue); - if(n <= recnb) - { - mlu_cpotf_lpin(batch,stride, n, lda, drA, diA, queue); - } - else - { - int n1 = n/2; - int n2 = n-n1; - mlu_cpotrf_rectile(batch,stride,n1,recnb,drA,diA,lda,handle,workspace); - ctrsm(batch,stride,n1,n2,drA,diA,lda,drA+n1*lda,diA+n1*lda,lda,handle,workspace); - cherk(batch,stride,n2,n1,drA+n1*lda,diA+n1*lda,lda,drA+n1*lda+n1,diA+n1*lda+n1,lda,handle,workspace); - 
mlu_cpotrf_rectile(batch,stride,n2,recnb,drA+n1*lda+n1,diA+n1*lda+n1,lda,handle,workspace); - - } - return MLUOP_STATUS_SUCCESS; +mluOpStatus_t mlu_cpotrf_rectile(int batch, int stride, int n, int recnb, + float* drA, float* diA, int lda, + mluOpHandle_t handle, float* workspace) { + cnrtQueue_t queue; + mluOpGetQueue(handle, &queue); + if (n <= recnb) { + mlu_cpotf_lpin(batch, stride, n, lda, drA, diA, queue); + } else { + int n1 = n / 2; + int n2 = n - n1; + mlu_cpotrf_rectile(batch, stride, n1, recnb, drA, diA, lda, handle, + workspace); + ctrsm(batch, stride, n1, n2, drA, diA, lda, drA + n1 * lda, diA + n1 * lda, + lda, handle, workspace); + cherk(batch, stride, n2, n1, drA + n1 * lda, diA + n1 * lda, lda, + drA + n1 * lda + n1, diA + n1 * lda + n1, lda, handle, workspace); + mlu_cpotrf_rectile(batch, stride, n2, recnb, drA + n1 * lda + n1, + diA + n1 * lda + n1, lda, handle, workspace); + } + return MLUOP_STATUS_SUCCESS; } -mluOpStatus_t conj_complex(int batch, int m, int n,float* d_input,float* d_output, mluOpHandle_t handle) -{ - if(m==0) - return MLUOP_STATUS_SUCCESS; - cnrtQueue_t queue; - mluOpGetQueue(handle,&queue); - - mluOpTensorDescriptor_t input_desc, output_desc; - std::string api_name = "Cholesky"; +mluOpStatus_t conj_complex(int batch, int m, int n, float* d_input, + float* d_output, mluOpHandle_t handle) { + if (m == 0) return MLUOP_STATUS_SUCCESS; + cnrtQueue_t queue; + mluOpGetQueue(handle, &queue); - CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&input_desc)); - CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&output_desc)); + mluOpTensorDescriptor_t input_desc, output_desc; + std::string api_name = "Cholesky"; - int32_t input_shape[3] = {batch, m, n}; - int32_t output_shape[3] = {batch, m, n}; + CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&input_desc)); + CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&output_desc)); - CHECK_RETURN(api_name, mluOpSetTensorDescriptor( - input_desc, MLUOP_LAYOUT_ARRAY, - MLUOP_DTYPE_COMPLEX_FLOAT, 3, input_shape)); + int32_t input_shape[3] = {batch, m, n}; + int32_t output_shape[3] = {batch, m, n}; - CHECK_RETURN(api_name, mluOpSetTensorDescriptor( - output_desc, MLUOP_LAYOUT_ARRAY, - MLUOP_DTYPE_COMPLEX_FLOAT, 3, output_shape)); + CHECK_RETURN(api_name, mluOpSetTensorDescriptor( + input_desc, MLUOP_LAYOUT_ARRAY, + MLUOP_DTYPE_COMPLEX_FLOAT, 3, input_shape)); - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_desc, cnnl_in_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(output_desc, cnnl_out_desc); - + CHECK_RETURN(api_name, mluOpSetTensorDescriptor( + output_desc, MLUOP_LAYOUT_ARRAY, + MLUOP_DTYPE_COMPLEX_FLOAT, 3, output_shape)); - CALL_CNNL(cnnlConj(cnnl_handle, cnnl_in_desc, d_input, - cnnl_out_desc, d_output)); + DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); + DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_desc, cnnl_in_desc); + DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(output_desc, cnnl_out_desc); - return MLUOP_STATUS_SUCCESS; + CALL_CNNL( + cnnlConj(cnnl_handle, cnnl_in_desc, d_input, cnnl_out_desc, d_output)); + return MLUOP_STATUS_SUCCESS; } \ No newline at end of file diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/cholesky.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/cholesky.cpp index 41be2bd81..49921170d 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/cholesky.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/cholesky.cpp @@ -26,7 +26,6 @@ namespace mluoptest { - void 
CholeskyExecutor::paramCheck() { if (parser_->getInputNum() != 1) { LOG(ERROR) << "cholesky input number is wrong. "; @@ -35,372 +34,278 @@ void CholeskyExecutor::paramCheck() { LOG(ERROR) << "cholesky output number is wrong. "; } flag_quant_mode_ = NO_QUANT; - - - } -void set_matrix_zero(float*A, bool upper, bool trans_, int n_, int ldda_, mluOpDataType_t type_) -{ - if(trans_) - { - for (long int i = 0; i < n_; i++) - { - for (long int j = 0; j < ldda_; j++) - { - if(upper) - { - if(i >= j) - { - if(i == j && type_ == MLUOP_DTYPE_COMPLEX_FLOAT) - { - A[(j + i * ldda_)*2+1] = 0.0; - } - else - { - if(type_ == MLUOP_DTYPE_FLOAT) - A[j + i * ldda_] = 0.0; - else - { - A[(j + i * ldda_)*2] = 0.0; - A[(j + i * ldda_)*2+1] = 0.0; - } - } - - } - } - else - { - if(i <= j) - { - if(i == j) - { - if(type_ == MLUOP_DTYPE_COMPLEX_FLOAT) - { - A[(j + i * ldda_)*2+1] = 0.0; - } - } - else - { - if(type_ == MLUOP_DTYPE_FLOAT) - A[j + i * ldda_] = 0.0; - else - { - A[(j + i * ldda_)*2] = 0.0; - A[(j + i * ldda_)*2+1] = 0.0; - } - } - - } - } +void set_matrix_zero(float* A, bool upper, bool trans_, int n_, int ldda_, + mluOpDataType_t type_) { + if (trans_) { + for (long int i = 0; i < n_; i++) { + for (long int j = 0; j < ldda_; j++) { + if (upper) { + if (i >= j) { + if (i == j && type_ == MLUOP_DTYPE_COMPLEX_FLOAT) { + A[(j + i * ldda_) * 2 + 1] = 0.0; + } else { + if (type_ == MLUOP_DTYPE_FLOAT) + A[j + i * ldda_] = 0.0; + else { + A[(j + i * ldda_) * 2] = 0.0; + A[(j + i * ldda_) * 2 + 1] = 0.0; + } } - } - } - else - { - for (int i = 0; i < n_; i++) - { - for (int j = 0; j < ldda_; j++) - { - if((i > j && ~upper)||(i < j && upper)) - { - if(type_ == MLUOP_DTYPE_FLOAT) - A[j + i * ldda_] = 0.0; - else - { - A[(j + i * ldda_)*2] = 0.0; - A[(j + i * ldda_)*2+1] = 0.0; - } + } + } else { + if (i <= j) { + if (i == j) { + if (type_ == MLUOP_DTYPE_COMPLEX_FLOAT) { + A[(j + i * ldda_) * 2 + 1] = 0.0; + } + } else { + if (type_ == MLUOP_DTYPE_FLOAT) + A[j + i * ldda_] = 0.0; + else { + A[(j + i * ldda_) * 2] = 0.0; + A[(j + i * ldda_) * 2 + 1] = 0.0; } } + } + } + } + } + } else { + for (int i = 0; i < n_; i++) { + for (int j = 0; j < ldda_; j++) { + if ((i > j && ~upper) || (i < j && upper)) { + if (type_ == MLUOP_DTYPE_FLOAT) + A[j + i * ldda_] = 0.0; + else { + A[(j + i * ldda_) * 2] = 0.0; + A[(j + i * ldda_) * 2 + 1] = 0.0; + } } + } } - + } } -void trans_mul(float*A, float*C, int lda,bool upper_, bool trans_, int n_, int ldda_, mluOpDataType_t type_, bool diag_add) -{ - if(trans_) - { - for(long int i = 0; i = i) || (upper_==true && j >= i))) - { - A[j*lda*2+i*2] = 0.0; - A[j*lda*2+i*2+1] = 0.0; - - if(j == i&& diag_add) - { - A[j*lda*2+i*2] = 1.0; - } - } - for(long int k = 0; k <=i; k++) - { - if(upper_==false) - { - if(j < i) - continue; - else - { - if(type_ == MLUOP_DTYPE_FLOAT) - { - - A[i+j*lda] += (C[k+i*lda]*C[k+j*lda]); - } - else - { - A[(i+j*lda)*2] += (C[(k+i*lda)*2]*C[(k+j*lda)*2]+C[(k+i*lda)*2+1]*C[(k+j*lda)*2+1]); - A[(i+j*lda)*2+1] += (C[(k+i*lda)*2]*C[(k+j*lda)*2+1]-C[(k+i*lda)*2+1]*C[(k+j*lda)*2]); - } - } - if(type_ != MLUOP_DTYPE_FLOAT && j != i) - { - A[(j+i*lda)*2] = A[(i+j*lda)*2]; - A[(j+i*lda)*2+1] = -A[(i+j*lda)*2+1]; - } - } - else - { - if(type_ == MLUOP_DTYPE_FLOAT) - { - if(j > i) - continue; - else - { - A[i+j*lda] += (C[k*lda+i]*C[k*lda+j]); - } - } - else - { - if(j < i) - continue; - else - { - - A[(i+j*lda)*2] += (C[(k*lda+i)*2]*C[(k*lda+j)*2]+C[(k*lda+i)*2+1]*C[(k*lda+j)*2+1]); - A[(i+j*lda)*2+1] += 
(-C[(k*lda+i)*2]*C[(k*lda+j)*2+1]+C[(k*lda+i)*2+1]*C[(k*lda+j)*2]); - } - - } - - - } - } - if(((upper_) || (upper_==true && j > i))) - { - if(type_ != MLUOP_DTYPE_FLOAT) - { - A[(j+i*lda)*2] = A[(i+j*lda)*2]; - A[(j+i*lda)*2+1] = -A[(i+j*lda)*2+1]; - } - else - { - A[(j+i*lda)] = A[(i+j*lda)]; - } - - } - } +void trans_mul(float* A, float* C, int lda, bool upper_, bool trans_, int n_, + int ldda_, mluOpDataType_t type_, bool diag_add) { + if (trans_) { + for (long int i = 0; i < lda; i++) { + for (long int j = 0; j < n_; j++) { + if (type_ == MLUOP_DTYPE_FLOAT) { + A[i + j * lda] = 0.0; + if (j == i && diag_add) { + A[j + i * lda] = 1.0; + } + + } else if (type_ == MLUOP_DTYPE_COMPLEX_FLOAT && + ((upper_ == false && j >= i) || + (upper_ == true && j >= i))) { + A[j * lda * 2 + i * 2] = 0.0; + A[j * lda * 2 + i * 2 + 1] = 0.0; + + if (j == i && diag_add) { + A[j * lda * 2 + i * 2] = 1.0; + } } - } - else - { - for(int i = 0; i i) + continue; + else { + A[i + j * lda] += (C[k * lda + i] * C[k * lda + j]); + } + } else { + if (j < i) + continue; + else { + A[(i + j * lda) * 2] += + (C[(k * lda + i) * 2] * C[(k * lda + j) * 2] + + C[(k * lda + i) * 2 + 1] * C[(k * lda + j) * 2 + 1]); + A[(i + j * lda) * 2 + 1] += + (-C[(k * lda + i) * 2] * C[(k * lda + j) * 2 + 1] + + C[(k * lda + i) * 2 + 1] * C[(k * lda + j) * 2]); + } } + } } - } -} - -void fill_zero(float*A, bool upper_, int batch_, int n_, int ldda_, mluOpDataType_t type_,bool if_conj) -{ - int stride = n_ * ldda_; - if(type_ == MLUOP_DTYPE_FLOAT) - { - } - else - { - stride *= 2; + if (((upper_) || (upper_ == true && j > i))) { + if (type_ != MLUOP_DTYPE_FLOAT) { + A[(j + i * lda) * 2] = A[(i + j * lda) * 2]; + A[(j + i * lda) * 2 + 1] = -A[(i + j * lda) * 2 + 1]; + } else { + A[(j + i * lda)] = A[(i + j * lda)]; + } + } + } } - for(long int i = 0; i < batch_;i++) - { - for(long int j = 0; j < n_; j++) - { - for(long int h = 0; h < ldda_; h++) - { - if(j==h) - { - continue; - } - else if(j 0) - { - std::memcpy(temp_dst,temp_src,transfer_remain); - } +void cpu_transfer_data(float* dst, float* src, unsigned long data_size) { + unsigned long size_block = 1024 * 1024 * 1024; + unsigned long transfer_num = data_size / size_block; + unsigned long transfer_remain = data_size % size_block; + float *temp_dst = dst, *temp_src = src; + for (unsigned long i = 0; i < transfer_num; i++) { + std::memcpy(temp_dst, temp_src, size_block); + temp_dst += (size_block / 4); + temp_src += (size_block / 4); + } + if (transfer_remain > 0) { + std::memcpy(temp_dst, temp_src, transfer_remain); + } } -void mlu_transfer_data(float* dst, float* src, unsigned long data_size,cnrtMemTransDir_t dir) -{ - unsigned long size_block = 1024*1024*1024; - unsigned long transfer_num = data_size / size_block; - unsigned long transfer_remain = data_size % size_block; - float* temp_dst= dst, *temp_src = src; - - for(unsigned long i = 0; i < transfer_num; i++) - { - GTEST_CHECK(CNRT_RET_SUCCESS == - cnrtMemcpy(temp_dst, temp_src, size_block, dir)); - temp_dst += (size_block/4); - temp_src += (size_block/4); - } - if(transfer_remain > 0) - { - GTEST_CHECK(CNRT_RET_SUCCESS == - cnrtMemcpy(temp_dst, temp_src, transfer_remain, dir)); - } +void mlu_transfer_data(float* dst, float* src, unsigned long data_size, + cnrtMemTransDir_t dir) { + unsigned long size_block = 1024 * 1024 * 1024; + unsigned long transfer_num = data_size / size_block; + unsigned long transfer_remain = data_size % size_block; + float *temp_dst = dst, *temp_src = src; + + for (unsigned long i = 0; i < transfer_num; 
i++) { + GTEST_CHECK(CNRT_RET_SUCCESS == + cnrtMemcpy(temp_dst, temp_src, size_block, dir)); + temp_dst += (size_block / 4); + temp_src += (size_block / 4); + } + if (transfer_remain > 0) { + GTEST_CHECK(CNRT_RET_SUCCESS == + cnrtMemcpy(temp_dst, temp_src, transfer_remain, dir)); + } } -void CholeskyExecutor::prepareComputeParam() -{ -//cpu端把矩阵的一半设置成0 -//然后转置乘法,结果存到cpu端的另一个矩阵 -//然后传给gpu端 +void CholeskyExecutor::prepareComputeParam() { + // cpu端把矩阵的一半设置成0 + //然后转置乘法,结果存到cpu端的另一个矩阵 + //然后传给gpu端 printf("start prepare compute parameter.\n"); int long_int_size = sizeof(long int); int int_size = sizeof(int); - printf("long int size:%d, int size:%d\n",long_int_size,int_size); + printf("long int size:%d, int size:%d\n", long_int_size, int_size); auto input_desc_ = (tensor_desc_[0].tensor); auto output_desc_ = (tensor_desc_[1].tensor); auto dev_a = (float*)(data_vector_[0].host_ptr); @@ -413,391 +318,327 @@ void CholeskyExecutor::prepareComputeParam() int dim_size = input_shape.dims_size(); type_ = input_desc_->dtype; type_size_ = type_ == MLUOP_DTYPE_FLOAT ? 4 : 8; - if(dim_size ==2) - { - n_ = input_shape.dims(0); + if (dim_size == 2) { + n_ = input_shape.dims(0); int dim = input_desc_->dim; - stride_ = (input_desc_->strides)[dim-1]; + stride_ = (input_desc_->strides)[dim - 1]; ldda_ = input_desc_->dims[1]; - printf("n:%d,lda:%d,stride:%d,upper:%d,trans:%d\n",n_,ldda_,stride_,upper_,trans_); + printf("n:%d,lda:%d,stride:%d,upper:%d,trans:%d\n", n_, ldda_, stride_, + upper_, trans_); int size = input_desc_->dims[0]; - printf("size:%d, dim:%d, \n",size,dim); + printf("size:%d, dim:%d, \n", size, dim); printf("strides:\n"); - for(int i = 0; i < dim; i++) - { - printf("%ld ",(input_desc_->strides)[i]); + for (int i = 0; i < dim; i++) { + printf("%ld ", (input_desc_->strides)[i]); } printf("\n"); - printf("data vector length : %ld\n",data_vector_.size()); - } - else if(dim_size == 3) - { + printf("data vector length : %ld\n", data_vector_.size()); + } else if (dim_size == 3) { batch_size_ = input_shape.dims(0); - n_ = input_shape.dims(1); + n_ = input_shape.dims(1); int dim = input_desc_->dim; - stride_ = (input_desc_->strides)[dim-1]; + stride_ = (input_desc_->strides)[dim - 1]; ldda_ = input_desc_->dims[2]; - printf("batch_size:%ld,n:%d,lda:%d,stride:%d,upper:%d,trans:%d\n",batch_size_,n_,ldda_,stride_,upper_,trans_); - + printf("batch_size:%ld,n:%d,lda:%d,stride:%d,upper:%d,trans:%d\n", + batch_size_, n_, ldda_, stride_, upper_, trans_); + int size = input_desc_->dims[1]; - printf("size:%d, dim:%d, \n",size,dim); + printf("size:%d, dim:%d, \n", size, dim); printf("strides:\n"); - for(int i = 0; i < dim; i++) - { - printf("%ld ",(input_desc_->strides)[i]); + for (int i = 0; i < dim; i++) { + printf("%ld ", (input_desc_->strides)[i]); } printf("\n"); - printf("data vector length : %ld\n",data_vector_.size()); + printf("data vector length : %ld\n", data_vector_.size()); } - unsigned long total_size = batch_size_ * n_ * ldda_ * type_size_; -// unsigned long size_2g = 1024*1024*1024-1+1024*1024*1024; -// unsigned long size_2g = 1024*1024*10-1; -// int transfer_num = total_size / size_2g; - -// int transfer_remain = total_size % size_2g; -// printf("total size:%ld, transfer_num:%d, transfer_remain:%d\n",total_size,transfer_num,transfer_remain); - -// printf("matrix random:\n"); -// print_matrix(batch_size_, dev_a,ldda_,trans_,n_,ldda_,type_); -// print_matrix(batch_size_, base_line_out,ldda_,trans_,n_,ldda_,type_); - -// for(unsigned long i = 0; i < transfer_num; i++) -// { -// 
std::memcpy(dev_c+(i*size_2g),dev_a+(i*size_2g),size_2g); -// } -// printf("ddd\n"); -// if(transfer_remain > 0) -// { -// std::memcpy(dev_c+(transfer_num*size_2g),dev_a+(transfer_num*size_2g),transfer_remain); -// } -// printf("lll\n"); - cpu_transfer_data(dev_c,dev_a,total_size); -// if(batch_size_ > 16 && n_ > 2000) -// { -// std::memcpy(dev_c,dev_a,16*type_size_*n_*ldda_); -// std::memcpy(dev_c+16*type_size_/4*n_*ldda_,dev_a+16*type_size_/4*n_*ldda_,(batch_size_-16)*type_size_*n_*ldda_); -// } -// else -// { -// std::memcpy(dev_c,dev_a,batch_size_*type_size_*n_*ldda_); -// } - if(parser_->device() == CPU) - { - for(long int i = 0; i < batch_size_;i++) - { - if(type_ == MLUOP_DTYPE_FLOAT) - set_matrix_zero(dev_c+i*n_*ldda_,false,trans_,n_,ldda_,type_); + unsigned long total_size = batch_size_ * n_ * ldda_ * type_size_; + // unsigned long size_2g = 1024*1024*1024-1+1024*1024*1024; + // unsigned long size_2g = 1024*1024*10-1; + // int transfer_num = total_size / size_2g; + + // int transfer_remain = total_size % size_2g; + // printf("total size:%ld, transfer_num:%d, + // transfer_remain:%d\n",total_size,transfer_num,transfer_remain); + + // printf("matrix random:\n"); + // print_matrix(batch_size_, dev_a,ldda_,trans_,n_,ldda_,type_); + // print_matrix(batch_size_, base_line_out,ldda_,trans_,n_,ldda_,type_); + + // for(unsigned long i = 0; i < transfer_num; i++) + // { + // std::memcpy(dev_c+(i*size_2g),dev_a+(i*size_2g),size_2g); + // } + // printf("ddd\n"); + // if(transfer_remain > 0) + // { + // std::memcpy(dev_c+(transfer_num*size_2g),dev_a+(transfer_num*size_2g),transfer_remain); + // } + // printf("lll\n"); + cpu_transfer_data(dev_c, dev_a, total_size); + // if(batch_size_ > 16 && n_ > 2000) + // { + // std::memcpy(dev_c,dev_a,16*type_size_*n_*ldda_); + // std::memcpy(dev_c+16*type_size_/4*n_*ldda_,dev_a+16*type_size_/4*n_*ldda_,(batch_size_-16)*type_size_*n_*ldda_); + // } + // else + // { + // std::memcpy(dev_c,dev_a,batch_size_*type_size_*n_*ldda_); + // } + if (parser_->device() == CPU) { + for (long int i = 0; i < batch_size_; i++) { + if (type_ == MLUOP_DTYPE_FLOAT) + set_matrix_zero(dev_c + i * n_ * ldda_, false, trans_, n_, ldda_, + type_); else - set_matrix_zero(dev_c+i*n_*ldda_*2,false,trans_,n_,ldda_,type_); + set_matrix_zero(dev_c + i * n_ * ldda_ * 2, false, trans_, n_, ldda_, + type_); } -// set_matrix_zero((float*)dev_c,upper_,trans_,n_,ldda_,type_); - for(long int i = 0; i < batch_size_;i++) - { - if(type_ == MLUOP_DTYPE_FLOAT) - { - trans_mul(dev_a+i*n_*ldda_,dev_c+i*n_*ldda_,ldda_,false,trans_,n_,ldda_,type_,true); - fill_zero(dev_a,false,batch_size_,n_,ldda_,type_,false); - } - else - { - trans_mul(dev_a+i*n_*ldda_*2,dev_c+i*n_*ldda_*2,ldda_,false,trans_,n_,ldda_,type_,true); - fill_zero(dev_a,false,batch_size_,n_,ldda_,type_,true); + // set_matrix_zero((float*)dev_c,upper_,trans_,n_,ldda_,type_); + for (long int i = 0; i < batch_size_; i++) { + if (type_ == MLUOP_DTYPE_FLOAT) { + trans_mul(dev_a + i * n_ * ldda_, dev_c + i * n_ * ldda_, ldda_, false, + trans_, n_, ldda_, type_, true); + fill_zero(dev_a, false, batch_size_, n_, ldda_, type_, false); + } else { + trans_mul(dev_a + i * n_ * ldda_ * 2, dev_c + i * n_ * ldda_ * 2, ldda_, + false, trans_, n_, ldda_, type_, true); + fill_zero(dev_a, false, batch_size_, n_, ldda_, type_, true); } } } - - - - -// printf("matrix A:\n"); -// print_matrix(batch_size_,dev_a,ldda_,trans_,n_,ldda_,type_); -// printf("matrix C:\n"); -// print_matrix(batch_size_,dev_c,ldda_,trans_,n_,ldda_,type_); - 
mlu_transfer_data(dev_d,dev_a,total_size,CNRT_MEM_TRANS_DIR_HOST2DEV); + // printf("matrix A:\n"); + // print_matrix(batch_size_,dev_a,ldda_,trans_,n_,ldda_,type_); + // printf("matrix C:\n"); + // print_matrix(batch_size_,dev_c,ldda_,trans_,n_,ldda_,type_); + mlu_transfer_data(dev_d, dev_a, total_size, CNRT_MEM_TRANS_DIR_HOST2DEV); - if(parser_->device() == CPU) - { + if (parser_->device() == CPU) { float* cpu_a = cpu_fp32_input_[0]; - cpu_transfer_data(cpu_a,dev_a,total_size); - + cpu_transfer_data(cpu_a, dev_a, total_size); } - - - } void CholeskyExecutor::compute() { - -// prepareComputeParam(); - - VLOG(4) <<" CholeskyExecutor compute "; + // prepareComputeParam(); + + VLOG(4) << " CholeskyExecutor compute "; auto input_desc_ = tensor_desc_[0].tensor; auto output_desc_ = tensor_desc_[1].tensor; auto h_input = (float*)(data_vector_[0].host_ptr); auto h_output = (float*)(data_vector_[1].host_ptr); auto d_intput = (float*)(data_vector_[0].device_ptr); auto d_output = (float*)(data_vector_[1].device_ptr); - - unsigned long total_size = batch_size_ * n_ * ldda_ * type_size_; - cpu_transfer_data(h_input,h_output,total_size); - - mlu_transfer_data(h_output,d_intput,total_size,CNRT_MEM_TRANS_DIR_DEV2HOST); - -// printf("mlu before cholesky result:\n"); -// print_matrix(batch_size_,h_output,ldda_,trans_,n_,ldda_,type_); + + unsigned long total_size = batch_size_ * n_ * ldda_ * type_size_; + cpu_transfer_data(h_input, h_output, total_size); + + mlu_transfer_data(h_output, d_intput, total_size, + CNRT_MEM_TRANS_DIR_DEV2HOST); + + // printf("mlu before cholesky result:\n"); + // print_matrix(batch_size_,h_output,ldda_,trans_,n_,ldda_,type_); interface_timer_.start(); float* workspace = nullptr; size_t size = 0; - MLUOP_CHECK(mluOpGetCholeskyWorkspace(input_desc_,&size,&workspace)); + MLUOP_CHECK(mluOpGetCholeskyWorkspace(input_desc_, &size, &workspace)); -MLUOP_CHECK(mluOpCholesky(handle_,input_desc_,d_intput, output_desc_, d_output, upper_,workspace)); + MLUOP_CHECK(mluOpCholesky(handle_, input_desc_, d_intput, output_desc_, + d_output, upper_, workspace)); -MLUOP_CHECK(mluOpFreeCholeskyWorkspace(&workspace)); + MLUOP_CHECK(mluOpFreeCholeskyWorkspace(&workspace)); interface_timer_.stop(); - mlu_transfer_data(h_output,d_output,total_size,CNRT_MEM_TRANS_DIR_DEV2HOST); - - if(parser_->device() != CPU ) - { - if(result_mul) - { - for(int i = 0; i < batch_size_;i++) - { - if(type_ == MLUOP_DTYPE_FLOAT) - { - trans_mul(h_input+i*n_*ldda_,h_output+i*n_*ldda_,ldda_,upper_,trans_,n_,ldda_,type_,false); - } - else - { - trans_mul(h_input+i*n_*ldda_*2,h_output+i*n_*ldda_*2,ldda_,upper_,trans_,n_,ldda_,type_,false); - - } + mlu_transfer_data(h_output, d_output, total_size, + CNRT_MEM_TRANS_DIR_DEV2HOST); + + if (parser_->device() != CPU) { + if (result_mul) { + for (int i = 0; i < batch_size_; i++) { + if (type_ == MLUOP_DTYPE_FLOAT) { + trans_mul(h_input + i * n_ * ldda_, h_output + i * n_ * ldda_, ldda_, + upper_, trans_, n_, ldda_, type_, false); + } else { + trans_mul(h_input + i * n_ * ldda_ * 2, h_output + i * n_ * ldda_ * 2, + ldda_, upper_, trans_, n_, ldda_, type_, false); } - h_output = h_input; - fill_zero(h_output,upper_,batch_size_,n_,ldda_,type_,true); - } - else - { - - fill_zero(h_output,upper_,batch_size_,n_,ldda_,type_,false); + } + h_output = h_input; + fill_zero(h_output, upper_, batch_size_, n_, ldda_, type_, true); + } else { + fill_zero(h_output, upper_, batch_size_, n_, ldda_, type_, false); } - if(type_ != MLUOP_DTYPE_FLOAT) - { - set_diag_imag_one(h_output,batch_size_,n_,ldda_); + 
if (type_ != MLUOP_DTYPE_FLOAT) { + set_diag_imag_one(h_output, batch_size_, n_, ldda_); } - - mlu_transfer_data(d_output,h_output,total_size,CNRT_MEM_TRANS_DIR_HOST2DEV); - - - - + mlu_transfer_data(d_output, h_output, total_size, + CNRT_MEM_TRANS_DIR_HOST2DEV); } - -// printf("mlu after cholesky result:\n"); -// print_matrix(batch_size_,h_output,ldda_,trans_,n_,ldda_,type_); - + // printf("mlu after cholesky result:\n"); + // print_matrix(batch_size_,h_output,ldda_,trans_,n_,ldda_,type_); - return; } -void cpu_compute(float* cpu_c, int n_, int ldda_, bool upper_, bool trans_, mluOpDataType_t type_) -{ - - if(trans_) - { - for(long int i = 0; i < n_; i++) - { - float dia; - if(type_ == MLUOP_DTYPE_FLOAT) - { - dia = cpu_c[i+i*ldda_]; - } - else - { - dia = cpu_c[(i+i*ldda_)*2]; - } - float dia_root = sqrt(dia); - - if(type_ == MLUOP_DTYPE_FLOAT) - { - cpu_c[i+i*ldda_] = sqrt(dia); +void cpu_compute(float* cpu_c, int n_, int ldda_, bool upper_, bool trans_, + mluOpDataType_t type_) { + if (trans_) { + for (long int i = 0; i < n_; i++) { + float dia; + if (type_ == MLUOP_DTYPE_FLOAT) { + dia = cpu_c[i + i * ldda_]; + } else { + dia = cpu_c[(i + i * ldda_) * 2]; + } + float dia_root = sqrt(dia); + + if (type_ == MLUOP_DTYPE_FLOAT) { + cpu_c[i + i * ldda_] = sqrt(dia); + } else { + cpu_c[(i + i * ldda_) * 2] = sqrt(dia); + } + if (upper_ == false) { + if (type_ == MLUOP_DTYPE_FLOAT) { + for (long int j = i + 1; j < n_; j++) { + cpu_c[i + j * ldda_] = cpu_c[i + j * ldda_] / dia_root; + } + for (long int j = i + 1; j < n_; j++) { + for (long int k = j; k < n_; k++) { + cpu_c[j + k * ldda_] -= + (cpu_c[i + k * ldda_] * cpu_c[i + j * ldda_]); } - else - { - cpu_c[(i+i*ldda_)*2] = sqrt(dia); + } + } else { + for (long int j = 0; j < i; j++) { + cpu_c[(i + j * ldda_) * 2] = 0; + cpu_c[(i + j * ldda_) * 2 + 1] = 0; + } + for (long int j = i + 1; j < n_; j++) { + cpu_c[(i + j * ldda_) * 2] = cpu_c[(i + j * ldda_) * 2] / dia_root; + cpu_c[(i + j * ldda_) * 2 + 1] = + cpu_c[(i + j * ldda_) * 2 + 1] / dia_root; + } + for (long int j = i + 1; j < n_; j++) { + for (long int k = j; k < n_; k++) { + cpu_c[(j + k * ldda_) * 2] -= + (cpu_c[(i + k * ldda_) * 2] * cpu_c[(i + j * ldda_) * 2] + + cpu_c[(i + k * ldda_) * 2 + 1] * + cpu_c[(i + j * ldda_) * 2 + 1]); + cpu_c[(j + k * ldda_) * 2 + 1] -= + (cpu_c[(i + k * ldda_) * 2 + 1] * cpu_c[(i + j * ldda_) * 2] - + cpu_c[(i + k * ldda_) * 2] * cpu_c[(i + j * ldda_) * 2 + 1]); } - if(upper_==false) - { - if(type_ == MLUOP_DTYPE_FLOAT) - { - for(long int j = i+1;j Date: Tue, 13 Aug 2024 01:34:56 +0800 Subject: [PATCH 13/27] [Fix](mluOpCholesky): fix format --- kernels/cholesky/cholesky.cpp | 47 ++- kernels/cholesky/cholesky.h | 104 +++-- mlu_op.h | 376 ++++++++++++++++-- .../pb_gtest/src/zoo/cholesky/cholesky.cpp | 167 ++++---- .../pb_gtest/src/zoo/cholesky/cholesky.h | 2 +- 5 files changed, 504 insertions(+), 192 deletions(-) diff --git a/kernels/cholesky/cholesky.cpp b/kernels/cholesky/cholesky.cpp index a4994578c..14f184340 100644 --- a/kernels/cholesky/cholesky.cpp +++ b/kernels/cholesky/cholesky.cpp @@ -1,3 +1,26 @@ +/************************************************************************* + * Copyright (C) [2024] by Cambricon, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *************************************************************************/ + #include "cholesky.h" // calculates the required workspace size for performing the Cholesky // decomposition on a given matrix or batch of matrices. @@ -17,10 +40,10 @@ mluOpStatus_t MLUOP_WIN_API mluOpGetCholeskyWorkspace( PARAM_CHECK("mluOpCholesky", dtype == MLUOP_DTYPE_FLOAT || dtype == MLUOP_DTYPE_COMPLEX_FLOAT); - unsigned long type_size; + uint64_t type_size; MLUOP_CHECK(mluOpGetSizeOfDataType(dtype, &type_size)); - long int size_a = 0, lda = 0, size_c = 0, ldc = 0; - long int batch_size = 1; + int64_t size_a = 0, lda = 0, size_c = 0, ldc = 0; + int64_t batch_size = 1; int dim = input_desc->dim; if (dim == 2) { size_a = input_desc->dims[0]; @@ -68,7 +91,7 @@ calculate_body(mluOpHandle_t handle, int batch_size, int dim = input_desc->dim; bool is_row_major = (input_desc->strides)[dim - 1] == 1; - unsigned long type_size; + uint64_t type_size; MLUOP_CHECK(mluOpGetSizeOfDataType(dtype, &type_size)); int size_a = 0, lda = 0, size_c = 0, ldc = 0; if (dim == 2) { @@ -101,7 +124,7 @@ calculate_body(mluOpHandle_t handle, int batch_size, } else { CNRT_CHECK( cnrtMemcpy(d_output, d_input, - type_size * size_a * lda * ((unsigned long)batch_size), + type_size * size_a * lda * ((uint64_t)batch_size), CNRT_MEM_TRANS_DIR_DEV2DEV)); } } else { @@ -157,7 +180,7 @@ calculate_body(mluOpHandle_t handle, int batch_size, cnrtQueueSync(queue); CNRT_CHECK( cnrtMemcpy(d_output, workspace, - type_size * size_a * lda * ((unsigned long)batch_size), + type_size * size_a * lda * ((uint64_t)batch_size), CNRT_MEM_TRANS_DIR_DEV2DEV)); } } else { @@ -229,12 +252,12 @@ calculate_body(mluOpHandle_t handle, int batch_size, CNRT_CHECK(cnrtMemcpy( d_output + type_size / 4 * size_a * lda * 16, workspace + type_size / 4 * size_a * lda * 16, - type_size * size_a * lda * ((unsigned long)batch_size - 16), + type_size * size_a * lda * ((uint64_t)batch_size - 16), CNRT_MEM_TRANS_DIR_DEV2DEV)); } else { CNRT_CHECK( cnrtMemcpy(d_output, workspace, - type_size * size_a * lda * ((unsigned long)batch_size), + type_size * size_a * lda * ((uint64_t)batch_size), CNRT_MEM_TRANS_DIR_DEV2DEV)); } } @@ -251,7 +274,7 @@ calculate_body(mluOpHandle_t handle, int batch_size, // matrices are either float or complex float types and performs the // decomposition either on the upper or lower triangular part of the matrix, // based on the 'upper' boolean flag. 
-mluOpStatus_t MLUOP_WIN_API mluOpStatus_t MLUOP_WIN_API +mluOpStatus_t MLUOP_WIN_API mluOpCholesky(mluOpHandle_t handle, const mluOpTensorDescriptor_t input_desc, float* d_input, const mluOpTensorDescriptor_t output_desc, float* d_output, bool upper, float* workspace) { @@ -298,14 +321,14 @@ mluOpCholesky(mluOpHandle_t handle, const mluOpTensorDescriptor_t input_desc, ldc = output_desc->dims[2]; } - unsigned long type_size; + uint64_t type_size; MLUOP_CHECK(mluOpGetSizeOfDataType(dtype, &type_size)); if (type_size == 8 && batch_size > 16 && size_a > 2000) { int stride = 2 * size_a * lda; calculate_body(handle, 16, input_desc, d_input, output_desc, d_output, upper, workspace); cnrtQueueSync(queue); - calculate_body(handle, ((unsigned long)batch_size) - 16, input_desc, + calculate_body(handle, ((uint64_t)batch_size) - 16, input_desc, d_input + 16 * stride, output_desc, d_output + 16 * stride, upper, workspace); } else { @@ -314,4 +337,4 @@ mluOpCholesky(mluOpHandle_t handle, const mluOpTensorDescriptor_t input_desc, } return MLUOP_STATUS_SUCCESS; -} \ No newline at end of file +} diff --git a/kernels/cholesky/cholesky.h b/kernels/cholesky/cholesky.h index 16f3fdf71..425b3f4ef 100644 --- a/kernels/cholesky/cholesky.h +++ b/kernels/cholesky/cholesky.h @@ -1,3 +1,27 @@ +/************************************************************************* + * Copyright (C) [2024] by Cambricon, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *************************************************************************/ + + #ifndef __CHOLESKY_H #define __CHOLESKY_H @@ -10,6 +34,7 @@ #include #include #include +#include // #include #include "mlu_op.h" #include "core/gen_case.h" @@ -22,73 +47,74 @@ #define CNB (16) #define REC_NB (16) -#define POTF_NB ((REC_NB) / 4) +#define POTF_NB ((REC_NB)/4) #define CREC_NB (16) -#define CPOTF_NB ((CREC_NB) / 4) +#define CPOTF_NB ((CREC_NB)/4) // #define CPOTF_NB ((CREC_NB)) #define __CNRT_FUNC_TYPE__ CNRT_FUNC_TYPE_UNION1 #define TASK_NUM (4) #define NB (32) #define CLUSTER_NUM 1 -#define M (TASK_NUM * POTF_NB) // POTF边长 +#define M (TASK_NUM * POTF_NB) #define ZERO 0.0 -#define SHARED_MEM_SIZE (((M * POTF_NB / TASK_NUM * 4) + (POTF_NB * POTF_NB))) +#define SHARED_MEM_SIZE (((M*POTF_NB/TASK_NUM * 4)+(POTF_NB * POTF_NB))) #define OFFSET_ROW(A, i, j) A + ((i) * (lda) + (j)) #define OFFSET_B_ROW(B, i, j) B + ((i) * (ldb) + (j)) -mluOpStatus_t mlu_spotrf_rectile(int batch, int stride, bool trans, bool uplo, - int n, int recnb, float* dA, int ldda, - int gbstep, mluOpHandle_t handle); -// void mluOpCholesky(bool trans, bool uplo, int n, float* dA, float* dC, int -// ldda); -mluOpStatus_t ssyrk(int batch, int stride, bool upper, bool trans, int n, int k, - float* d_a, int ldda, float* d_c, int lddc, - mluOpHandle_t handle); +mluOpStatus_t mlu_spotrf_rectile(int batch, int stride, bool trans, + bool uplo, int n, int recnb, float* dA, int ldda, + int gbstep, mluOpHandle_t handle, float* workspace); -mluOpStatus_t sgemm(int batch, bool trans_a, bool trans_b, int m, int n, int k, - float alpha, float beta, float* d_a, int lda, int stride_a, - float* d_b, int ldb, int stride_b, float* d_c, int ldc, - int stride_c, mluOpHandle_t handle); +mluOpStatus_t ssyrk(int batch, int stride, bool upper, bool trans, + int n, int k, float* d_a, int ldda, float* d_c, + int lddc, mluOpHandle_t handle, float* workspace); + +mluOpStatus_t sgemm(int batch, bool trans_a, bool trans_b, int m, + int n, int k, float alpha, float beta, float* d_a, + int lda, int stride_a, float* d_b, int ldb, + int stride_b, float* d_c, int ldc, + int stride_c, mluOpHandle_t handle, float* workspace); // side:true->right // false->left -mluOpStatus_t strsm(int batch, int stride, bool upper, bool trans, int m, int n, - float* d_a, int ldda, float* d_b, int lddb, - mluOpHandle_t handle); +mluOpStatus_t strsm(int batch, int stride, bool upper, bool trans, + int m, int n, float* d_a, int ldda, float* d_b, + int lddb, mluOpHandle_t handle, float* workspace); mluOpStatus_t transpose(int batch, int m, int n, float* d_input, - float* d_output, mluOpHandle_t handle, - mluOpDataType_t type, float* workspace); + float* d_output, mluOpHandle_t handle, + mluOpDataType_t type, float* workspace); mluOpStatus_t conj_complex(int batch, int m, int n, float* d_input, - float* d_output, mluOpHandle_t handle); + float* d_output, mluOpHandle_t handle); -mluOpStatus_t mlu_cpotrf_rectile(int batch, int stride, int n, int recnb, - float* drA, float* diA, int lda, - mluOpHandle_t handle); +mluOpStatus_t mlu_cpotrf_rectile(int batch, int stride, int n, + int recnb, float* drA, float* diA, int lda, + mluOpHandle_t handle, float* workspace); -mluOpStatus_t cgemm(int batch, bool trans_a, bool trans_b, int m, int n, int k, - float alpha, float beta, float* d_ra, float* d_ia, int lda, - int stride_a, float* d_rb, float* d_ib, int ldb, - int stride_b, float* d_rc, float* d_ic, int ldc, - int stride_c, mluOpHandle_t handle); +mluOpStatus_t cgemm(int batch, bool trans_a, bool 
trans_b, int m, + int n, int k, float alpha, float beta, float* d_ra, + float* d_ia, int lda, int stride_a, float* d_rb, + float* d_ib, int ldb, int stride_b, float* d_rc, + float* d_ic, int ldc, int stride_c, mluOpHandle_t handle, + float* workspace); mluOpStatus_t workspace_malloc(size_t size, float** workspace); -// mluOpStatus_t complex_set_half_zero(int batch, int stride, float* d_a, int m, -// int ld); +mluOpStatus_t workspace_free(float** workspace); + -mluOpStatus_t set_half_zero(int batch, int stride, float* d_a, int lda, int m, - mluOpHandle_t handle); +mluOpStatus_t set_half_zero(int batch, int stride, float* d_a, int lda, + int m, mluOpHandle_t handle); mluOpStatus_t ctrsm(int batch, int stride, int m, int n, float* rd_a, - float* id_a, int lda, float* rd_b, float* id_b, int ldb, - mluOpHandle_t handle); + float* id_a, int lda, float* rd_b, float* id_b, + int ldb, mluOpHandle_t handle, float* workspace); mluOpStatus_t cherk(int batch, int stride, int n, int k, float* rd_a, - float* id_a, int lda, float* rd_c, float* id_c, int ldc, - mluOpHandle_t handle); + float* id_a, int lda, float* rd_c, float* id_c, + int ldc, mluOpHandle_t handle, float* workspace); -#endif \ No newline at end of file +#endif diff --git a/mlu_op.h b/mlu_op.h index 72a79a19f..814511943 100644 --- a/mlu_op.h +++ b/mlu_op.h @@ -3835,6 +3835,10 @@ mluOpDynamicPointToVoxelForward(const mluOpHandle_t handle, /*! * @brief Gets extra space size that is needed in the GenerateProposalsV2 operation. * + * @par Deprecated + * - ::mluOpGetGenerateProposalsV2WorkspaceSize is deprecated and will be removed in the future + * release. It is recommended to use ::mluOpGetGenerateProposalsV2WorkspaceSize_v2 instead. + * * @param[in] handle * Handle to a Cambricon MLU-OPS context that is used to manage MLU devices * and queues in the GenerateProposalsV2 operation. @@ -3866,11 +3870,59 @@ mluOpDynamicPointToVoxelForward(const mluOpHandle_t handle, * - None. * * @par Reference - * + * - None. */ mluOpStatus_t MLUOP_WIN_API mluOpGetGenerateProposalsV2WorkspaceSize(mluOpHandle_t handle, const mluOpTensorDescriptor_t scores_desc, size_t *size); +// Group: GenerateProposalsV2 +/*! + * @brief Gets extra space size that is needed in the GenerateProposalsV2 operation. + * + * Compared with ::mluOpGetGenerateProposalsV2WorkspaceSize, this function supports + * parameter \p pre_nms_top_n. + * + * @param[in] handle + * Handle to a Cambricon MLU-OPS context that is used to manage MLU devices + * and queues in the GenerateProposalsV2 operation. + * @param[in] scores_desc + * The descriptor of the tensor \b scores. For detailed information, + * see ::mluOpTensorDescriptor_t. + * @param[in] pre_nms_top_n + * The number of top scoring RPN proposals to keep before applying NMS. + * @param[out] size + * A host pointer to the returned size of extra space in bytes. + * + * @par Return + * - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_BAD_PARAM, ::MLUOP_STATUS_NOT_SUPPORTED + * + * @par Data Type + * - None. + * + * @par Data Layout + * - None. + * + * @par Scale Limitation + * - None. + * + * @par API Dependency + * - None. + * + * @par Note + * - None. + * + * @par Example + * - None. + * + * @par Reference + * - None. + */ +mluOpStatus_t MLUOP_WIN_API +mluOpGetGenerateProposalsV2WorkspaceSize_v2(mluOpHandle_t handle, + const mluOpTensorDescriptor_t scores_desc, + const int32_t pre_nms_top_n, + size_t *size); + // Group: GenerateProposalsV2 /*! * @brief Generates bounding box proposals for Faster Region-CNN. 
@@ -8759,6 +8811,228 @@ mluOpActiveRotatedFilterForward(const mluOpHandle_t handle, const mluOpTensorDescriptor_t output_desc, void *output); +/*! + * @brief Enumeration variables describing the attributes of the AdamW computation. + */ +typedef enum { + MLUOP_ADAMW_WEIGHT_DECAY = 0, + /*!< Set the weight_decay attribute for the AdamW operation. */ + MLUOP_ADAMW_GRAD_SCALE = 1, + /*!< Set the grad_scale attribute for the AdamW operation. */ + MLUOP_ADAMW_USE_NESTEROV = 2, + /*!< Specifies whether to use nesterov on the AdamW operation. */ +} mluOpAdamWDescAttribute_t; + +typedef struct mluOpAdamWStruct *mluOpAdamWDescriptor_t; + +// Group: AdamW +/*! + * @brief Updates each attribute by using AdamW. + * + * @param[in] handle + * Handle to a Cambricon MLU-OPS context that is used to manage MLU devices + * and queues in the AdamW operation. For detailed information, + * see ::mluOpHandle_t. + * @param[in] adamw_desc + * A host pointer to the AdamW descriptor that holds information about the AdamW operation. + * @param[in] param_desc + * The descriptor of the tensor, which contains the dimension and layout of param. + * For detailed information, see ::mluOpTensorDescriptor_t. + * @param[in] param + * Pointer to the MLU memory that stores the param tensor. + * @param[in] paramh_desc + * The descriptor of the tensor, which contains the dimension and layout of param_h. + * For detailed information, see ::mluOpTensorDescriptor_t. + * @param[in] param_h + * Pointer to the MLU memory that stores the param_h tensor. + * @param[in] momentum_desc + * The descriptor of the tensor, which contains the dimension and layout of momentum. + * For detailed information, see ::mluOpTensorDescriptor_t. + * @param[in] momentum + * Pointer to the MLU memory that stores the momentum tensor. + * @param[in] velocity_desc + * The descriptor of the tensor, which contains the dimension and layout of velocity. + * For detailed information, see ::mluOpTensorDescriptor_t. + * @param[in] velocity + * Pointer to the MLU memory that stores the velocity tensor. + * @param[in] grad_desc + * The descriptor of the tensor, which contains the dimension and layout of grad. + * For detailed information, see ::mluOpTensorDescriptor_t. + * @param[in] grad + * Pointer to the MLU memory that stores the grad tensor. + * @param[in] lr + * A scalar of lr factor that is used for AdamW. + * @param[in] beta1 + * A scalar of beta1 factor that is used for AdamW. + * @param[in] beta2 + * A scalar of beta2 factor that is used for AdamW. + * @param[in] bias1 + * A scalar of bias1 factor that is used for AdamW. + * @param[in] bias2 + * A scalar of bias2 factor that is used for AdamW. + * @param[in] epsilon + * A scalar of epsilon factor that is used for AdamW. + * @par Return + * - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_BAD_PARAM, ::MLUOP_STATUS_ARCH_MISMATCH + * + * @par Data Type + * - The supported data types of input and output tensors are as follows: + * - param tensor: float + * - param_h tensor: bfloat16 + * - momentum tensor: float + * - velocity tensor: float + * - grad tensor: bfloat16 + * + * @par Data Layout + * - The supported data layouts of \b param tensor, \b param_h tensor, \b momentum tensor, \b velocity tensor, and \b + * grad tensor are as follows: + * - param tensor: \p MLUOP_LAYOUT_ARRAY + * - param_h tensor: \p MLUOP_LAYOUT_ARRAY + * - momentum tensor: \p MLUOP_LAYOUT_ARRAY + * - velocity tensor: \p MLUOP_LAYOUT_ARRAY + * - grad tensor: \p MLUOP_LAYOUT_ARRAY + * + * @par Scale Limitation + * - None. 
+ * + * @par API Dependency + * - None. + * + * @par Note + * - None. + * + * @par Example + * - None. + * + * @par Reference + * - https://github.com/OpenBMB/BMTrain/blob/6abcf772aa1e120192f7656e55c4adbcde53c886/csrc/cuda/adam_cuda.cu + */ +mluOpStatus_t MLUOP_WIN_API +mluOpAdamW(mluOpHandle_t handle, + mluOpAdamWDescriptor_t adamw_desc, + const mluOpTensorDescriptor_t param_desc, + void *param, + const mluOpTensorDescriptor_t paramh_desc, + void *param_h, + const mluOpTensorDescriptor_t momentum_desc, + void *momentum, + const mluOpTensorDescriptor_t velocity_desc, + void *velocity, + const mluOpTensorDescriptor_t grad_desc, + void *grad, + const float lr, + const float beta1, + const float beta2, + const float bias1, + const float bias2, + const float epsilon); + +// Group: AdamW +/*! + * @brief Creates a descriptor pointed by \p adamw_desc for AdamW operation. + * The information is defined in ::mluOpAdamWDescriptor_t. + * For more information about the descriptor, see "Cambricon MLU-OPS User Guide". + * + * @param[out] adamw_desc + * A host pointer to the AdamW descriptor that holds information about the + * AdamW operation. + * + * @par Return + * - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_ALLOC_FAILED + * + * @par API Dependency + * - After calling this function, call ::mluOpSetAdamWDescAttr function to initialize + * and set the information to the AdamW descriptor. + * + * @par Note + * - None. + * + * @par Requirements + * - None. + * + * @par Example + * - None. + * + * @par Reference + * - None. + */ +mluOpStatus_t MLUOP_WIN_API +mluOpCreateAdamWDescriptor(mluOpAdamWDescriptor_t *adamw_desc); + +// Group: AdamW +/*! + * @brief Initializes the descriptor \b adamw_desc that was previously created with + * ::mluOpCreateAdamWDescriptor function, and sets AdamW information + * to the descriptor \b adamw_desc. The information includes \b weight_decay , \b grad_scale + * and \b use_nesterov for AdamW operation. + * + * @param[in] adamw_desc + * The descriptor of the AdamW operation. For detailed information, + * see ::mluOpAdamWDescriptor_t. + * @param[in] attr + * Attribute of AdamW descriptor to be set. For detailed information, + * see ::mluOpAdamWDescAttribute_t. + * @param[in] buf + * A host pointer to the attribute value set by this function. + * @param[in] size_in_bytes + * Buffer in bytes for verification. + * + * @par Return + * - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_BAD_PARAM + * + * @par Data Type + * - None. + * + * @par Data Layout + * - None. + * + * @par Scale Limitation + * - None. + * + * @par API Dependency + * - This function should be called after ::mluOpCreateAdamWDescriptor. + * + * @par Note + * - None. + * + * @par Example + * - None. + * + * @par Reference + * - None. + */ +mluOpStatus_t MLUOP_WIN_API +mluOpSetAdamWDescAttr(mluOpAdamWDescriptor_t adamw_desc, + mluOpAdamWDescAttribute_t attr, + const void *buf, + const size_t size_in_bytes); + +// Group: AdamW +/*! + * @brief Destroys the AdamW descriptor \p adamw_desc that was previously created by + * ::mluOpCreateAdamWDescriptor. + * + * @param[in] adamw_desc + * The AdamW descriptor to be destroyed. + * @par Return + * - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_BAD_PARAM + * + * @par Note + * - Call this function after calling ::mluOpAdamW. + * - It is necessary to call this function to destroy the AdamW descriptor to avoid memory leak. + * + * @par Requirements + * - None. + * + * @par Example + * - None. 
+ * + * @par Reference + * - None + */ +mluOpStatus_t MLUOP_WIN_API +mluOpDestroyAdamWDescriptor(mluOpAdamWDescriptor_t adamw_desc); + // Group: DeformRoiPool /*! * @brief Computes deformable roi pooling over \b input tensor. This function firstly divides the obtained @@ -14037,22 +14311,22 @@ mluOpSetFFTReserveArea(mluOpHandle_t handle, mluOpFFTPlan_t fft_plan, void *rese * * @param[in] handle * Handle to a Cambricon MLUOP context that is used to manage MLU devices and queues - * in the FFT execution. For detailed information, see ::mluOpHandle_t. - * @param[in] fft_plan - * The plan for FFT execution. For detailed information, see ::mluOpFFTPlan_t. - * @param[in] input - * Pointer to the MLU memory that stores the input tensor. - * @param[in] scale_factor - * Input. A float-point scalar used to multiply the FFT output. - * @param[in, out] workspace - * Pointer to the MLU memory that is used as an extra workspace for the - * ::mluOpExecFFT. - * @param[out] output - * Pointer to the MLU memory that stores the output tensor. - * @param[in] direction - * The transform direction: 0 means FFT forward and 1 means FFT inverse. - * Direction is ignored for real-to-complex and complex-to-real transforms. - * + * in the FFT operation. For detailed information, see ::mluOpHandle_t. + * @param[in,out] fft_plan + * Plan for the FFT operation. This parameter is used to store the configuration of the FFT operation. + * @param[in,out] input + * Input tensor for the FFT operation. This parameter is used to provide the data to be transformed. + * @param[in,out] scale_factor + * Scale factor applied to the FFT operation. This parameter is used to normalize the result. + * @param[in,out] workspace + * Workspace buffer used during the FFT operation. This parameter is used to store intermediate + * results and other temporary data. + * @param[in,out] output + * Output tensor for the FFT operation. This parameter is used to store the result of the + * FFT transformation. + * @param[in,out] direction + * Direction of the FFT operation. This parameter specifies whether to perform a + * forward or inverse FFT transformation. * @par Note * - For in-place 1D real-to-complex FFTs, the input is a batch of n real numbers, and the * output is n/2 + 1 non-redundant complex numbers. This requires a padding of input array. @@ -14075,13 +14349,31 @@ mluOpSetFFTReserveArea(mluOpHandle_t handle, mluOpFFTPlan_t fft_plan, void *rese * - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_BAD_PARAM, ::MLUOP_STATUS_INTERNAL_ERROR * * @par Data Type - * - None. + * - The supported data types of \p input and \p output tensors are as follows: + * - real-to-complex FFT: + * - half(input offchip)-complex_half(output offchip)-half(input onchip) + * - float(input offchip)-complex_float(output offchip)-float(input onchip) + * - complex-to-real FFT: + * - complex_half(input offchip)-half(output offchip)-half(input onchip) + * - complex_float(input offchip)-float(output offchip)-float(input onchip) + * - complex-to-complex FFT: + * - complex_half(input offchip)-complex_half(output offchip)-half(input onchip) + * - complex_float(input offchip)-complex_float(output offchip)-float(input onchip) * * @par Data Layout * - None. * * @par Scale Limitation - * - None. + * - For float data types, FFT supports any combination of powers of i (i from 2 to 64), as well as \f$2^mL\f$. + * This means that for float data types, FFT can handle a wide range of sizes, allowing flexibility in choosing the + * dimensions of the input data. 
The values of i can be any integer from 2 to 64, enabling combinations such as 4, 8, + * 16, etc., as well as sizes that are a product of a power of 2 and an additional integer L. + * + * - For half data types, FFT support is more limited. It only supports sizes of 2^m, where m is an integer. This + * constraint means that the input size for half data types must be a power of 2. This restriction is important to note + * when planning to use FFT with half-precision floating-point data, as it limits the flexibility compared to float data + * types. + * * * @par API Dependency * - Before calling this function, you need to call the ::mluOpCreateFFTPlan @@ -14145,7 +14437,8 @@ mluOpStatus_t MLUOP_WIN_API mluOpDestroyFFTPlan(mluOpFFTPlan_t fft_plan); /*! - * @brief Computes the Cholesky decomposition of a symmetric positive-definite matrix using the input tensor descriptor \p input_desc and writes the result to the output tensor descriptor \p output_desc. + * @brief Computes the Cholesky decomposition of a symmetric positive-definite matrix using the input tensor descriptor + * \p input_desc and writes the result to the output tensor descriptor \p output_desc. * * @param[in] handle * The handle to the MLU operation environment. @@ -14158,7 +14451,8 @@ mluOpDestroyFFTPlan(mluOpFFTPlan_t fft_plan); * @param[out] d_output * Pointer to the output data in device memory. * @param[in] upper - * Boolean flag to indicate whether to compute the upper or lower triangular Cholesky factor. True for upper, False for lower. + * Boolean flag to indicate whether to compute the upper or lower triangular Cholesky factor. True for upper, False for + * lower. * @param[in] workspace * Pointer to workspace buffer in device memory used for intermediate computations. * @@ -14187,26 +14481,30 @@ mluOpDestroyFFTPlan(mluOpFFTPlan_t fft_plan); * - None. */ -mluOpStatus_t MLUOP_WIN_API +mluOpStatus_t MLUOP_WIN_API mluOpCholesky(mluOpHandle_t handle, const mluOpTensorDescriptor_t input_desc, - float* d_input, - const mluOpTensorDescriptor_t output_desc, - float* d_output,bool upper, float* workspace); + float *d_input, + const mluOpTensorDescriptor_t output_desc, + float *d_output, + bool upper, + float *workspace); /*! - * @brief Calculates the size of the workspace required for the Cholesky decomposition and initializes a workspace pointer. - * This function must be called before performing Cholesky decomposition using mluOpCholesky. + * @brief Calculates the size of the workspace required for the Cholesky decomposition and initializes a workspace + * pointer. This function must be called before performing Cholesky decomposition using mluOpCholesky. * * @param[in] input_desc * The descriptor for the input tensor for which the Cholesky decomposition will be performed. * @param[out] size * Pointer to a size_t variable where the size of the required workspace will be stored. * @param[out] workspace - * Double pointer to a float, used to allocate memory for the workspace. This pointer will be set to point to the allocated workspace. + * Double pointer to a float, used to allocate memory for the workspace. This pointer will be set to point to the + * allocated workspace. 
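+ *
+ * A typical call sequence is sketched below; it mirrors the usage in the accompanying gtest executor.
+ * The \b handle, the tensor descriptors, and the device pointers \b d_input and \b d_output are assumed
+ * to have been created elsewhere, and return-status checking is omitted for brevity.
+ * @code
+ * size_t workspace_size = 0;
+ * float *workspace = nullptr;
+ * mluOpGetCholeskyWorkspace(input_desc, &workspace_size, &workspace);
+ * // upper == false computes the lower triangular factor
+ * mluOpCholesky(handle, input_desc, d_input, output_desc, d_output, false, workspace);
+ * mluOpFreeCholeskyWorkspace(&workspace);
+ * @endcode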
* * @par Return - * - ::MLUOP_STATUS_SUCCESS if the workspace size is successfully calculated and the workspace is successfully allocated, + * - ::MLUOP_STATUS_SUCCESS if the workspace size is successfully calculated and the workspace is successfully + * allocated, * - ::MLUOP_STATUS_EXECUTION_FAILED if there are issues during the calculation or memory allocation. * * @par Data Type @@ -14231,18 +14529,17 @@ mluOpCholesky(mluOpHandle_t handle, * - None. */ - -mluOpStatus_t MLUOP_WIN_API -mluOpGetCholeskyWorkspace(mluOpTensorDescriptor_t input_desc, - size_t* size, float** workspace); +mluOpStatus_t MLUOP_WIN_API +mluOpGetCholeskyWorkspace(mluOpTensorDescriptor_t input_desc, size_t *size, float **workspace); /*! * @brief Frees the memory allocated for the Cholesky decomposition workspace. - * This function should be called to release the workspace memory used by the Cholesky operations after they are no longer needed. + * This function should be called to release the workspace memory used by the Cholesky operations after they are no + * longer needed. * * @param[in,out] workspace - * Double pointer to the workspace memory that was allocated by mluOpGetCholeskyWorkspace or another allocation function. - * After calling this function, the pointer will be set to NULL to prevent accidental reuse. + * Double pointer to the workspace memory that was allocated by mluOpGetCholeskyWorkspace or another allocation + * function. After calling this function, the pointer will be set to NULL to prevent accidental reuse. * * @par Return * - ::MLUOP_STATUS_SUCCESS if the workspace is successfully freed, @@ -14270,11 +14567,8 @@ mluOpGetCholeskyWorkspace(mluOpTensorDescriptor_t input_desc, * - None. */ - -mluOpStatus_t MLUOP_WIN_API -mluOpFreeCholeskyWorkspace(float** workspace); - - +mluOpStatus_t MLUOP_WIN_API +mluOpFreeCholeskyWorkspace(float **workspace); #if defined(__cplusplus) } diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/cholesky.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/cholesky.cpp index 49921170d..091ad3272 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/cholesky.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/cholesky.cpp @@ -39,16 +39,16 @@ void CholeskyExecutor::paramCheck() { void set_matrix_zero(float* A, bool upper, bool trans_, int n_, int ldda_, mluOpDataType_t type_) { if (trans_) { - for (long int i = 0; i < n_; i++) { - for (long int j = 0; j < ldda_; j++) { + for (int64_t i = 0; i < n_; i++) { + for (int64_t j = 0; j < ldda_; j++) { if (upper) { if (i >= j) { if (i == j && type_ == MLUOP_DTYPE_COMPLEX_FLOAT) { A[(j + i * ldda_) * 2 + 1] = 0.0; } else { - if (type_ == MLUOP_DTYPE_FLOAT) + if (type_ == MLUOP_DTYPE_FLOAT) { A[j + i * ldda_] = 0.0; - else { + } else { A[(j + i * ldda_) * 2] = 0.0; A[(j + i * ldda_) * 2 + 1] = 0.0; } @@ -61,9 +61,9 @@ void set_matrix_zero(float* A, bool upper, bool trans_, int n_, int ldda_, A[(j + i * ldda_) * 2 + 1] = 0.0; } } else { - if (type_ == MLUOP_DTYPE_FLOAT) + if (type_ == MLUOP_DTYPE_FLOAT) { A[j + i * ldda_] = 0.0; - else { + } else { A[(j + i * ldda_) * 2] = 0.0; A[(j + i * ldda_) * 2 + 1] = 0.0; } @@ -76,9 +76,9 @@ void set_matrix_zero(float* A, bool upper, bool trans_, int n_, int ldda_, for (int i = 0; i < n_; i++) { for (int j = 0; j < ldda_; j++) { if ((i > j && ~upper) || (i < j && upper)) { - if (type_ == MLUOP_DTYPE_FLOAT) + if (type_ == MLUOP_DTYPE_FLOAT) { A[j + i * ldda_] = 0.0; - else { + } else { A[(j + i * ldda_) * 2] = 0.0; A[(j + i * ldda_) * 2 + 1] = 0.0; } @@ -91,8 +91,8 @@ void 
set_matrix_zero(float* A, bool upper, bool trans_, int n_, int ldda_, void trans_mul(float* A, float* C, int lda, bool upper_, bool trans_, int n_, int ldda_, mluOpDataType_t type_, bool diag_add) { if (trans_) { - for (long int i = 0; i < lda; i++) { - for (long int j = 0; j < n_; j++) { + for (int64_t i = 0; i < lda; i++) { + for (int64_t j = 0; j < n_; j++) { if (type_ == MLUOP_DTYPE_FLOAT) { A[i + j * lda] = 0.0; if (j == i && diag_add) { @@ -109,11 +109,11 @@ void trans_mul(float* A, float* C, int lda, bool upper_, bool trans_, int n_, A[j * lda * 2 + i * 2] = 1.0; } } - for (long int k = 0; k <= i; k++) { + for (int64_t k = 0; k <= i; k++) { if (upper_ == false) { - if (j < i) + if (j < i) { continue; - else { + } else { if (type_ == MLUOP_DTYPE_FLOAT) { A[i + j * lda] += (C[k + i * lda] * C[k + j * lda]); } else { @@ -131,15 +131,15 @@ void trans_mul(float* A, float* C, int lda, bool upper_, bool trans_, int n_, } } else { if (type_ == MLUOP_DTYPE_FLOAT) { - if (j > i) + if (j > i) { continue; - else { + } else { A[i + j * lda] += (C[k * lda + i] * C[k * lda + j]); } } else { - if (j < i) + if (j < i) { continue; - else { + } else { A[(i + j * lda) * 2] += (C[(k * lda + i) * 2] * C[(k * lda + j) * 2] + C[(k * lda + i) * 2 + 1] * C[(k * lda + j) * 2 + 1]); @@ -163,17 +163,17 @@ void trans_mul(float* A, float* C, int lda, bool upper_, bool trans_, int n_, } else { for (int i = 0; i < lda; i++) { for (int j = 0; j < n_; j++) { - if (type_ == MLUOP_DTYPE_FLOAT) + if (type_ == MLUOP_DTYPE_FLOAT) { A[j + i * lda] = 0.0; - else { + } else { A[(i + j * lda) * 2] = 0.0; A[(i + j * lda) * 2 + 1] = 0.0; } for (int k = 0; k <= i; k++) { if (j < i) continue; - if (type_ == MLUOP_DTYPE_FLOAT) + if (type_ == MLUOP_DTYPE_FLOAT) { A[j + i * lda] += (C[j + k * lda] * C[i + k * lda]); - else { + } else { A[(j + i * lda) * 2] += (C[(j + k * lda) * 2] * C[(i + k * lda) * 2]); A[(j + i * lda) * 2 + 1] += @@ -192,9 +192,9 @@ void fill_zero(float* A, bool upper_, int batch_, int n_, int ldda_, } else { stride *= 2; } - for (long int i = 0; i < batch_; i++) { - for (long int j = 0; j < n_; j++) { - for (long int h = 0; h < ldda_; h++) { + for (int64_t i = 0; i < batch_; i++) { + for (int64_t j = 0; j < n_; j++) { + for (int64_t h = 0; h < ldda_; h++) { if (j == h) { continue; } else if (j < h) { @@ -220,9 +220,9 @@ void fill_zero(float* A, bool upper_, int batch_, int n_, int ldda_, } void set_diag_imag_one(float* A, int batch_, int n_, int ldda_) { - long int stride = n_ * ldda_ * 2; - for (long int i = 0; i < batch_; i++) { - for (long int j = 0; j < n_; j++) { + int64_t stride = n_ * ldda_ * 2; + for (int64_t i = 0; i < batch_; i++) { + for (int64_t j = 0; j < n_; j++) { A[i * stride + (j * ldda_ + j) * 2 + 1] = 1.0; } } @@ -235,9 +235,9 @@ void print_matrix(int batch, float* A, int lda, bool trans_, int n_, int ldda_, if (trans_) { for (int i = 0; i < n_; i++) { for (int j = 0; j < lda; j++) { - if (type_ == MLUOP_DTYPE_FLOAT) + if (type_ == MLUOP_DTYPE_FLOAT) { printf("%17.13f ", A[j + i * lda]); - else { + } else { printf("%7.3f", A[(j + i * lda) * 2]); printf(","); printf("%7.3f ", A[(j + i * lda) * 2 + 1]); @@ -248,9 +248,9 @@ void print_matrix(int batch, float* A, int lda, bool trans_, int n_, int ldda_, } else { for (int i = 0; i < lda; i++) { for (int j = 0; j < n_; j++) { - if (type_ == MLUOP_DTYPE_FLOAT) + if (type_ == MLUOP_DTYPE_FLOAT) { printf("%7.3f ", A[j + i * lda]); - else { + } else { printf("%7.3f", A[(j + i * lda) * 2]); printf(","); printf("%7.3f ", A[(j + i * lda) * 2 + 1]); @@ 
-264,12 +264,12 @@ void print_matrix(int batch, float* A, int lda, bool trans_, int n_, int ldda_, } } -void cpu_transfer_data(float* dst, float* src, unsigned long data_size) { - unsigned long size_block = 1024 * 1024 * 1024; - unsigned long transfer_num = data_size / size_block; - unsigned long transfer_remain = data_size % size_block; +void cpu_transfer_data(float* dst, float* src, uint64_t data_size) { + uint64_t size_block = 1024 * 1024 * 1024; + uint64_t transfer_num = data_size / size_block; + uint64_t transfer_remain = data_size % size_block; float *temp_dst = dst, *temp_src = src; - for (unsigned long i = 0; i < transfer_num; i++) { + for (uint64_t i = 0; i < transfer_num; i++) { std::memcpy(temp_dst, temp_src, size_block); temp_dst += (size_block / 4); temp_src += (size_block / 4); @@ -279,14 +279,14 @@ void cpu_transfer_data(float* dst, float* src, unsigned long data_size) { } } -void mlu_transfer_data(float* dst, float* src, unsigned long data_size, +void mlu_transfer_data(float* dst, float* src, uint64_t data_size, cnrtMemTransDir_t dir) { - unsigned long size_block = 1024 * 1024 * 1024; - unsigned long transfer_num = data_size / size_block; - unsigned long transfer_remain = data_size % size_block; + uint64_t size_block = 1024 * 1024 * 1024; + uint64_t transfer_num = data_size / size_block; + uint64_t transfer_remain = data_size % size_block; float *temp_dst = dst, *temp_src = src; - for (unsigned long i = 0; i < transfer_num; i++) { + for (uint64_t i = 0; i < transfer_num; i++) { GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMemcpy(temp_dst, temp_src, size_block, dir)); temp_dst += (size_block / 4); @@ -299,13 +299,10 @@ void mlu_transfer_data(float* dst, float* src, unsigned long data_size, } void CholeskyExecutor::prepareComputeParam() { - // cpu端把矩阵的一半设置成0 - //然后转置乘法,结果存到cpu端的另一个矩阵 - //然后传给gpu端 printf("start prepare compute parameter.\n"); - int long_int_size = sizeof(long int); + int long_int_size = sizeof(int64_t); int int_size = sizeof(int); - printf("long int size:%d, int size:%d\n", long_int_size, int_size); + printf("int64_t size:%d, int size:%d\n", long_int_size, int_size); auto input_desc_ = (tensor_desc_[0].tensor); auto output_desc_ = (tensor_desc_[1].tensor); auto dev_a = (float*)(data_vector_[0].host_ptr); @@ -352,41 +349,12 @@ void CholeskyExecutor::prepareComputeParam() { printf("\n"); printf("data vector length : %ld\n", data_vector_.size()); } - unsigned long total_size = batch_size_ * n_ * ldda_ * type_size_; - // unsigned long size_2g = 1024*1024*1024-1+1024*1024*1024; - // unsigned long size_2g = 1024*1024*10-1; - // int transfer_num = total_size / size_2g; - - // int transfer_remain = total_size % size_2g; - // printf("total size:%ld, transfer_num:%d, - // transfer_remain:%d\n",total_size,transfer_num,transfer_remain); - - // printf("matrix random:\n"); - // print_matrix(batch_size_, dev_a,ldda_,trans_,n_,ldda_,type_); - // print_matrix(batch_size_, base_line_out,ldda_,trans_,n_,ldda_,type_); - - // for(unsigned long i = 0; i < transfer_num; i++) - // { - // std::memcpy(dev_c+(i*size_2g),dev_a+(i*size_2g),size_2g); - // } - // printf("ddd\n"); - // if(transfer_remain > 0) - // { - // std::memcpy(dev_c+(transfer_num*size_2g),dev_a+(transfer_num*size_2g),transfer_remain); - // } - // printf("lll\n"); + uint64_t total_size = batch_size_ * n_ * ldda_ * type_size_; + cpu_transfer_data(dev_c, dev_a, total_size); - // if(batch_size_ > 16 && n_ > 2000) - // { - // std::memcpy(dev_c,dev_a,16*type_size_*n_*ldda_); - // 
std::memcpy(dev_c+16*type_size_/4*n_*ldda_,dev_a+16*type_size_/4*n_*ldda_,(batch_size_-16)*type_size_*n_*ldda_); - // } - // else - // { - // std::memcpy(dev_c,dev_a,batch_size_*type_size_*n_*ldda_); - // } + if (parser_->device() == CPU) { - for (long int i = 0; i < batch_size_; i++) { + for (int64_t i = 0; i < batch_size_; i++) { if (type_ == MLUOP_DTYPE_FLOAT) set_matrix_zero(dev_c + i * n_ * ldda_, false, trans_, n_, ldda_, type_); @@ -395,7 +363,7 @@ void CholeskyExecutor::prepareComputeParam() { type_); } // set_matrix_zero((float*)dev_c,upper_,trans_,n_,ldda_,type_); - for (long int i = 0; i < batch_size_; i++) { + for (int64_t i = 0; i < batch_size_; i++) { if (type_ == MLUOP_DTYPE_FLOAT) { trans_mul(dev_a + i * n_ * ldda_, dev_c + i * n_ * ldda_, ldda_, false, trans_, n_, ldda_, type_, true); @@ -431,7 +399,7 @@ void CholeskyExecutor::compute() { auto d_intput = (float*)(data_vector_[0].device_ptr); auto d_output = (float*)(data_vector_[1].device_ptr); - unsigned long total_size = batch_size_ * n_ * ldda_ * type_size_; + uint64_t total_size = batch_size_ * n_ * ldda_ * type_size_; cpu_transfer_data(h_input, h_output, total_size); mlu_transfer_data(h_output, d_intput, total_size, @@ -487,7 +455,7 @@ void CholeskyExecutor::compute() { void cpu_compute(float* cpu_c, int n_, int ldda_, bool upper_, bool trans_, mluOpDataType_t type_) { if (trans_) { - for (long int i = 0; i < n_; i++) { + for (int64_t i = 0; i < n_; i++) { float dia; if (type_ == MLUOP_DTYPE_FLOAT) { dia = cpu_c[i + i * ldda_]; @@ -503,27 +471,27 @@ void cpu_compute(float* cpu_c, int n_, int ldda_, bool upper_, bool trans_, } if (upper_ == false) { if (type_ == MLUOP_DTYPE_FLOAT) { - for (long int j = i + 1; j < n_; j++) { + for (int64_t j = i + 1; j < n_; j++) { cpu_c[i + j * ldda_] = cpu_c[i + j * ldda_] / dia_root; } - for (long int j = i + 1; j < n_; j++) { - for (long int k = j; k < n_; k++) { + for (int64_t j = i + 1; j < n_; j++) { + for (int64_t k = j; k < n_; k++) { cpu_c[j + k * ldda_] -= (cpu_c[i + k * ldda_] * cpu_c[i + j * ldda_]); } } } else { - for (long int j = 0; j < i; j++) { + for (int64_t j = 0; j < i; j++) { cpu_c[(i + j * ldda_) * 2] = 0; cpu_c[(i + j * ldda_) * 2 + 1] = 0; } - for (long int j = i + 1; j < n_; j++) { + for (int64_t j = i + 1; j < n_; j++) { cpu_c[(i + j * ldda_) * 2] = cpu_c[(i + j * ldda_) * 2] / dia_root; cpu_c[(i + j * ldda_) * 2 + 1] = cpu_c[(i + j * ldda_) * 2 + 1] / dia_root; } - for (long int j = i + 1; j < n_; j++) { - for (long int k = j; k < n_; k++) { + for (int64_t j = i + 1; j < n_; j++) { + for (int64_t k = j; k < n_; k++) { cpu_c[(j + k * ldda_) * 2] -= (cpu_c[(i + k * ldda_) * 2] * cpu_c[(i + j * ldda_) * 2] + cpu_c[(i + k * ldda_) * 2 + 1] * @@ -537,23 +505,23 @@ void cpu_compute(float* cpu_c, int n_, int ldda_, bool upper_, bool trans_, } else { if (type_ == MLUOP_DTYPE_FLOAT) { - for (long int j = i + 1; j < n_; j++) { + for (int64_t j = i + 1; j < n_; j++) { cpu_c[j + i * ldda_] = cpu_c[j + i * ldda_] / dia_root; } - for (long int j = i + 1; j < n_; j++) { - for (long int k = j; k < n_; k++) { + for (int64_t j = i + 1; j < n_; j++) { + for (int64_t k = j; k < n_; k++) { cpu_c[k + j * ldda_] -= (cpu_c[k + i * ldda_] * cpu_c[j + i * ldda_]); } } } else { - for (long int j = i + 1; j < n_; j++) { + for (int64_t j = i + 1; j < n_; j++) { cpu_c[(j + i * ldda_) * 2] = cpu_c[(j + i * ldda_) * 2] / dia_root; cpu_c[(j + i * ldda_) * 2 + 1] = cpu_c[(j + i * ldda_) * 2 + 1] / dia_root; } - for (long int j = i + 1; j < n_; j++) { - for (long int k = j; k < n_; k++) { + 
for (int64_t j = i + 1; j < n_; j++) { + for (int64_t k = j; k < n_; k++) { cpu_c[(k + j * ldda_) * 2] -= (cpu_c[(k + i * ldda_) * 2] * cpu_c[(j + i * ldda_) * 2] + cpu_c[(k + i * ldda_) * 2 + 1] * @@ -589,8 +557,8 @@ void CholeskyExecutor::cpuCompute() { // std::memcpy(dev_c,dev_a,sizeof(float)*n_*ldda_); float* cpu_a = cpu_fp32_input_[0]; float* cpu_c = cpu_fp32_output_[0]; - unsigned long total_size = batch_size_ * n_ * ldda_ * type_size_; - unsigned long size_2g = 1024 * 1024 * 1024 - 1 + 1024 * 1024 * 1024; + uint64_t total_size = batch_size_ * n_ * ldda_ * type_size_; + uint64_t size_2g = 1024 * 1024 * 1024 - 1 + 1024 * 1024 * 1024; int transfer_num = total_size / size_2g; int transfer_remain = total_size % size_2g; @@ -607,16 +575,17 @@ void CholeskyExecutor::cpuCompute() { if (type_ == MLUOP_DTYPE_FLOAT) { trans_mul(h_input + i * n_ * ldda_, h_output + i * n_ * ldda_, ldda_, upper_, trans_, n_, ldda_, type_, false); - } else + } else { trans_mul(h_input + i * n_ * ldda_ * 2, h_output + i * n_ * ldda_ * 2, ldda_, upper_, trans_, n_, ldda_, type_, false); + } } cpu_transfer_data(h_output, h_input, total_size); fill_zero(h_output, upper_, batch_size_, n_, ldda_, type_, true); } else { - for (long int i = 0; i < batch_size_; i++) { + for (int64_t i = 0; i < batch_size_; i++) { cpu_compute(cpu_c + i * n_ * ldda_ * type_size_ / 4, n_, ldda_, upper_, trans_, type_); } diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/cholesky.h b/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/cholesky.h index 829338cc0..8351a3928 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/cholesky.h +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/cholesky.h @@ -35,7 +35,7 @@ class CholeskyExecutor : public Executor { bool upper_ = false; int ldda_ = 0; int n_ = 0; - long int batch_size_ = 1; + int64_t batch_size_ = 1; public: CholeskyExecutor() {} From 70dd01b64aded1bd53192d8a207e836481ac74fe Mon Sep 17 00:00:00 2001 From: dglr Date: Thu, 15 Aug 2024 09:21:55 +0800 Subject: [PATCH 14/27] [Fix](mluOpCholesky): add mluoplog when sqrt --- kernels/cholesky/cholesky.h | 1 + kernels/cholesky/cholesky_union1.mlu | 40 +++++++++++++++++-- kernels/cholesky/complex_cholesky_union1.mlu | 42 +++++++++++++++++--- 3 files changed, 75 insertions(+), 8 deletions(-) diff --git a/kernels/cholesky/cholesky.h b/kernels/cholesky/cholesky.h index 425b3f4ef..a80765ac1 100644 --- a/kernels/cholesky/cholesky.h +++ b/kernels/cholesky/cholesky.h @@ -43,6 +43,7 @@ #include "core/tensor.h" #include "core/type.h" #include "kernels/kernel.h" +#include "kernels/debug.h" #include "kernels/utils/cnnl_helper.h" #define CNB (16) diff --git a/kernels/cholesky/cholesky_union1.mlu b/kernels/cholesky/cholesky_union1.mlu index c6a7325ef..1dc570657 100644 --- a/kernels/cholesky/cholesky_union1.mlu +++ b/kernels/cholesky/cholesky_union1.mlu @@ -1,3 +1,27 @@ +/************************************************************************* + * Copyright (C) [2024] by Cambricon, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *************************************************************************/ + + #include "cholesky.h" #include @@ -158,6 +182,9 @@ static __mlu_func__ void spotf2_sminout_fixsize_device(int m, float* A, } } factor = diag[iter * POTF_NB + iter]; + if (factor <= 0) { + MLULOG("The input matrix is not positive definite.\n"); + } factor = std::sqrt(factor); factor = (1.0 / factor); for (int i = 0; i < span; i++) { @@ -244,7 +271,11 @@ static __mlu_func__ void spotf2_sminout_anysize_device(int m, float* A, int span = remain > POTF_NB ? POTF_NB : remain; int iter_num = m > POTF_NB ? POTF_NB : m; for (int iter = 0; iter < iter_num; iter++) { - factor = sqrt(A[iter * lda + iter]); + factor = A[iter * lda + iter]; + if (factor <= 0) { + MLULOG("The input matrix is not positive definite.\n"); + } + factor = sqrt(factor); factor = 1.0 / factor; __sync_cluster(); for (int i = 0; i < span; i++) { @@ -399,7 +430,11 @@ __mlu_func__ void small_sminout_batch(int m, int width, float* dst, float* diag = dst; for (int iter = 0; iter < width; iter++) { - factor = sqrt(diag[iter * width + iter]); + factor = diag[iter * width + iter]; + if (factor <= 0) { + MLULOG("The input matrix is not positive definite.\n"); + } + factor = sqrt(factor); factor = 1.0 / factor; for (int i = 0; i < m; i++) { dst[i * width + iter] *= factor; @@ -926,7 +961,6 @@ mluOpStatus_t sgemm(int batch, bool trans_a, bool trans_b, int m, int n, int k, CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&matmul_a_desc)); CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&matmul_b_desc)); - ; CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&matmul_c_desc)); int32_t matmul_a_shape[2] = {batch, stride_a}; diff --git a/kernels/cholesky/complex_cholesky_union1.mlu b/kernels/cholesky/complex_cholesky_union1.mlu index 6d2b56349..6e31ac383 100644 --- a/kernels/cholesky/complex_cholesky_union1.mlu +++ b/kernels/cholesky/complex_cholesky_union1.mlu @@ -1,3 +1,27 @@ +/************************************************************************* + * Copyright (C) [2024] by Cambricon, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *************************************************************************/ + + #include "cholesky.h" #define COMPLEX_OFFSET(A, off) (((float*)A) + (2 * (off))) #define COMPLEX_TYPE_SIZE ((2) * sizeof(float)) @@ -120,7 +144,11 @@ __mlu_func__ void small_cminout(int m, int width, float* dst, __memcpy(idiag, sram_buffer + CPOTF_NB * CPOTF_NB, width * CPOTF_NB * sizeof(float), SRAM2NRAM); for (int iter = 0; iter < width; iter++) { - factor = sqrt(rdiag[(iter * CPOTF_NB + iter)]); + factor = rdiag[(iter * CPOTF_NB + iter)]; + if (factor <= 0) { + MLULOG("The input matrix is not positive definite.\n"); + } + factor = sqrt(factor); factor = 1.0 / factor; for (int i = 0; i < width; i++) { rdiag[(i * CPOTF_NB + iter)] *= factor; @@ -263,7 +291,11 @@ __mlu_func__ void small_cminout_batch(int m, int width, float* r_dst, if (r_diag[iter * width + iter] < 0) { printf("iter:%d,taskId:%d\n", iter, taskId); } - factor = sqrt(r_diag[iter * width + iter]); + factor = r_diag[iter * width + iter]; + if (factor <= 0) { + MLULOG("The input matrix is not positive definite.\n"); + } + factor = sqrt(factor); factor = 1.0 / factor; for (int i = 0; i < m; i++) { r_dst[i * width + iter] *= factor; @@ -792,9 +824,9 @@ mluOpStatus_t cgemm_real(int batch, bool trans_a, bool trans_b, int m, int n, float* workspace = cgemm_workspace; float* sgemm_workspace = - cgemm_workspace + ((unsigned long)batch) * 2 * (m * k); + cgemm_workspace + ((uint64_t)batch) * 2 * (m * k); float* copy_ra = workspace; - float* copy_ia = copy_ra + ((unsigned long)batch) * m * k; + float* copy_ia = copy_ra + ((uint64_t)batch) * m * k; int copy_lda = k; int copy_stride_a = m * k; @@ -991,4 +1023,4 @@ mluOpStatus_t conj_complex(int batch, int m, int n, float* d_input, cnnlConj(cnnl_handle, cnnl_in_desc, d_input, cnnl_out_desc, d_output)); return MLUOP_STATUS_SUCCESS; -} \ No newline at end of file +} From b1fbaed0834b1d657ec0ced872502a32be711289 Mon Sep 17 00:00:00 2001 From: dglr Date: Thu, 15 Aug 2024 23:56:04 +0800 Subject: [PATCH 15/27] [Fix](mluOpCholesky): reset workspace --- kernels/cholesky/cholesky.cpp | 33 ++++++--------- kernels/cholesky/complex_cholesky_union1.mlu | 6 --- mlu_op.h | 42 +------------------ .../pb_gtest/src/zoo/cholesky/cholesky.cpp | 13 ++++-- .../src/zoo/cholesky/testcase/case_0.prototxt | 8 ++-- 5 files changed, 28 insertions(+), 74 deletions(-) diff --git a/kernels/cholesky/cholesky.cpp b/kernels/cholesky/cholesky.cpp index 
14f184340..b289e3468 100644 --- a/kernels/cholesky/cholesky.cpp +++ b/kernels/cholesky/cholesky.cpp @@ -25,7 +25,7 @@ // calculates the required workspace size for performing the Cholesky // decomposition on a given matrix or batch of matrices. mluOpStatus_t MLUOP_WIN_API mluOpGetCholeskyWorkspace( - mluOpTensorDescriptor_t input_desc, size_t* size, float** workspace) { + mluOpTensorDescriptor_t input_desc, size_t* size) { PARAM_CHECK("mluOpCholesky", input_desc != NULL); PARAM_CHECK("mluOpCholesky", input_desc->dim == 2 || input_desc->dim == 3); @@ -58,21 +58,9 @@ mluOpStatus_t MLUOP_WIN_API mluOpGetCholeskyWorkspace( *size = size_a * size_a * sizeof(float) * 2 * batch_size * 3; } printf("workspace size:%ul\n", (int)(*size)); - if (*size > 0) { - CHECK_RETURN("mluOpCholesky", workspace_malloc(*size, workspace)); - } - return MLUOP_STATUS_SUCCESS; -} - -// releases the allocated workspace memory used for Cholesky decomposition -// calculations. It ensures that the workspace pointer is not only valid but -// also points to allocated memory before attempting to free it. -mluOpStatus_t MLUOP_WIN_API mluOpFreeCholeskyWorkspace(float** workspace) { - PARAM_CHECK("mluOpCholesky", workspace != NULL); - if (*workspace != NULL) { - CHECK_RETURN("mluOpCholesky", workspace_free(workspace)); - *workspace = NULL; - } + // if (*size > 0) { + // CHECK_RETURN("mluOpCholesky", workspace_malloc(*size, workspace)); + // } return MLUOP_STATUS_SUCCESS; } @@ -277,7 +265,7 @@ calculate_body(mluOpHandle_t handle, int batch_size, mluOpStatus_t MLUOP_WIN_API mluOpCholesky(mluOpHandle_t handle, const mluOpTensorDescriptor_t input_desc, float* d_input, const mluOpTensorDescriptor_t output_desc, - float* d_output, bool upper, float* workspace) { + float* d_output, bool upper, void* workspace) { PARAM_CHECK("mluOpCholesky", handle != NULL); PARAM_CHECK("mluOpCholesky", input_desc != NULL); PARAM_CHECK("mluOpCholesky", d_input != NULL); @@ -321,19 +309,22 @@ mluOpCholesky(mluOpHandle_t handle, const mluOpTensorDescriptor_t input_desc, ldc = output_desc->dims[2]; } - uint64_t type_size; + uint64_t type_size, total_size; + uint64_t size_limit = 1024*1024*1024*((uint64_t)7); MLUOP_CHECK(mluOpGetSizeOfDataType(dtype, &type_size)); + total_size = type_size * size_a * lda * ((uint64_t)batch_size); + PARAM_CHECK("mluOpCholesky", total_size < size_limit); if (type_size == 8 && batch_size > 16 && size_a > 2000) { int stride = 2 * size_a * lda; calculate_body(handle, 16, input_desc, d_input, output_desc, d_output, - upper, workspace); + upper, (float*)workspace); cnrtQueueSync(queue); calculate_body(handle, ((uint64_t)batch_size) - 16, input_desc, d_input + 16 * stride, output_desc, d_output + 16 * stride, - upper, workspace); + upper, (float*)workspace); } else { calculate_body(handle, batch_size, input_desc, d_input, output_desc, - d_output, upper, workspace); + d_output, upper, (float*)workspace); } return MLUOP_STATUS_SUCCESS; diff --git a/kernels/cholesky/complex_cholesky_union1.mlu b/kernels/cholesky/complex_cholesky_union1.mlu index 6e31ac383..5e8760c32 100644 --- a/kernels/cholesky/complex_cholesky_union1.mlu +++ b/kernels/cholesky/complex_cholesky_union1.mlu @@ -581,12 +581,6 @@ mluOpStatus_t workspace_malloc(size_t size, float** workspace) { return MLUOP_STATUS_SUCCESS; } -mluOpStatus_t workspace_free(float** workspace) { - CNRT_CHECK(cnrtFree((void*)(*workspace))); - - return MLUOP_STATUS_SUCCESS; -} - __mlu_global__ void complex_inverse_kernel(int batch, float* rd_input, float* id_input, int ld_input, int 
stride_input, float* rd_output, diff --git a/mlu_op.h b/mlu_op.h index 814511943..efc28be54 100644 --- a/mlu_op.h +++ b/mlu_op.h @@ -14488,7 +14488,7 @@ mluOpCholesky(mluOpHandle_t handle, const mluOpTensorDescriptor_t output_desc, float *d_output, bool upper, - float *workspace); + void *workspace); /*! * @brief Calculates the size of the workspace required for the Cholesky decomposition and initializes a workspace @@ -14530,45 +14530,7 @@ mluOpCholesky(mluOpHandle_t handle, */ mluOpStatus_t MLUOP_WIN_API -mluOpGetCholeskyWorkspace(mluOpTensorDescriptor_t input_desc, size_t *size, float **workspace); - -/*! - * @brief Frees the memory allocated for the Cholesky decomposition workspace. - * This function should be called to release the workspace memory used by the Cholesky operations after they are no - * longer needed. - * - * @param[in,out] workspace - * Double pointer to the workspace memory that was allocated by mluOpGetCholeskyWorkspace or another allocation - * function. After calling this function, the pointer will be set to NULL to prevent accidental reuse. - * - * @par Return - * - ::MLUOP_STATUS_SUCCESS if the workspace is successfully freed, - * - ::MLUOP_STATUS_EXECUTION_FAILED if there is an error during the free operation, such as if the pointer is NULL. - * - * @par Data Type - * - None. - * - * @par Data Layout - * - None. - * - * @par Scale Limitation - * - None. - * - * @par API Dependency - * - None. - * - * @par Note - * - None - * - * @par Example - * - None. - * - * @par Reference - * - None. - */ - -mluOpStatus_t MLUOP_WIN_API -mluOpFreeCholeskyWorkspace(float **workspace); +mluOpGetCholeskyWorkspace(mluOpTensorDescriptor_t input_desc, size_t *size); #if defined(__cplusplus) } diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/cholesky.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/cholesky.cpp index 091ad3272..91dffca54 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/cholesky.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/cholesky.cpp @@ -408,14 +408,21 @@ void CholeskyExecutor::compute() { // printf("mlu before cholesky result:\n"); // print_matrix(batch_size_,h_output,ldda_,trans_,n_,ldda_,type_); interface_timer_.start(); - float* workspace = nullptr; + void* workspace = nullptr; size_t size = 0; - MLUOP_CHECK(mluOpGetCholeskyWorkspace(input_desc_, &size, &workspace)); + MLUOP_CHECK(mluOpGetCholeskyWorkspace(input_desc_, &size)); + + if (size > 0) { + workspace = mlu_runtime_.allocate(size); + } + MLUOP_CHECK(mluOpCholesky(handle_, input_desc_, d_intput, output_desc_, d_output, upper_, workspace)); - MLUOP_CHECK(mluOpFreeCholeskyWorkspace(&workspace)); + mlu_runtime_.deallocate(workspace); + + // MLUOP_CHECK(mluOpFreeCholeskyWorkspace(&((float*)workspace))); interface_timer_.stop(); diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/testcase/case_0.prototxt b/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/testcase/case_0.prototxt index 565179d15..7a64b5404 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/testcase/case_0.prototxt +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/testcase/case_0.prototxt @@ -3,8 +3,8 @@ input { id: "input" shape: { dims: 2 - dims: 8 - dims: 8 + dims: 1000 + dims: 1000 } layout: LAYOUT_ARRAY dtype: DTYPE_FLOAT @@ -19,8 +19,8 @@ output { id: "output" shape: { dims: 2 - dims: 8 - dims: 8 + dims: 1000 + dims: 1000 } layout: LAYOUT_ARRAY dtype: DTYPE_FLOAT From ffc19caadc3e946641faf6d7b5fd249c3d3a0c3b Mon Sep 17 00:00:00 2001 From: dglr Date: Fri, 16 Aug 2024 00:04:24 +0800 Subject: [PATCH 16/27] 
[Fix](mluOpCholesky): rename getworkspace size function --- kernels/cholesky/cholesky.cpp | 2 +- mlu_op.h | 2 +- test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/cholesky.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/kernels/cholesky/cholesky.cpp b/kernels/cholesky/cholesky.cpp index b289e3468..3490923ca 100644 --- a/kernels/cholesky/cholesky.cpp +++ b/kernels/cholesky/cholesky.cpp @@ -24,7 +24,7 @@ #include "cholesky.h" // calculates the required workspace size for performing the Cholesky // decomposition on a given matrix or batch of matrices. -mluOpStatus_t MLUOP_WIN_API mluOpGetCholeskyWorkspace( +mluOpStatus_t MLUOP_WIN_API mluOpGetCholeskyWorkspaceSize( mluOpTensorDescriptor_t input_desc, size_t* size) { PARAM_CHECK("mluOpCholesky", input_desc != NULL); diff --git a/mlu_op.h b/mlu_op.h index efc28be54..0dbd85152 100644 --- a/mlu_op.h +++ b/mlu_op.h @@ -14530,7 +14530,7 @@ mluOpCholesky(mluOpHandle_t handle, */ mluOpStatus_t MLUOP_WIN_API -mluOpGetCholeskyWorkspace(mluOpTensorDescriptor_t input_desc, size_t *size); +mluOpGetCholeskyWorkspaceSize(mluOpTensorDescriptor_t input_desc, size_t *size); #if defined(__cplusplus) } diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/cholesky.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/cholesky.cpp index 91dffca54..d5627d325 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/cholesky.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/cholesky.cpp @@ -410,7 +410,7 @@ void CholeskyExecutor::compute() { interface_timer_.start(); void* workspace = nullptr; size_t size = 0; - MLUOP_CHECK(mluOpGetCholeskyWorkspace(input_desc_, &size)); + MLUOP_CHECK(mluOpGetCholeskyWorkspaceSize(input_desc_, &size)); if (size > 0) { workspace = mlu_runtime_.allocate(size); From abffac66e93293ef8daf3d6fab39e339c80913e6 Mon Sep 17 00:00:00 2001 From: dglr Date: Fri, 16 Aug 2024 01:09:17 +0800 Subject: [PATCH 17/27] [Fix](mluOpCholesky): rewrite description in mlu_op --- mlu_op.h | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/mlu_op.h b/mlu_op.h index 0dbd85152..6ce08ab63 100644 --- a/mlu_op.h +++ b/mlu_op.h @@ -14460,16 +14460,24 @@ mluOpDestroyFFTPlan(mluOpFFTPlan_t fft_plan); * - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_EXECUTION_FAILED * * @par Data Type - * - None + * - The supported combinations of data types are shown below: + * - float(\b d_input) - float(\b d_output) * * @par Data Layout - * - None + * - The data layout of d_input should be \p MLUOP_LAYOUT_ARRAY. + * - The data layout of d_output should be \p MLUOP_LAYOUT_ARRAY. * * @par Scale Limitation - * - None + * - The dimension of input/output tensor must be 2 or 3. + * - The shape of output should be equal to input shape. + * - When the dimension of tensor is 2, the shape[0] and shape[1] should be equal. + * - When the dimension of tensor is 3, the shape[1] and shape[2] should be equal. + * - Considering the size of the GDRAM, the space occupied by the input matrix should not exceed 7GB. + * For example, when the batch size is 32, the shape[1] of the tensor cannot exceed 7,662(32*7662*4<7GB). * * @par API Dependency - * - None + * - Before calling this function, you need to call the ::mluOpGetCholeskyWorkspaceSize + * function to get workspace size. * * @par Note * - The function assumes the matrix is symmetric positive-definite. @@ -14498,26 +14506,25 @@ mluOpCholesky(mluOpHandle_t handle, * The descriptor for the input tensor for which the Cholesky decomposition will be performed. 
* @param[out] size * Pointer to a size_t variable where the size of the required workspace will be stored. - * @param[out] workspace - * Double pointer to a float, used to allocate memory for the workspace. This pointer will be set to point to the - * allocated workspace. * * @par Return - * - ::MLUOP_STATUS_SUCCESS if the workspace size is successfully calculated and the workspace is successfully - * allocated, - * - ::MLUOP_STATUS_EXECUTION_FAILED if there are issues during the calculation or memory allocation. + * - ::MLUOP_STATUS_SUCCESS if the workspace size is successfully calculated. + * - ::MLUOP_STATUS_EXECUTION_FAILED if there are issues during the calculation allocation. * * @par Data Type - * - None. + * - The supported combinations of data types are shown below: + * - size_t(\b size) * * @par Data Layout * - None. * * @par Scale Limitation - * - None. + * - The dimension of input tensor must be 2 or 3. + * - When the dimension of tensor is 2, the shape[0] and shape[1] should be equal. + * - When the dimension of tensor is 3, the shape[1] and shape[2] should be equal. * * @par API Dependency - * - None. + * - The allocated extra workspace should be passed to ::mluOpCholesky to perform the Cholesky operation. * * @par Note * - None. From 810c19dab08b40c47f4235abcd9b738cb955cdbe Mon Sep 17 00:00:00 2001 From: dglr Date: Fri, 16 Aug 2024 01:12:10 +0800 Subject: [PATCH 18/27] [Docs](mluOpCholesky): update docs --- docs/design_docs/cholesky/cholesky.md | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/docs/design_docs/cholesky/cholesky.md b/docs/design_docs/cholesky/cholesky.md index 3b61e4aa8..dcf748898 100644 --- a/docs/design_docs/cholesky/cholesky.md +++ b/docs/design_docs/cholesky/cholesky.md @@ -116,10 +116,11 @@ A=LL^T | :---------: | :------------: | :--: | :------------------: | :---------: | :---------------: | | handle | | 句柄 | | / | 无 | | input_desc | 矩阵描述符 | 输入 |float、complex float | | | -| d_input | 输入矩阵 | 输入 | | [batch,N,N]/[N,N] | batch<=32,N<=3000 | +| d_input | 输入矩阵 | 输入 | | [batch,N,N]/[N,N] | 所占空间不超过7GB | | output_desc | 输出矩阵描述符 | 输入 | float、complex float | | | -| d_output | 输出矩阵 | 输出 | | [batch,N,N] /[N,N]| | +| d_output | 输出矩阵 | 输出 | | [batch,N,N]/[N,N]| 所占空间不超过7GB | | upper | 上三角/下三角 | 输入 | bool | | | +| workspace | 用于矩阵分解的额外空间 | 输入 | void* | | | ### 2.4 算子限制 @@ -449,15 +450,26 @@ complex类型多batch性能测试: * 输入输出矩阵的维度为2或者3 * 输入输出矩阵维度数相等 * 输入输出矩阵的后两个维度数目相同 +* 输入/输出矩阵所占空间不得超过7GB ## 4 算子接口设计 -接口为: +计算接口为: ```c++ -void mluOpCholesky(mluOpHandle_t handle,const mluOpTensorDescriptor_t input_desc,float* d_input, const mluOpTensorDescriptor_t output_desc, float* d_output,bool upper) +mluOpCholesky(mluOpHandle_t handle, const mluOpTensorDescriptor_t input_desc, + float* d_input, const mluOpTensorDescriptor_t output_desc, + float* d_output, bool upper, void* workspace) +``` + +获取额外空间大小接口为: + +```c++ +mluOpStatus_t MLUOP_WIN_API +mluOpGetCholeskyWorkspaceSize(mluOpTensorDescriptor_t input_desc, size_t *size); + ``` 变量含义为上文所述。 From 600fbd8bc118f2f407d117ab2bdfd72e6a406e04 Mon Sep 17 00:00:00 2001 From: dglr Date: Fri, 16 Aug 2024 02:38:43 +0800 Subject: [PATCH 19/27] [Fix](mluOpCholesky): del printf --- kernels/cholesky/cholesky_union1.mlu | 12 +---- kernels/cholesky/complex_cholesky_union1.mlu | 3 -- .../pb_gtest/src/zoo/cholesky/cholesky.cpp | 52 ++++++------------- 3 files changed, 19 insertions(+), 48 deletions(-) diff --git a/kernels/cholesky/cholesky_union1.mlu b/kernels/cholesky/cholesky_union1.mlu index 1dc570657..8e12d1c9d 100644 
--- a/kernels/cholesky/cholesky_union1.mlu +++ b/kernels/cholesky/cholesky_union1.mlu @@ -175,12 +175,7 @@ static __mlu_func__ void spotf2_sminout_fixsize_device(int m, float* A, } } - if (factor < 0) { - if (id == 0) { - printf("factor:%.3f\n", factor); - printf("iter:%d\n", iter); - } - } + factor = diag[iter * POTF_NB + iter]; if (factor <= 0) { MLULOG("The input matrix is not positive definite.\n"); @@ -996,7 +991,7 @@ mluOpStatus_t sgemm(int batch, bool trans_a, bool trans_b, int m, int n, int k, &workspace_size); if (workspace_size > 0) { - printf("sgemm workspace size:%zu\n", workspace_size); + VLOG(0) << "sgemm workspace size:" << workspace_size; } CALL_CNNL(cnnlStrideBatchMatMul_v2( @@ -1375,9 +1370,6 @@ mluOpStatus_t transpose(int batch, int m, int n, float* d_input, CALL_CNNL(cnnlGetTransposeWorkspaceSize(cnnl_handle, cnnl_in_desc, cnnl_trans_desc, &size)); - if (size > 0ul) { - printf("transpose2 need size: %zu\n", size); - } CALL_CNNL(cnnlTranspose_v2(cnnl_handle, cnnl_trans_desc, cnnl_in_desc, d_input, cnnl_out_desc, d_output, workspace, diff --git a/kernels/cholesky/complex_cholesky_union1.mlu b/kernels/cholesky/complex_cholesky_union1.mlu index 5e8760c32..8a671c013 100644 --- a/kernels/cholesky/complex_cholesky_union1.mlu +++ b/kernels/cholesky/complex_cholesky_union1.mlu @@ -288,9 +288,6 @@ __mlu_func__ void small_cminout_batch(int m, int width, float* r_dst, float a1, a2, b1, b2; for (int iter = 0; iter < width; iter++) { - if (r_diag[iter * width + iter] < 0) { - printf("iter:%d,taskId:%d\n", iter, taskId); - } factor = r_diag[iter * width + iter]; if (factor <= 0) { MLULOG("The input matrix is not positive definite.\n"); diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/cholesky.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/cholesky.cpp index d5627d325..c2ac4ef9b 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/cholesky.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/cholesky/cholesky.cpp @@ -299,10 +299,9 @@ void mlu_transfer_data(float* dst, float* src, uint64_t data_size, } void CholeskyExecutor::prepareComputeParam() { - printf("start prepare compute parameter.\n"); + VLOG(0) << "start prepare compute parameter." 
<< std::endl; int long_int_size = sizeof(int64_t); int int_size = sizeof(int); - printf("int64_t size:%d, int size:%d\n", long_int_size, int_size); auto input_desc_ = (tensor_desc_[0].tensor); auto output_desc_ = (tensor_desc_[1].tensor); auto dev_a = (float*)(data_vector_[0].host_ptr); @@ -320,34 +319,32 @@ void CholeskyExecutor::prepareComputeParam() { int dim = input_desc_->dim; stride_ = (input_desc_->strides)[dim - 1]; ldda_ = input_desc_->dims[1]; - printf("n:%d,lda:%d,stride:%d,upper:%d,trans:%d\n", n_, ldda_, stride_, - upper_, trans_); + VLOG(0) << "n:" << n_ << ", lda:" << ldda_ << ", stride:" << stride_ + << ", upper:" << upper_<< ",trans:" << trans_ << std::endl; int size = input_desc_->dims[0]; - printf("size:%d, dim:%d, \n", size, dim); - printf("strides:\n"); + VLOG(0) << "size:" << size << ", dim:" << dim << std::endl; + VLOG(0) << "strides:" << std::endl; for (int i = 0; i < dim; i++) { - printf("%ld ", (input_desc_->strides)[i]); + VLOG(0) << (input_desc_->strides)[i] << " "; } - printf("\n"); - printf("data vector length : %ld\n", data_vector_.size()); + VLOG(0) << "data vector length : " << data_vector_.size() << std::endl; } else if (dim_size == 3) { batch_size_ = input_shape.dims(0); n_ = input_shape.dims(1); int dim = input_desc_->dim; stride_ = (input_desc_->strides)[dim - 1]; ldda_ = input_desc_->dims[2]; - printf("batch_size:%ld,n:%d,lda:%d,stride:%d,upper:%d,trans:%d\n", - batch_size_, n_, ldda_, stride_, upper_, trans_); + VLOG(0) << "batch_size:" << batch_size_ << ", n:" << n_ << ", lda:" + << ldda_ << ", stride:" << stride_ << ", upper"<< upper_ + << ",trans:" << trans_<< std::endl; int size = input_desc_->dims[1]; - - printf("size:%d, dim:%d, \n", size, dim); - printf("strides:\n"); + VLOG(0) << "size:" << size << ", dim:" << dim << std::endl; + VLOG(0) << "strides:" << std::endl; for (int i = 0; i < dim; i++) { - printf("%ld ", (input_desc_->strides)[i]); + VLOG(0) << (input_desc_->strides)[i] << " "; } - printf("\n"); - printf("data vector length : %ld\n", data_vector_.size()); + VLOG(0) << "data vector length : " << data_vector_.size() << std::endl; } uint64_t total_size = batch_size_ * n_ * ldda_ * type_size_; @@ -376,10 +373,6 @@ void CholeskyExecutor::prepareComputeParam() { } } - // printf("matrix A:\n"); - // print_matrix(batch_size_,dev_a,ldda_,trans_,n_,ldda_,type_); - // printf("matrix C:\n"); - // print_matrix(batch_size_,dev_c,ldda_,trans_,n_,ldda_,type_); mlu_transfer_data(dev_d, dev_a, total_size, CNRT_MEM_TRANS_DIR_HOST2DEV); if (parser_->device() == CPU) { @@ -405,8 +398,7 @@ void CholeskyExecutor::compute() { mlu_transfer_data(h_output, d_intput, total_size, CNRT_MEM_TRANS_DIR_DEV2HOST); - // printf("mlu before cholesky result:\n"); - // print_matrix(batch_size_,h_output,ldda_,trans_,n_,ldda_,type_); + interface_timer_.start(); void* workspace = nullptr; size_t size = 0; @@ -453,8 +445,7 @@ void CholeskyExecutor::compute() { CNRT_MEM_TRANS_DIR_HOST2DEV); } - // printf("mlu after cholesky result:\n"); - // print_matrix(batch_size_,h_output,ldda_,trans_,n_,ldda_,type_); + return; } @@ -574,8 +565,7 @@ void CholeskyExecutor::cpuCompute() { auto h_output = (float*)(data_vector_[1].host_ptr); auto h_input = (float*)(data_vector_[0].host_ptr); - // printf("cpu before cholesky result:\n"); - // print_matrix(batch_size_,cpu_c,ldda_,trans_,n_,ldda_,type_); + if (result_mul) { for (int i = 0; i < batch_size_; i++) { @@ -600,15 +590,7 @@ void CholeskyExecutor::cpuCompute() { fill_zero(h_output, upper_, batch_size_, n_, ldda_, type_, false); } - // 
print_matrix(batch_size_,h_input,ldda_,trans_,n_,ldda_,type_); - // printf("cpu cholesky result:\n"); - // print_matrix(batch_size_,cpu_c,ldda_,trans_,n_,ldda_,type_); - - // printf("mlu cholesky result:\n"); - // print_matrix(batch_size_,h_output,ldda_,trans_,n_,ldda_,type_); - // printf("mlu after cholesky result1:\n"); - // print_matrix(batch_size_,h_output,ldda_,trans_,n_,ldda_,type_); return; } From e7361af246973e5af2fa0b9a6ee4d1492173117d Mon Sep 17 00:00:00 2001 From: dglr Date: Fri, 16 Aug 2024 07:43:47 +0800 Subject: [PATCH 20/27] [Docs](mluOpCholesky): rewrite Conjugate transpose symbol --- docs/design_docs/cholesky/cholesky.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/design_docs/cholesky/cholesky.md b/docs/design_docs/cholesky/cholesky.md index dcf748898..d8d454c30 100644 --- a/docs/design_docs/cholesky/cholesky.md +++ b/docs/design_docs/cholesky/cholesky.md @@ -31,9 +31,9 @@ Cholesky分解是科学和数值领域中最重要的算法之一。Cholesky算 对正定厄密特矩阵$`A`$进行Cholesky分解,即求矩阵$`L`$使下式成立: ```math -A=LL^* +A=LL^H ``` -其中,$`L`$是一个下三角矩阵且对角元素均为正实数,$`L^*`$表示$`L`$的共轭转置,是一个上三角矩阵。当$`A`$是一个实数矩阵时,Cholesky分解可以改写为 +其中,$`L`$是一个下三角矩阵且对角元素均为正实数,$`L^H`$表示$`L`$的共轭转置,是一个上三角矩阵。当$`A`$是一个实数矩阵时,Cholesky分解可以改写为 ```math A=LL^T ``` From ba492c28046b13db1f5ecd06126fa5e0524dd4d8 Mon Sep 17 00:00:00 2001 From: dglr Date: Sun, 25 Aug 2024 06:26:42 +0800 Subject: [PATCH 21/27] [Fix](mluOpCholesky): format --- kernels/cholesky/cholesky.cpp | 31 ++++----- kernels/cholesky/cholesky.h | 70 +++++++++----------- kernels/cholesky/cholesky_union1.mlu | 7 +- kernels/cholesky/complex_cholesky_union1.mlu | 8 +-- 4 files changed, 52 insertions(+), 64 deletions(-) diff --git a/kernels/cholesky/cholesky.cpp b/kernels/cholesky/cholesky.cpp index 3490923ca..d9c6a4fa7 100644 --- a/kernels/cholesky/cholesky.cpp +++ b/kernels/cholesky/cholesky.cpp @@ -110,10 +110,9 @@ calculate_body(mluOpHandle_t handle, int batch_size, transpose(batch_size, size_a, size_a, d_input, d_output, handle, dtype, workspace)); } else { - CNRT_CHECK( - cnrtMemcpy(d_output, d_input, - type_size * size_a * lda * ((uint64_t)batch_size), - CNRT_MEM_TRANS_DIR_DEV2DEV)); + CNRT_CHECK(cnrtMemcpy(d_output, d_input, + type_size * size_a * lda * ((uint64_t)batch_size), + CNRT_MEM_TRANS_DIR_DEV2DEV)); } } else { CHECK_RETURN("mluOpCholesky", @@ -166,10 +165,9 @@ calculate_body(mluOpHandle_t handle, int batch_size, transpose(batch_size, size_a, size_a, d_output, workspace, handle, dtype, workspace)); cnrtQueueSync(queue); - CNRT_CHECK( - cnrtMemcpy(d_output, workspace, - type_size * size_a * lda * ((uint64_t)batch_size), - CNRT_MEM_TRANS_DIR_DEV2DEV)); + CNRT_CHECK(cnrtMemcpy(d_output, workspace, + type_size * size_a * lda * ((uint64_t)batch_size), + CNRT_MEM_TRANS_DIR_DEV2DEV)); } } else { recnb = CREC_NB; @@ -237,16 +235,15 @@ calculate_body(mluOpHandle_t handle, int batch_size, CNRT_CHECK(cnrtMemcpy(d_output, workspace, type_size * size_a * lda * 16, CNRT_MEM_TRANS_DIR_DEV2DEV)); - CNRT_CHECK(cnrtMemcpy( - d_output + type_size / 4 * size_a * lda * 16, - workspace + type_size / 4 * size_a * lda * 16, - type_size * size_a * lda * ((uint64_t)batch_size - 16), - CNRT_MEM_TRANS_DIR_DEV2DEV)); - } else { CNRT_CHECK( - cnrtMemcpy(d_output, workspace, - type_size * size_a * lda * ((uint64_t)batch_size), + cnrtMemcpy(d_output + type_size / 4 * size_a * lda * 16, + workspace + type_size / 4 * size_a * lda * 16, + type_size * size_a * lda * ((uint64_t)batch_size - 16), CNRT_MEM_TRANS_DIR_DEV2DEV)); + } else { + CNRT_CHECK(cnrtMemcpy(d_output, workspace, + type_size * 
size_a * lda * ((uint64_t)batch_size), + CNRT_MEM_TRANS_DIR_DEV2DEV)); } } } @@ -310,7 +307,7 @@ mluOpCholesky(mluOpHandle_t handle, const mluOpTensorDescriptor_t input_desc, } uint64_t type_size, total_size; - uint64_t size_limit = 1024*1024*1024*((uint64_t)7); + uint64_t size_limit = 1024 * 1024 * 1024 * ((uint64_t)7); MLUOP_CHECK(mluOpGetSizeOfDataType(dtype, &type_size)); total_size = type_size * size_a * lda * ((uint64_t)batch_size); PARAM_CHECK("mluOpCholesky", total_size < size_limit); diff --git a/kernels/cholesky/cholesky.h b/kernels/cholesky/cholesky.h index a80765ac1..7859f3f46 100644 --- a/kernels/cholesky/cholesky.h +++ b/kernels/cholesky/cholesky.h @@ -21,7 +21,6 @@ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. *************************************************************************/ - #ifndef __CHOLESKY_H #define __CHOLESKY_H @@ -48,9 +47,9 @@ #define CNB (16) #define REC_NB (16) -#define POTF_NB ((REC_NB)/4) +#define POTF_NB ((REC_NB) / 4) #define CREC_NB (16) -#define CPOTF_NB ((CREC_NB)/4) +#define CPOTF_NB ((CREC_NB) / 4) // #define CPOTF_NB ((CREC_NB)) #define __CNRT_FUNC_TYPE__ CNRT_FUNC_TYPE_UNION1 #define TASK_NUM (4) @@ -59,63 +58,60 @@ #define CLUSTER_NUM 1 #define M (TASK_NUM * POTF_NB) #define ZERO 0.0 -#define SHARED_MEM_SIZE (((M*POTF_NB/TASK_NUM * 4)+(POTF_NB * POTF_NB))) +#define SHARED_MEM_SIZE (((M * POTF_NB / TASK_NUM * 4) + (POTF_NB * POTF_NB))) #define OFFSET_ROW(A, i, j) A + ((i) * (lda) + (j)) #define OFFSET_B_ROW(B, i, j) B + ((i) * (ldb) + (j)) +mluOpStatus_t mlu_spotrf_rectile(int batch, int stride, bool trans, bool uplo, + int n, int recnb, float* dA, int ldda, + int gbstep, mluOpHandle_t handle, + float* workspace); -mluOpStatus_t mlu_spotrf_rectile(int batch, int stride, bool trans, - bool uplo, int n, int recnb, float* dA, int ldda, - int gbstep, mluOpHandle_t handle, float* workspace); - -mluOpStatus_t ssyrk(int batch, int stride, bool upper, bool trans, - int n, int k, float* d_a, int ldda, float* d_c, - int lddc, mluOpHandle_t handle, float* workspace); +mluOpStatus_t ssyrk(int batch, int stride, bool upper, bool trans, int n, int k, + float* d_a, int ldda, float* d_c, int lddc, + mluOpHandle_t handle, float* workspace); -mluOpStatus_t sgemm(int batch, bool trans_a, bool trans_b, int m, - int n, int k, float alpha, float beta, float* d_a, - int lda, int stride_a, float* d_b, int ldb, - int stride_b, float* d_c, int ldc, +mluOpStatus_t sgemm(int batch, bool trans_a, bool trans_b, int m, int n, int k, + float alpha, float beta, float* d_a, int lda, int stride_a, + float* d_b, int ldb, int stride_b, float* d_c, int ldc, int stride_c, mluOpHandle_t handle, float* workspace); // side:true->right // false->left -mluOpStatus_t strsm(int batch, int stride, bool upper, bool trans, - int m, int n, float* d_a, int ldda, float* d_b, - int lddb, mluOpHandle_t handle, float* workspace); +mluOpStatus_t strsm(int batch, int stride, bool upper, bool trans, int m, int n, + float* d_a, int ldda, float* d_b, int lddb, + mluOpHandle_t handle, float* workspace); mluOpStatus_t transpose(int batch, int m, int n, float* d_input, - float* d_output, mluOpHandle_t handle, - mluOpDataType_t type, float* workspace); + float* d_output, mluOpHandle_t handle, + mluOpDataType_t type, float* workspace); mluOpStatus_t conj_complex(int batch, int m, int n, float* d_input, - float* d_output, mluOpHandle_t handle); + float* d_output, mluOpHandle_t handle); -mluOpStatus_t mlu_cpotrf_rectile(int batch, int stride, int n, - int recnb, float* drA, float* diA, int lda, - 
 
-mluOpStatus_t mlu_cpotrf_rectile(int batch, int stride, int n,
-                                 int recnb, float* drA, float* diA, int lda,
-                                 mluOpHandle_t handle, float* workspace);
+mluOpStatus_t mlu_cpotrf_rectile(int batch, int stride, int n, int recnb,
+                                 float* drA, float* diA, int lda,
+                                 mluOpHandle_t handle, float* workspace);
 
-mluOpStatus_t cgemm(int batch, bool trans_a, bool trans_b, int m,
-                    int n, int k, float alpha, float beta, float* d_ra,
-                    float* d_ia, int lda, int stride_a, float* d_rb,
-                    float* d_ib, int ldb, int stride_b, float* d_rc,
-                    float* d_ic, int ldc, int stride_c, mluOpHandle_t handle,
-                    float* workspace);
+mluOpStatus_t cgemm(int batch, bool trans_a, bool trans_b, int m, int n, int k,
+                    float alpha, float beta, float* d_ra, float* d_ia, int lda,
+                    int stride_a, float* d_rb, float* d_ib, int ldb,
+                    int stride_b, float* d_rc, float* d_ic, int ldc,
+                    int stride_c, mluOpHandle_t handle, float* workspace);
 
 mluOpStatus_t workspace_malloc(size_t size, float** workspace);
 
 mluOpStatus_t workspace_free(float** workspace);
 
-
-mluOpStatus_t set_half_zero(int batch, int stride, float* d_a, int lda,
-                            int m, mluOpHandle_t handle);
+mluOpStatus_t set_half_zero(int batch, int stride, float* d_a, int lda, int m,
+                            mluOpHandle_t handle);
 
 mluOpStatus_t ctrsm(int batch, int stride, int m, int n, float* rd_a,
-                    float* id_a, int lda, float* rd_b, float* id_b,
-                    int ldb, mluOpHandle_t handle, float* workspace);
+                    float* id_a, int lda, float* rd_b, float* id_b, int ldb,
+                    mluOpHandle_t handle, float* workspace);
 
 mluOpStatus_t cherk(int batch, int stride, int n, int k, float* rd_a,
-                    float* id_a, int lda, float* rd_c, float* id_c,
-                    int ldc, mluOpHandle_t handle, float* workspace);
+                    float* id_a, int lda, float* rd_c, float* id_c, int ldc,
+                    mluOpHandle_t handle, float* workspace);
 
 #endif
diff --git a/kernels/cholesky/cholesky_union1.mlu b/kernels/cholesky/cholesky_union1.mlu
index 8e12d1c9d..22e328272 100644
--- a/kernels/cholesky/cholesky_union1.mlu
+++ b/kernels/cholesky/cholesky_union1.mlu
@@ -21,7 +21,6 @@
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  *************************************************************************/
 
-
 #include "cholesky.h"
 #include
 
@@ -175,10 +174,9 @@ static __mlu_func__ void spotf2_sminout_fixsize_device(int m, float* A,
     }
   }
 
-
   factor = diag[iter * POTF_NB + iter];
   if (factor <= 0) {
-    MLULOG("The input matrix is not positive definite.\n");
+      MLULOG("The input matrix is not positive definite.\n");
   }
   factor = std::sqrt(factor);
   factor = (1.0 / factor);
@@ -427,7 +425,7 @@ __mlu_func__ void small_sminout_batch(int m, int width, float* dst,
   for (int iter = 0; iter < width; iter++) {
     factor = diag[iter * width + iter];
     if (factor <= 0) {
-    MLULOG("The input matrix is not positive definite.\n");
+      MLULOG("The input matrix is not positive definite.\n");
     }
     factor = sqrt(factor);
     factor = 1.0 / factor;
@@ -1370,7 +1368,6 @@ mluOpStatus_t transpose(int batch, int m, int n, float* d_input,
 
   CALL_CNNL(cnnlGetTransposeWorkspaceSize(cnnl_handle, cnnl_in_desc,
                                           cnnl_trans_desc, &size));
-
   CALL_CNNL(cnnlTranspose_v2(cnnl_handle, cnnl_trans_desc, cnnl_in_desc,
                              d_input, cnnl_out_desc, d_output, workspace,
                              size));
diff --git a/kernels/cholesky/complex_cholesky_union1.mlu b/kernels/cholesky/complex_cholesky_union1.mlu
index 8a671c013..1fd015212 100644
--- a/kernels/cholesky/complex_cholesky_union1.mlu
+++ b/kernels/cholesky/complex_cholesky_union1.mlu
@@ -21,7 +21,6 @@
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  *************************************************************************/
 
-
 #include "cholesky.h"
 #define COMPLEX_OFFSET(A, off) (((float*)A) + (2 * (off)))
 #define COMPLEX_TYPE_SIZE ((2) * sizeof(float))
@@ -146,7 +145,7 @@ __mlu_func__ void small_cminout(int m, int width, float* dst,
   for (int iter = 0; iter < width; iter++) {
     factor = rdiag[(iter * CPOTF_NB + iter)];
     if (factor <= 0) {
-    MLULOG("The input matrix is not positive definite.\n");
+      MLULOG("The input matrix is not positive definite.\n");
     }
     factor = sqrt(factor);
     factor = 1.0 / factor;
@@ -290,7 +289,7 @@ __mlu_func__ void small_cminout_batch(int m, int width, float* r_dst,
   for (int iter = 0; iter < width; iter++) {
     factor = r_diag[iter * width + iter];
     if (factor <= 0) {
-    MLULOG("The input matrix is not positive definite.\n");
+      MLULOG("The input matrix is not positive definite.\n");
     }
     factor = sqrt(factor);
     factor = 1.0 / factor;
@@ -814,8 +813,7 @@ mluOpStatus_t cgemm_real(int batch, bool trans_a, bool trans_b, int m, int n,
   mluOpGetQueue(handle, &queue);
 
   float* workspace = cgemm_workspace;
-  float* sgemm_workspace =
-      cgemm_workspace + ((uint64_t)batch) * 2 * (m * k);
+  float* sgemm_workspace = cgemm_workspace + ((uint64_t)batch) * 2 * (m * k);
   float* copy_ra = workspace;
   float* copy_ia = copy_ra + ((uint64_t)batch) * m * k;
   int copy_lda = k;
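The `OFFSET_ROW` / `OFFSET_B_ROW` macros touched by this formatting patch encode plain row-major addressing: element (i, j) of a matrix with leading dimension `lda` lives at offset `i * lda + j`. A small standalone illustration (host-side, not kernel code):

```cpp
#include <cassert>

// Row-major addressing as used by the kernels: with leading dimension lda,
// OFFSET_ROW(A, i, j) expands to A + ((i) * (lda) + (j)).
float* offset_row(float* A, int i, int j, int lda) { return A + (i * lda + j); }

int main() {
  const int lda = 8;
  float buf[4 * 8] = {};             // a 4x8 tile stored row by row
  *offset_row(buf, 2, 3, lda) = 1.0f;
  assert(buf[2 * lda + 3] == 1.0f);  // same element, addressed directly
  return 0;
}
```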

From c84b719301b162d7e8d1d865e4875523bf7d66b4 Mon Sep 17 00:00:00 2001
From: dglr
Date: Mon, 16 Sep 2024 09:44:22 +0800
Subject: [PATCH 22/27] [Fix](mluOpCholesky): add layout check

---
 kernels/cholesky/cholesky.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernels/cholesky/cholesky.cpp b/kernels/cholesky/cholesky.cpp
index d9c6a4fa7..86d8d8638 100644
--- a/kernels/cholesky/cholesky.cpp
+++ b/kernels/cholesky/cholesky.cpp
@@ -268,6 +268,8 @@ mluOpCholesky(mluOpHandle_t handle, const mluOpTensorDescriptor_t input_desc,
   PARAM_CHECK("mluOpCholesky", d_input != NULL);
   PARAM_CHECK("mluOpCholesky", output_desc != NULL);
   PARAM_CHECK("mluOpCholesky", d_output != NULL);
+  PARAM_CHECK("mluOpCholesky", input_desc->layout == MLUOP_LAYOUT_ARRAY);
+  PARAM_CHECK("mluOpCholesky", output_desc->layout == MLUOP_LAYOUT_ARRAY);
 
   PARAM_CHECK("mluOpCholesky", input_desc->dim == 2 || input_desc->dim == 3);
   PARAM_CHECK("mluOpCholesky", output_desc->dim == input_desc->dim);
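Patch 22 makes the operator reject any descriptor whose layout is not `MLUOP_LAYOUT_ARRAY`. A minimal host-side sketch of how a caller could satisfy that check is below; it assumes the usual `mluOpCreateTensorDescriptor` / `mluOpSetTensorDescriptor` descriptor API from `mlu_op.h`, and the helper name and shapes are illustrative, not taken from the patch:

```cpp
#include "mlu_op.h"

// Sketch only: build a 2-D float descriptor with the ARRAY layout that the
// new PARAM_CHECK in mluOpCholesky expects; error handling is minimal.
mluOpStatus_t make_cholesky_desc(int n, int lda, mluOpTensorDescriptor_t* desc) {
  const int dims[2] = {n, lda};  // square matrix, leading dimension lda >= n
  mluOpStatus_t status = mluOpCreateTensorDescriptor(desc);
  if (status != MLUOP_STATUS_SUCCESS) return status;
  // MLUOP_LAYOUT_ARRAY is the only layout the added check accepts.
  return mluOpSetTensorDescriptor(*desc, MLUOP_LAYOUT_ARRAY, MLUOP_DTYPE_FLOAT,
                                  2, dims);
}
```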

From 1c7a41078ea3a1b236de6e0870c1239a86412699 Mon Sep 17 00:00:00 2001
From: dglr
Date: Mon, 16 Sep 2024 09:48:48 +0800
Subject: [PATCH 23/27] [Fix](mluOpCholesky): fix mem check

---
 kernels/cholesky/cholesky_union1.mlu         | 33 +++++++++++++-------
 kernels/cholesky/complex_cholesky_union1.mlu | 10 +++---
 2 files changed, 28 insertions(+), 15 deletions(-)

diff --git a/kernels/cholesky/cholesky_union1.mlu b/kernels/cholesky/cholesky_union1.mlu
index 22e328272..fda31bef3 100644
--- a/kernels/cholesky/cholesky_union1.mlu
+++ b/kernels/cholesky/cholesky_union1.mlu
@@ -176,7 +176,8 @@ static __mlu_func__ void spotf2_sminout_fixsize_device(int m, float* A,
 
   factor = diag[iter * POTF_NB + iter];
   if (factor <= 0) {
-    MLULOG("The input matrix is not positive definite.\n");
+    printf("The input matrix is not positive definite.\n");
+    exit(-1);
   }
   factor = std::sqrt(factor);
   factor = (1.0 / factor);
@@ -266,7 +267,8 @@ static __mlu_func__ void spotf2_sminout_anysize_device(int m, float* A,
   for (int iter = 0; iter < iter_num; iter++) {
     factor = A[iter * lda + iter];
     if (factor <= 0) {
-      MLULOG("The input matrix is not positive definite.\n");
+      printf("The input matrix is not positive definite.\n");
+      exit(-1);
     }
     factor = sqrt(factor);
     factor = 1.0 / factor;
@@ -311,7 +313,7 @@ __mlu_func__ void spotf2_smlpout_fixwidth_device(const int m, float* A0,
   if (id == 0) {
     for (int i = 0; i < span; i++) {
       __memcpy(A + (i * lda), sdata_A + i * POTF_NB, (i + 1) * sizeof(float),
-               SRAM2LDRAM);
+               SRAM2GDRAM);
     }
   } else if (id * span < m) {
@@ -425,7 +427,8 @@ __mlu_func__ void small_sminout_batch(int m, int width, float* dst,
   for (int iter = 0; iter < width; iter++) {
     factor = diag[iter * width + iter];
     if (factor <= 0) {
-      MLULOG("The input matrix is not positive definite.\n");
+      printf("The input matrix is not positive definite.\n");
+      exit(-1);
     }
     factor = sqrt(factor);
     factor = 1.0 / factor;
@@ -652,7 +655,7 @@ __mlu_entry__ void mlu_strsm_rectile_kernel(int batch, int stride, int m, int n,
   factor = 0;
   if (id == 0) {
-    __memcpy(sA, dA, sizeof(float), LDRAM2SRAM);
+    __memcpy(sA, dA, sizeof(float), GDRAM2SRAM);
   }
   if (if_execute)
     __memcpy(rBp, OFFSET_B_ROW(dB, start, 0), sizeof(float), LDRAM2NRAM,
@@ -666,7 +669,7 @@
              calc_length * sizeof(float), sizeof(float), span - 1);
   __sync_cluster();
   if (id == 0) {
-    __memcpy_async(sA, OFFSET_ROW(dA, 1, 0), 2 * sizeof(float), LDRAM2SRAM);
+    __memcpy_async(sA, OFFSET_ROW(dA, 1, 0), 2 * sizeof(float), GDRAM2SRAM);
   }
   if (if_execute)
     __memcpy_async(rBp, OFFSET_B_ROW(dB, start, 1), sizeof(float), LDRAM2NRAM,
@@ -686,7 +689,7 @@
     __sync_cluster();
     if (id == 0) {
       __memcpy_async(sA, OFFSET_ROW(dA, iter + 1, 0),
-                     (iter + 2) * sizeof(float), LDRAM2SRAM);
+                     (iter + 2) * sizeof(float), GDRAM2SRAM);
     }
     if (if_execute)
       __memcpy_async(rBp, OFFSET_B_ROW(dB, start, iter + 1), sizeof(float),
@@ -1144,22 +1147,30 @@ __mlu_global__ void set_zero(int batch, int stride, bool upper, int m,
   float* orignC = d_c;
   d_c = orignC + batch_id * stride;
   id = taskId % 4;
-  int span = m / 4;
+  int span = m;
   int pre = id * span;
   float* start_c = d_c + pre * lddc + pre;
+  float* zero_space = (float*)nram_buffer;
+  __memset_nram(zero_space, m, (float)ZERO);
+
   float* temp_c = start_c;
   if (id == 3) {
     span = m - 3 * span;
   }
+
   for (int i = 0; i < span - 1; i++) {
     temp_c = start_c + i * lddc + i;
     int num = m - pre - i;
-    __ldramset(temp_c + 1, num - 1, 0);
+    if (num > 1 && id == 0) {
+      __memcpy(temp_c + 1, zero_space, sizeof(float)*(num-1), NRAM2GDRAM);
+    }
   }
-  if (id != 3 && span > 0) {
+  if (id == 0 && span > 0) {
     temp_c = start_c + (span - 1) * lddc + span - 1;
     int num = m - pre - span + 1;
-    __ldramset(temp_c + 1, num - 1, 0);
+    if (num > 1) {
+      __memcpy(temp_c + 1, zero_space, sizeof(float)*(num-1), NRAM2GDRAM);
+    }
   }
 }
diff --git a/kernels/cholesky/complex_cholesky_union1.mlu b/kernels/cholesky/complex_cholesky_union1.mlu
index 1fd015212..3346156f0 100644
--- a/kernels/cholesky/complex_cholesky_union1.mlu
+++ b/kernels/cholesky/complex_cholesky_union1.mlu
@@ -145,7 +145,8 @@ __mlu_func__ void small_cminout(int m, int width, float* dst,
   for (int iter = 0; iter < width; iter++) {
     factor = rdiag[(iter * CPOTF_NB + iter)];
     if (factor <= 0) {
-      MLULOG("The input matrix is not positive definite.\n");
+      printf("The input matrix is not positive definite.\n");
+      exit(-1);
     }
     factor = sqrt(factor);
     factor = 1.0 / factor;
@@ -202,9 +203,9 @@ __mlu_func__ void cmplout(int batch, const int m, float* rA0, float* rA,
   if (id == 0) {
     for (int i = 0; i < width; i++) {
       __memcpy((rA + (i * lda)), (rdst + (i * CPOTF_NB)),
-               (i + 1) * sizeof(float), NRAM2LDRAM);
+               (i + 1) * sizeof(float), NRAM2GDRAM);
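The guard that patch 23 tightens (printf plus exit when a diagonal pivot is not positive) reflects the basic property the factorization relies on: every pivot of a positive-definite matrix stays positive while the trailing matrix is updated. Below is a plain host-side C++ sketch of an unblocked, row-major lower Cholesky with the same guard; it is illustrative only and is not the kernel code.

```cpp
#include <cmath>
#include <cstdio>

// Unblocked lower Cholesky of a row-major n*n matrix with leading dimension lda.
// Returns false (instead of the device code's printf/exit) when a pivot <= 0,
// i.e. when the input is not positive definite.
bool cholesky_lower(float* A, int n, int lda) {
  for (int j = 0; j < n; ++j) {
    // Pivot: a_jj minus the contribution of the already computed part of row j.
    float pivot = A[j * lda + j];
    for (int k = 0; k < j; ++k) pivot -= A[j * lda + k] * A[j * lda + k];
    if (pivot <= 0.0f) {
      std::printf("The input matrix is not positive definite.\n");
      return false;
    }
    const float diag = std::sqrt(pivot);
    const float inv = 1.0f / diag;  // same 1/sqrt(pivot) scaling as the kernels
    A[j * lda + j] = diag;
    // Compute the column below the pivot: L(i,j) = (a_ij - sum_k L_ik*L_jk) / L_jj.
    for (int i = j + 1; i < n; ++i) {
      float v = A[i * lda + j];
      for (int k = 0; k < j; ++k) v -= A[i * lda + k] * A[j * lda + k];
      A[i * lda + j] = v * inv;
    }
  }
  return true;
}
```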
       __memcpy((iA + (i * lda)), (idst + (i * CPOTF_NB)),
-               (i + 1) * sizeof(float), NRAM2LDRAM);
+               (i + 1) * sizeof(float), NRAM2GDRAM);
     }
   } else if (if_execute) {
@@ -289,7 +290,8 @@ __mlu_func__ void small_cminout_batch(int m, int width, float* r_dst,
   for (int iter = 0; iter < width; iter++) {
     factor = r_diag[iter * width + iter];
     if (factor <= 0) {
-      MLULOG("The input matrix is not positive definite.\n");
+      printf("The input matrix is not positive definite.\n");
+      exit(-1);
     }
     factor = sqrt(factor);
     factor = 1.0 / factor;

From 18e299e82a1d7668319a2daa62763a3e2bf203f8 Mon Sep 17 00:00:00 2001
From: dglr
Date: Mon, 16 Sep 2024 09:50:14 +0800
Subject: [PATCH 24/27] [Docs](mluOpCholesky): add test doc

---
 docs/design_docs/cholesky/cholesky.md        |  16 ++++++++++++++++
 docs/design_docs/cholesky/coverage_error.png | Bin 0 -> 73318 bytes
 2 files changed, 16 insertions(+)
 create mode 100644 docs/design_docs/cholesky/coverage_error.png

diff --git a/docs/design_docs/cholesky/cholesky.md b/docs/design_docs/cholesky/cholesky.md
index d8d454c30..7d87b0bd9 100644
--- a/docs/design_docs/cholesky/cholesky.md
+++ b/docs/design_docs/cholesky/cholesky.md
@@ -444,6 +444,22 @@ complex-type multi-batch performance test:
 The red boxes in the figure mark the calls into the underlying matrix multiplication. Since there is no low-level complex matrix-multiply implementation, each complex matrix multiply is currently assembled from 4 float matrix multiplies. The matrix multiplications together already account for 60% of the time and take more than 2000 microseconds, which is already more than 10x the PyTorch runtime.
 
+#### 3.3.3 New-feature tests
+
+- [✓] Data type tests: float and complex float are covered
+- [✓] Multi-dimensional tensor tests: 2-D and 3-D tensors are covered; other ranks are rejected at the operator entry
+- [✓] Layout tests: multiple layouts are tested
+- [✓] Different sizes / remainder segments / aligned and unaligned tests: inputs of different sizes and shapes are covered
+- [✓] Zero-dimensional tensor / 0-element tests: covered; 0 elements raise an error at runtime
+- [✓] Stability tests
+- [ ] Multi-platform tests
+- [✓] gen_case module tests
+- [✓] nan / inf tests: a nan in the input raises an error at runtime, consistent with PyTorch
+- [✓] Bug-fix tests
+- [✓] Memory-leak check
+- [ ] Code-coverage check: code coverage cannot be measured
+![image](coverage_error.png)
+
 ### 3.4 Parameter checks (fool-proofing)
 The operator performs the following checks:
 * All pointers are non-NULL
diff --git a/docs/design_docs/cholesky/coverage_error.png b/docs/design_docs/cholesky/coverage_error.png
new file mode 100644
index 0000000000000000000000000000000000000000..7c49188dca78848ce1aaeb6a4a021bc219bc6c51
GIT binary patch
literal 73318
[73318 bytes of base85-encoded PNG data for coverage_error.png omitted]

[binary patch data omitted]

From: dglr
Date: Fri, 20 Sep 2024 12:49:38 +0800
Subject: [PATCH 25/27] [Docs](mluOpCholesky): add coverage test

---
 docs/design_docs/cholesky/cholesky.md  |   4 ++--
 docs/design_docs/cholesky/coverage.png | Bin 0 -> 61954 bytes
 2 files changed, 2 insertions(+), 2 deletions(-)
 create mode 100644 docs/design_docs/cholesky/coverage.png

diff --git a/docs/design_docs/cholesky/cholesky.md b/docs/design_docs/cholesky/cholesky.md
index 7d87b0bd9..2a5605618 100644
--- a/docs/design_docs/cholesky/cholesky.md
+++ b/docs/design_docs/cholesky/cholesky.md
@@ -457,8 +457,8 @@ complex type multi-batch performance test:
 - [✓] nan / inf test: if the input contains nan, an error is reported at runtime, consistent with PyTorch
 - [✓] bug fix test
 - [✓] memory leak check
-- [ ] code coverage check: code coverage could not be measured
-![image](coverage_error.png)
+- [✓] code coverage check: code coverage meets the 95% requirement
+![image](coverage.png)
 ### 3.4 Defensive checks
 The operator performs the following checks:
diff --git a/docs/design_docs/cholesky/coverage.png b/docs/design_docs/cholesky/coverage.png
new file mode 100644
index 0000000000000000000000000000000000000000..b1568db126099305c81045a85c997b369fced6d6
GIT binary patch
literal 61954
[binary image data for coverage.png omitted]

From 3632c317bfdddb960597439b4a63a5427398d3b2 Mon Sep 17 00:00:00 2001
From: dglr
Date: Fri, 20 Sep 2024 13:29:29 +0800
Subject: [PATCH 26/27] [Fix](mluOpCholesky): add dimension equals test

---
 kernels/cholesky/cholesky.cpp | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/kernels/cholesky/cholesky.cpp b/kernels/cholesky/cholesky.cpp
index 86d8d8638..9bc736abb 100644
--- a/kernels/cholesky/cholesky.cpp
+++ b/kernels/cholesky/cholesky.cpp
@@ -277,6 +277,13 @@ mluOpCholesky(mluOpHandle_t handle, const mluOpTensorDescriptor_t input_desc,
   PARAM_CHECK("mluOpCholesky", input_desc->dims[1] > 0);
   PARAM_CHECK("mluOpCholesky", output_desc->dims[0] > 0);
   PARAM_CHECK("mluOpCholesky", output_desc->dims[1] > 0);
+  if (input_desc->dim == 2) {
+    PARAM_CHECK("mluOpCholesky", input_desc->dims[0] == input_desc->dims[1]);
+    PARAM_CHECK("mluOpCholesky", output_desc->dims[0] == output_desc->dims[1]);
+  } else {
+    PARAM_CHECK("mluOpCholesky", input_desc->dims[1] == input_desc->dims[2]);
+    PARAM_CHECK("mluOpCholesky", output_desc->dims[1] == output_desc->dims[2]);
+  }
 
   cnrtQueue_t queue;
   mluOpGetQueue(handle, &queue);

From f863840f1bb3badd2bb191a811088a36ffe03e4a Mon Sep 17 00:00:00 2001
From: dglr
Date: Sat, 21 Sep 2024 09:47:31 +0800
Subject: [PATCH 27/27] [Fix](mluOpCholesky): add coverage function

---
 kernels/cholesky/cholesky.cpp                |   3 -
 kernels/cholesky/cholesky.h                  |   4 +-
 kernels/cholesky/cholesky_union1.mlu         | 137 ---------------
 kernels/cholesky/complex_cholesky_union1.mlu | 171 -------------------
 4 files changed, 1 insertion(+), 314 deletions(-)

diff --git a/kernels/cholesky/cholesky.cpp b/kernels/cholesky/cholesky.cpp
index 9bc736abb..b787a3f42 100644
--- a/kernels/cholesky/cholesky.cpp
+++ b/kernels/cholesky/cholesky.cpp
@@ -58,9 +58,6 @@ mluOpStatus_t MLUOP_WIN_API mluOpGetCholeskyWorkspaceSize(
     *size = size_a * size_a * sizeof(float) * 2 * batch_size * 3;
   }
   printf("workspace size:%ul\n", (int)(*size));
-  // if (*size > 0) {
-  //   CHECK_RETURN("mluOpCholesky", workspace_malloc(*size, workspace));
-  // }
   return MLUOP_STATUS_SUCCESS;
 }
 
diff --git a/kernels/cholesky/cholesky.h b/kernels/cholesky/cholesky.h
index 7859f3f46..a20c1cbde 100644
--- a/kernels/cholesky/cholesky.h
+++ b/kernels/cholesky/cholesky.h
@@ -45,7 +45,7 @@
 #include "kernels/debug.h"
 #include "kernels/utils/cnnl_helper.h"
 
-#define CNB (16)
+#define CNB (32)
 #define REC_NB (16)
From f863840f1bb3badd2bb191a811088a36ffe03e4a Mon Sep 17 00:00:00 2001
From: dglr
Date: Sat, 21 Sep 2024 09:47:31 +0800
Subject: [PATCH 27/27] [Fix](mluOpCholesky): add coverage function

---
 kernels/cholesky/cholesky.cpp                |   3 -
 kernels/cholesky/cholesky.h                  |   4 +-
 kernels/cholesky/cholesky_union1.mlu         | 137 ---------------
 kernels/cholesky/complex_cholesky_union1.mlu | 171 -------------------
 4 files changed, 1 insertion(+), 314 deletions(-)

diff --git a/kernels/cholesky/cholesky.cpp b/kernels/cholesky/cholesky.cpp
index 9bc736abb..b787a3f42 100644
--- a/kernels/cholesky/cholesky.cpp
+++ b/kernels/cholesky/cholesky.cpp
@@ -58,9 +58,6 @@ mluOpStatus_t MLUOP_WIN_API mluOpGetCholeskyWorkspaceSize(
     *size = size_a * size_a * sizeof(float) * 2 * batch_size * 3;
   }
   printf("workspace size:%ul\n", (int)(*size));
-  // if (*size > 0) {
-  //   CHECK_RETURN("mluOpCholesky", workspace_malloc(*size, workspace));
-  // }
   return MLUOP_STATUS_SUCCESS;
 }
 
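As a worked example of the size formula visible as context in the hunk above (taking that branch at face value; which branch applies depends on the data type): with an assumed batch_size = 4 and size_a = 128, *size = 128 * 128 * sizeof(float) * 2 * 4 * 3 = 1,572,864 bytes, i.e. 1.5 MiB of workspace. The values 4 and 128 are purely illustrative, not defaults of the operator.
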
diff --git a/kernels/cholesky/cholesky.h b/kernels/cholesky/cholesky.h
index 7859f3f46..a20c1cbde 100644
--- a/kernels/cholesky/cholesky.h
+++ b/kernels/cholesky/cholesky.h
@@ -45,7 +45,7 @@
 #include "kernels/debug.h"
 #include "kernels/utils/cnnl_helper.h"
 
-#define CNB (16)
+#define CNB (32)
 #define REC_NB (16)
 #define POTF_NB ((REC_NB) / 4)
 #define CREC_NB (16)
@@ -99,8 +99,6 @@ mluOpStatus_t cgemm(int batch, bool trans_a, bool trans_b, int m, int n, int k,
                     int stride_b, float* d_rc, float* d_ic, int ldc,
                     int stride_c, mluOpHandle_t handle, float* workspace);
 
-mluOpStatus_t workspace_malloc(size_t size, float** workspace);
-
 mluOpStatus_t workspace_free(float** workspace);
 
 mluOpStatus_t set_half_zero(int batch, int stride, float* d_a, int lda, int m,
diff --git a/kernels/cholesky/cholesky_union1.mlu b/kernels/cholesky/cholesky_union1.mlu
index fda31bef3..f983f3bf1 100644
--- a/kernels/cholesky/cholesky_union1.mlu
+++ b/kernels/cholesky/cholesky_union1.mlu
@@ -42,19 +42,6 @@ unsigned int next_power_of_2(unsigned int n) {
 __nram__ uint8_t nram_buffer[MAX_NRAM_SIZE];
 __mlu_shared__ uint8_t sram_buffer[MAX_SRAM_SIZE];
 
-__mlu_func__ float recur_add(float* input, int length) {
-  if (length == 1) {
-    return input[0];
-  } else {
-    int half_length;
-    half_length = length / 2;
-    float sum1 = recur_add(input, half_length);
-    float sum2 = recur_add(input + half_length, length - half_length);
-    input[0] = sum1 + sum2;
-    return sum1 + sum2;
-  }
-}
-
 __mlu_func__ float kahansum(float* input, int length) {
   float sum = 0.0;
   float c = 0.0;
@@ -797,130 +784,6 @@ mluOpStatus_t strsm_rectile(int batch, int stride, bool upper, bool trans,
   return MLUOP_STATUS_SUCCESS;
 }
 
-__mlu_global__ void add_c_batch(int batch, int stride, float beta, float* d_c,
-                                float* src, int ldc, int ldsrc, int m, int n) {
-  int id = taskId;
-  int batch_id = id;
-  if (batch_id >= batch) return;
-  float* orignC = d_c;
-  float* orignSrc = src;
-  d_c = orignC + batch_id * stride;
-  src = orignSrc + batch_id * m * n;
-
-  if (beta == 0.0f) {
-    __memcpy(d_c, src, n * sizeof(float), GDRAM2GDRAM, ldc * sizeof(float),
-             ldsrc * sizeof(float), m - 1);
-    return;
-  }
-
-  float* a_sram = (float*)nram_buffer + m * n;
-
-  __memcpy(nram_buffer, d_c, n * sizeof(float), LDRAM2NRAM, n * sizeof(float),
-           ldc * sizeof(float), m - 1);
-  __memcpy(a_sram, src, n * m * sizeof(float), LDRAM2NRAM);
-
-  __sync();
-
-  int32_t data_num = m * n;
-  const float* a_offset = a_sram;
-  const float* b_offset = (float*)nram_buffer;
-
-  float* a_nram = (float*)a_offset;
-  float* b_nram = (float*)b_offset;
-
-  __bang_add(b_nram, a_nram, b_nram, data_num);
-
-  __memcpy(d_c, b_nram, n * sizeof(float), NRAM2LDRAM, ldc * sizeof(float),
-           n * sizeof(float), m - 1);
-
-  __sync();
-}
-
-__mlu_global__ void add_c(int batch, int stride, float beta, float* d_c,
-                          float* src, int ldc, int ldsrc, int m, int n) {
-  int id = taskId;
-  int ipu_per_cluster = 4;
-  int batch_id = id / ipu_per_cluster;
-  if (batch_id >= batch) return;
-  id = taskId % ipu_per_cluster;
-  float* orignC = d_c;
-  float* orignSrc = src;
-  d_c = orignC + batch_id * stride;
-  src = orignSrc + batch_id * m * n;
-
-  if (beta == 0.0f) {
-    if (id == 0) {
-      __memcpy(sram_buffer, src, n * sizeof(float), GDRAM2SRAM,
-               n * sizeof(float), ldsrc * sizeof(float), m - 1);
-    }
-    __sync_cluster();
-    if (id == 0) {
-      __memcpy(d_c, sram_buffer, n * sizeof(float), SRAM2LDRAM,
-               ldc * sizeof(float), n * sizeof(float), m - 1);
-    }
-    __sync_cluster();
-    return;
-  }
-
-  float* a_sram = (float*)sram_buffer + 3 * m * n;
-
-  if (id == 0) {
-    __memcpy(sram_buffer, d_c, n * sizeof(float), GDRAM2SRAM, n * sizeof(float),
-             ldc * sizeof(float), m - 1);
-    __memcpy(a_sram, src, n * m * sizeof(float), GDRAM2SRAM);
-  }
-
-  __sync_cluster();
-
-  int32_t data_num = m * n;
-  int32_t data_per_core = data_num / ipu_per_cluster;
-  int32_t data_last_core = data_per_core + data_num % ipu_per_cluster;
-  const float* a_offset = a_sram + id * data_per_core;
-  const float* b_offset = (float*)sram_buffer + id * data_per_core;
-  float* output_offset = (float*)sram_buffer + id * data_per_core;
-
-  if (id == ipu_per_cluster - 1) {
-    data_per_core = data_last_core;
-  }
-
-  int32_t align_num = NFU_ALIGN_SIZE / sizeof(float);
-  int32_t data_nram_num =
-      MAX_NRAM_SIZE / sizeof(float) / 2 / align_num * align_num;
-  float* a_nram = (float*)nram_buffer;
-  float* b_nram = (float*)a_nram + data_nram_num;
-  int32_t loop_num = data_per_core / data_nram_num;
-  int32_t rem_nram_num = data_per_core % data_nram_num;
-
-  for (int32_t i = 0; i < loop_num; i++) {
-    __memcpy(a_nram, a_offset + i * data_nram_num,
-             data_nram_num * sizeof(float), SRAM2NRAM);
-    __memcpy(b_nram, b_offset + i * data_nram_num,
-             data_nram_num * sizeof(float), SRAM2NRAM);
-    __bang_add(a_nram, a_nram, b_nram, data_nram_num);
-    __memcpy(output_offset + i * data_nram_num, a_nram,
-             data_nram_num * sizeof(float), NRAM2SRAM);
-  }
-  if (rem_nram_num != 0) {
-    int32_t rem_align_num =
-        (rem_nram_num + align_num - 1) / align_num * align_num;
-    __memcpy(a_nram, a_offset + loop_num * data_nram_num,
-             rem_nram_num * sizeof(float), SRAM2NRAM);
-    __memcpy(b_nram, b_offset + loop_num * data_nram_num,
-             rem_nram_num * sizeof(float), SRAM2NRAM);
-    __bang_add(a_nram, a_nram, b_nram, rem_align_num);
-    __memcpy(output_offset + loop_num * data_nram_num, a_nram,
-             rem_nram_num * sizeof(float), NRAM2SRAM);
-  }
-  __sync_cluster();
-
-  if (id == 0) {
-    __memcpy(d_c, sram_buffer, n * sizeof(float), SRAM2GDRAM,
-             ldc * sizeof(float), n * sizeof(float), m - 1);
-  }
-
-  __sync_cluster();
-}
-
 mluOpStatus_t sgemm(int batch, bool trans_a, bool trans_b, int m, int n, int k,
                     float alpha, float beta, float* d_a, int lda, int stride_a,
                     float* d_b, int ldb, int stride_b, float* d_c, int ldc,
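Alongside the deletions above, the kernel keeps the compensated-summation helper kahansum, whose opening lines appear as context in the first hunk of this file. A plain C++ sketch of the same Kahan summation idea follows, written as host code purely for illustration; the function name kahan_sum is hypothetical and this is not the MLU kernel itself.

    // Compensated (Kahan) summation: c accumulates the low-order bits that a
    // naive running sum would lose when adding small terms to a large total.
    float kahan_sum(const float* input, int length) {
      float sum = 0.0f;
      float c = 0.0f;
      for (int i = 0; i < length; ++i) {
        float y = input[i] - c;  // re-inject previously lost low-order bits
        float t = sum + y;       // big + small: low-order bits of y drop out
        c = (t - sum) - y;       // what was just lost, to be re-injected next
        sum = t;
      }
      return sum;
    }
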
diff --git a/kernels/cholesky/complex_cholesky_union1.mlu b/kernels/cholesky/complex_cholesky_union1.mlu
index 3346156f0..bc24ad707 100644
--- a/kernels/cholesky/complex_cholesky_union1.mlu
+++ b/kernels/cholesky/complex_cholesky_union1.mlu
@@ -407,177 +407,6 @@ mluOpStatus_t mlu_cpotf_lpin(int batch, int stride, int n, int lda, float* drA,
   return MLUOP_STATUS_SUCCESS;
 }
 
-__mlu_global__ void add_c1(int batch, int stride, float beta, float* d_c,
-                           float* src, int ldc, int ldsrc, int m, int n) {
-  int id = taskId;
-  int ipu_per_cluster = 4;
-  int batch_id = id / ipu_per_cluster;
-  if (batch_id >= batch) return;
-  id = taskId % ipu_per_cluster;
-  float* orignC = d_c;
-  float* orignSrc = src;
-  d_c = orignC + batch_id * stride;
-  src = orignSrc + batch_id * m * n;
-
-  if (beta == 0.0f) {
-    if (id == 0) {
-      __memcpy(sram_buffer, src, n * sizeof(float), GDRAM2SRAM,
-               n * sizeof(float), ldsrc * sizeof(float), m - 1);
-    }
-    __sync_cluster();
-    if (id == 0) {
-      __memcpy(d_c, sram_buffer, n * sizeof(float), SRAM2LDRAM,
-               ldc * sizeof(float), n * sizeof(float), m - 1);
-    }
-    __sync_cluster();
-    return;
-  }
-
-  float* a_sram = (float*)sram_buffer + 3 * m * n;
-
-  if (id == 0) {
-    __memcpy(sram_buffer, d_c, n * sizeof(float), GDRAM2SRAM, n * sizeof(float),
-             ldc * sizeof(float), m - 1);
-    __memcpy(a_sram, src, n * m * sizeof(float), GDRAM2SRAM);
-  }
-
-  __sync_cluster();
-
-  int32_t data_num = m * n;
-  int32_t data_per_core = data_num / ipu_per_cluster;
-  int32_t data_last_core = data_per_core + data_num % ipu_per_cluster;
-  const float* a_offset = a_sram + id * data_per_core;
-  const float* b_offset = (float*)sram_buffer + id * data_per_core;
-  float* output_offset = (float*)sram_buffer + id * data_per_core;
-
-  if (id == ipu_per_cluster - 1) {
-    data_per_core = data_last_core;
-  }
-
-  int32_t align_num = NFU_ALIGN_SIZE / sizeof(float);
-
-  int32_t data_nram_num =
-      MAX_NRAM_SIZE / sizeof(float) / 2 / align_num * align_num;
-  float* a_nram = (float*)nram_buffer;
-  float* b_nram = (float*)a_nram + data_nram_num;
-  int32_t loop_num = data_per_core / data_nram_num;
-  int32_t rem_nram_num = data_per_core % data_nram_num;
-
-  for (int32_t i = 0; i < loop_num; i++) {
-    __memcpy(a_nram, a_offset + i * data_nram_num,
-             data_nram_num * sizeof(float), SRAM2NRAM);
-    __memcpy(b_nram, b_offset + i * data_nram_num,
-             data_nram_num * sizeof(float), SRAM2NRAM);
-    __bang_add(a_nram, a_nram, b_nram, data_nram_num);
-    __memcpy(output_offset + i * data_nram_num, a_nram,
-             data_nram_num * sizeof(float), NRAM2SRAM);
-  }
-  if (rem_nram_num != 0) {
-    int32_t rem_align_num =
-        (rem_nram_num + align_num - 1) / align_num * align_num;
-    __memcpy(a_nram, a_offset + loop_num * data_nram_num,
-             rem_nram_num * sizeof(float), SRAM2NRAM);
-    __memcpy(b_nram, b_offset + loop_num * data_nram_num,
-             rem_nram_num * sizeof(float), SRAM2NRAM);
-    __bang_add(a_nram, a_nram, b_nram, rem_align_num);
-    __memcpy(output_offset + loop_num * data_nram_num, a_nram,
-             rem_nram_num * sizeof(float), NRAM2SRAM);
-  }
-  __sync_cluster();
-
-  if (id == 0) {
-    __memcpy(d_c, sram_buffer, n * sizeof(float), SRAM2GDRAM,
-             ldc * sizeof(float), n * sizeof(float), m - 1);
-  }
-
-  __sync_cluster();
-}
-
-__mlu_global__ void complex_add_c(int batch, int stride, float beta, float* d_c,
-                                  float* src, int ldc, int ldsrc, int m,
-                                  int n) {
-  int id = taskId;
-  int ipu_per_cluster = 4;
-  id = taskId;
-
-  int span = m / 4;
-  int finish = id * span;
-  if (id == 3) {
-    span = m - 3 * span;
-  }
-
-  float* sram_buffer = (float*)nram_buffer;
-  if (beta == 0.0f) {
-    if (id == 0) {
-      __memcpy(sram_buffer, src, n * sizeof(float), GDRAM2NRAM,
-               n * sizeof(float), ldsrc * sizeof(float), m - 1);
-    }
-    __sync_cluster();
-    if (id == 0) {
-      __memcpy(d_c, sram_buffer, n * sizeof(float), NRAM2LDRAM,
-               ldc * sizeof(float), n * sizeof(float), m - 1);
-    }
-    __sync_cluster();
-    return;
-  }
-
-  float* a_sram = (float*)sram_buffer + 3 * m * n;
-
-  int d_c_offset = ldc * finish;
-  int src_offset = ldsrc * finish;
-
-  __memcpy(sram_buffer, d_c + d_c_offset, n * sizeof(float), LDRAM2NRAM,
-           n * sizeof(float), ldc * sizeof(float), span - 1);
-  __memcpy(a_sram, src + src_offset, n * span * sizeof(float), LDRAM2NRAM);
-
-  int32_t data_per_core = span * n;
-  int32_t data_last_core = data_per_core;
-  const float* a_offset = a_sram;
-  const float* b_offset = (float*)sram_buffer;
-  float* output_offset = (float*)sram_buffer;
-
-  if (id == ipu_per_cluster - 1) {
-    data_per_core = data_last_core;
-  }
-
-  int32_t align_num = NFU_ALIGN_SIZE / sizeof(float);
-
-  int32_t data_nram_num =
-      MAX_NRAM_SIZE / sizeof(float) / 2 / align_num * align_num;
-  float* a_nram = (float*)a_sram + m * n;
-  float* b_nram = (float*)a_nram + data_nram_num;
-  int32_t loop_num = data_per_core / data_nram_num;
-  int32_t rem_nram_num = data_per_core % data_nram_num;
-
-  for (int32_t i = 0; i < loop_num; i++) {
-    __memcpy(a_nram, a_offset + i * data_nram_num,
-             data_nram_num * sizeof(float), NRAM2NRAM);
-    __memcpy(b_nram, b_offset + i * data_nram_num,
-             data_nram_num * sizeof(float), NRAM2NRAM);
-    __bang_add(a_nram, a_nram, b_nram, data_nram_num);
-    __memcpy(output_offset + i * data_nram_num, a_nram,
-             data_nram_num * sizeof(float), NRAM2NRAM);
-  }
-  if (rem_nram_num != 0) {
-    int32_t rem_align_num =
-        (rem_nram_num + align_num - 1) / align_num * align_num;
-    __memcpy(a_nram, a_offset + loop_num * data_nram_num,
-             rem_nram_num * sizeof(float), NRAM2NRAM);
-    __memcpy(b_nram, b_offset + loop_num * data_nram_num,
-             rem_nram_num * sizeof(float), NRAM2NRAM);
-    __bang_add(a_nram, a_nram, b_nram, rem_align_num);
-    __memcpy(output_offset + loop_num * data_nram_num, a_nram,
-             rem_nram_num * sizeof(float), NRAM2NRAM);
-  }
-  __memcpy(d_c + d_c_offset, sram_buffer, n * sizeof(float), NRAM2LDRAM,
-           ldc * sizeof(float), n * sizeof(float), span - 1);
-}
-
-mluOpStatus_t workspace_malloc(size_t size, float** workspace) {
-  CNRT_CHECK(cnrtMalloc((void**)workspace, size));
-
-  return MLUOP_STATUS_SUCCESS;
-}
 
 __mlu_global__ void complex_inverse_kernel(int batch, float* rd_input,
                                            float* id_input, int ld_input,