refactor: 重构变量处理模块, 采用模块化设计而为计算变量等未来功能提供支持

LeafYeeXYZ · Oct 27, 2024 · 4b62023 · 4b62023
1 parent c8bccca
commit 4b62023
Show file tree

Hide file tree

Showing 8 changed files with 519 additions and 418 deletions.
diff --git a/src/lib/derive.ts b/src/lib/derive.ts
@@ -0,0 +1,174 @@
+/**
+ * @file 处理派生变量, 包括标准化, 中心化, 离散化
+ */
+
+import type { Variable, AllowedDiscreteMethods } from './types'
+import { mean, std, quantileSeq, max, min } from 'mathjs'
+import { calculateMode } from './utils'
+import { kmeans } from 'ml-kmeans'
+
+/** Builds derived (child) variables — standardized, centered and discretized — from source columns. */
+export class Derive {
+
+  /**
+   * Generates every derived variable requested through a column's `subVars`.
+   * Derived values are written into the objects of `dataRows` in place.
+   * @param dataCols column descriptors; columns already flagged `derived` are skipped
+   * @param dataRows data rows keyed by column name
+   */
+  constructor(
+    dataCols: Variable[],
+    dataRows: { [key: string]: unknown }[]
+  ) {
+    const derivedCols: Variable[] = []
+    dataCols.forEach((col) => {
+      // Derived columns never spawn further derived columns.
+      if (col.derived) {
+        return
+      }
+
+      /**
+       * Registers the linear transform `(x - col.mean) / scale` of `col` as a
+       * new derived column and writes the transformed value into every row.
+       * @param name name of the derived column (also the row key)
+       * @param scale divisor applied after subtracting the mean
+       * @param derivedStd standard deviation reported for the derived column
+       */
+      const addLinear = (name: string, scale: number, derivedStd: Variable['std']) => {
+        const shift = col.mean!
+        const transform = (value: number) => (value - shift) / scale
+        derivedCols.push({
+          name,
+          derived: true,
+          count: col.count,
+          missing: col.missing,
+          valid: col.valid,
+          unique: col.unique,
+          type: col.type,
+          min: transform(col.min!),
+          max: transform(col.max!),
+          mean: 0,
+          q1: transform(col.q1!),
+          q2: transform(col.q2!),
+          q3: transform(col.q3!),
+          std: derivedStd,
+          // The mode is stored as text and may carry a marker suffix; transform
+          // the numeric part and re-attach the marker when it is present.
+          mode: transform(parseFloat(col.mode!)).toFixed(4) + (/皮尔逊经验公式/.test(col.mode!) ? '(皮尔逊经验公式)' : ''),
+        })
+        dataRows.forEach((row) => {
+          const raw = row[col.name]
+          // A missing value stays missing. Converting it with Number() would
+          // store NaN, unlike the discretized column which keeps undefined.
+          row[name] = typeof raw === 'undefined' ? undefined : transform(Number(raw))
+        })
+      }
+
+      if (col.subVars?.standard) {
+        // z-score: subtract the mean, divide by the standard deviation.
+        addLinear(`${col.name}_标准化`, col.std!, 1)
+      }
+      if (col.subVars?.center) {
+        // Centering only shifts the values, so the spread is unchanged.
+        addLinear(`${col.name}_中心化`, 1, col.std)
+      }
+      if (col.subVars?.discrete) {
+        const groups = col.subVars.discrete.groups
+        const method = col.subVars.discrete.method
+        const name = `${col.name}_${method}离散`
+        // The discretizer is fitted on the non-missing values only.
+        const discrete = new Discrete(
+          dataRows.filter((row) => typeof row[col.name] !== 'undefined').map((row) => Number(row[col.name])),
+          groups,
+          method
+        )
+        // One prediction per row; missing inputs yield undefined.
+        const predictedData = dataRows.map((row) => discrete.predictor(typeof row[col.name] !== 'undefined' ? Number(row[col.name]) : undefined))
+        const predictedNums = predictedData.filter((v) => typeof v !== 'undefined') as number[]
+        derivedCols.push({
+          name,
+          derived: true,
+          count: col.count,
+          missing: col.missing,
+          valid: col.valid,
+          unique: groups,
+          type: col.type,
+          min: 0,
+          max: groups - 1,
+          mean: Number(mean(predictedNums)),
+          q1: quantileSeq(predictedNums, 0.25),
+          q2: quantileSeq(predictedNums, 0.5),
+          q3: quantileSeq(predictedNums, 0.75),
+          std: Number(std(predictedNums)),
+          mode: calculateMode(predictedNums),
+        })
+        predictedData.forEach((v, i) => {
+          dataRows[i][name] = v
+        })
+      }
+    })
+    // Derived columns are listed ahead of the source columns.
+    this.updatedCols = [...derivedCols, ...dataCols]
+    this.updatedRows = dataRows
+  }
+
+  /** Column descriptors after derivation (derived columns first). */
+  updatedCols: Variable[]
+  /** Data rows after derivation (same array that was passed in). */
+  updatedRows: { [key: string]: unknown }[]
+
+}
+
+/** Discretizes a numeric variable into a fixed number of groups. */
+class Discrete {
+
+  /**
+   * Fits a discretizer on `data` and exposes it as `predictor`.
+   * @param data raw numeric values (missing values already removed)
+   * @param groups number of groups
+   * @param methed discretization method
+   * @throws Error when `methed` is not one of the supported methods
+   */
+  constructor(data: number[], groups: number, methed: AllowedDiscreteMethods) {
+
+    this.method = methed
+    this.groups = groups
+    // An explicit numeric comparator is required: the default sort order
+    // compares elements as strings, which would place 10 before 5.
+    this.#data = [...data].sort((a, b) => a - b)
+    this.#min = min(data)
+    this.#max = max(data)
+    switch (methed) {
+      case '等宽': {
+        // Equal width: split [min, max] into `groups` intervals of equal length.
+        this.predictor = (data: number | undefined) => {
+          if (typeof data === 'undefined') return undefined
+          // All values identical: the interval width is 0, so everything
+          // belongs to the first group (avoids 0 / 0 = NaN).
+          if (this.#range === 0) return 0
+          const index = Math.floor((data - this.#min) / (this.#range / this.groups))
+          // The maximum lies exactly on the upper edge and would otherwise get
+          // index `groups`; fold it into the last group.
+          return Math.min(index, this.groups - 1)
+        }
+        break
+      }
+      case '等频': {
+        // Equal frequency: group by the rank of the value in the sorted data.
+        this.predictor = (data: number | undefined) => {
+          if (typeof data === 'undefined') return undefined
+          return Math.floor(this.#data.findIndex((v) => v >= data) / (this.#count / this.groups))
+        }
+        break
+      }
+      case '聚类分析': {
+        // Cluster analysis: one-dimensional k-means; each fitted value is
+        // mapped to the cluster index assigned to it.
+        const { clusters } = kmeans(data.map((v) => [v]), groups, {})
+        this.#kmeans = new Map(clusters.map((v, i) => [data[i], v]))
+        this.predictor = (index: number | undefined) => {
+          if (typeof index === 'undefined') return undefined
+          return this.#kmeans?.get(index)
+        }
+        break
+      }
+      default:
+        // Fail at construction instead of leaving `predictor` unassigned.
+        throw new Error(`Unsupported discretization method: ${String(methed)}`)
+    }
+
+  }
+
+  /** Maps a raw value to its group index; undefined input yields undefined. */
+  predictor: (data: number | undefined) => number | undefined
+  /** Discretization method. */
+  method: AllowedDiscreteMethods
+  /** Number of groups. */
+  groups: number
+  /** Data sorted in ascending numeric order. */
+  #data: number[]
+  /** Minimum of the data. */
+  #min: number
+  /** Maximum of the data. */
+  #max: number
+  /** Range of the data (max - min). */
+  get #range() { return this.#max - this.#min }
+  /** Number of data points. */
+  get #count() { return this.#data.length }
+  /** Cluster analysis result (raw value => group). */
+  #kmeans?: Map<number, number>
+
+}
diff --git a/src/lib/describe.ts b/src/lib/describe.ts
@@ -0,0 +1,58 @@
+/**
+ * @file 生成描述统计量
+ */
+
+import type { Variable } from './types'
+import { min, max, mean, quantileSeq, std } from 'mathjs'
+import { calculateMode } from './utils'
+
+/** Computes descriptive statistics for every column. */
+export class Describe {
+
+  /**
+   * Computes descriptive statistics for every column.
+   * @param dataCols column descriptors
+   * @param dataRows data rows keyed by column name
+   */
+  constructor(
+    dataCols: Variable[],
+    dataRows: { [key: string]: unknown }[]
+  ) {
+    this.updatedCols = dataCols.map((col) => {
+      // Raw column values, then the subset that is not missing.
+      const values = dataRows.map((row) => row[col.name])
+      const present = values.filter((v) => v !== undefined)
+      // Basic counts.
+      const count = values.length
+      const valid = present.length
+      const missing = count - valid
+      const unique = new Set(present).size
+      // A column is numeric when it has at least one non-missing value and
+      // every non-missing value converts to a number.
+      const nums = present.map((v) => Number(v))
+      const isNumeric = nums.length > 0 && !nums.some((n) => isNaN(n))
+      if (!isNumeric) {
+        const type: '称名或等级数据' | '等距或等比数据' = '称名或等级数据'
+        return { ...col, count, missing, valid, unique, type }
+      }
+      const type: '称名或等级数据' | '等距或等比数据' = '等距或等比数据'
+      return {
+        ...col, count, missing, valid, unique, type,
+        min: min(nums),
+        max: max(nums),
+        mean: mean(nums),
+        std: Number(std(nums)),
+        q1: quantileSeq(nums, 0.25),
+        q2: quantileSeq(nums, 0.5),
+        q3: quantileSeq(nums, 0.75),
+        mode: calculateMode(nums)
+      }
+    })
+    // Rows are passed through untouched.
+    this.updatedRows = dataRows
+  }
+
+  /** Column descriptors with statistics filled in. */
+  updatedCols: Variable[]
+  /** Data rows (same array that was passed in). */
+  updatedRows: { [key: string]: unknown }[]
+
+}
diff --git a/src/lib/discrete.ts b/src/lib/discrete.ts