Skip to content

Commit

Permalink
refactor: 重构变量处理模块, 采用模块化设计而为计算变量等未来功能提供支持
Browse files Browse the repository at this point in the history
  • Loading branch information
LeafYeeXYZ committed Oct 27, 2024
1 parent c8bccca commit 4b62023
Show file tree
Hide file tree
Showing 8 changed files with 519 additions and 418 deletions.
174 changes: 174 additions & 0 deletions src/lib/derive.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
/**
* @file 处理派生变量, 包括标准化, 中心化, 离散化
*/

import type { Variable, AllowedDiscreteMethods } from './types'
import { mean, std, quantileSeq, max, min } from 'mathjs'
import { calculateMode } from './utils'
import { kmeans } from 'ml-kmeans'

/** Builds derived sub-variables (standardized, centered, discretized) from source variables */
export class Derive {

  /**
   * Generates every derived variable requested through a column's `subVars`
   * and writes the derived values onto the rows.
   *
   * Columns already flagged `derived` are never used as a source. The rows are
   * mutated in place. A row whose source value is `undefined` (missing) gets
   * `undefined` in each derived column rather than `NaN`, so missing values
   * stay recognizable as missing.
   *
   * The standardize / center branches read `min`, `max`, `mean`, `std`,
   * `q1`-`q3` and `mode` from the source column, so those must already be set.
   *
   * @param dataCols data columns
   * @param dataRows data rows
   */
  constructor(
    dataCols: Variable[],
    dataRows: { [key: string]: unknown }[]
  ) {
    const derivedCols: Variable[] = []

    /** Writes `transform(value)` into `target` for every row, keeping missing values missing */
    const writeDerived = (source: string, target: string, transform: (value: number) => number) => {
      dataRows.forEach((row) => {
        const raw = row[source]
        row[target] = typeof raw === 'undefined' ? undefined : transform(Number(raw))
      })
    }

    /** Re-applies `transform` to a textual mode, keeping the Pearson-formula marker if present */
    const deriveMode = (mode: string, transform: (value: number) => number) =>
      transform(parseFloat(mode)).toFixed(4) + (/皮尔逊经验公式/.test(mode) ? '(皮尔逊经验公式)' : '')

    dataCols.forEach((col) => {
      if (col.derived) {
        return
      }

      // Standardization: z = (x - mean) / std
      if (col.subVars?.standard) {
        const name = `${col.name}_标准化`
        const center = col.mean!
        const scale = col.std!
        const standardize = (value: number) => (value - center) / scale
        derivedCols.push({
          name,
          derived: true,
          count: col.count,
          missing: col.missing,
          valid: col.valid,
          unique: col.unique,
          type: col.type,
          min: standardize(col.min!),
          max: standardize(col.max!),
          mean: 0,
          q1: standardize(col.q1!),
          q2: standardize(col.q2!),
          q3: standardize(col.q3!),
          std: 1,
          mode: deriveMode(col.mode!, standardize),
        })
        writeDerived(col.name, name, standardize)
      }

      // Centering: x - mean (spread is unchanged, so std is carried over)
      if (col.subVars?.center) {
        const name = `${col.name}_中心化`
        const center = col.mean!
        const centerize = (value: number) => value - center
        derivedCols.push({
          name,
          derived: true,
          count: col.count,
          missing: col.missing,
          valid: col.valid,
          unique: col.unique,
          type: col.type,
          min: centerize(col.min!),
          max: centerize(col.max!),
          mean: 0,
          q1: centerize(col.q1!),
          q2: centerize(col.q2!),
          q3: centerize(col.q3!),
          std: col.std,
          mode: deriveMode(col.mode!, centerize),
        })
        writeDerived(col.name, name, centerize)
      }

      // Discretization: each non-missing value is replaced by a group index
      if (col.subVars?.discrete) {
        const { groups, method } = col.subVars.discrete
        const name = `${col.name}_${method}离散`
        const isPresent = (row: { [key: string]: unknown }) => typeof row[col.name] !== 'undefined'
        const discrete = new Discrete(
          dataRows.filter(isPresent).map((row) => Number(row[col.name])),
          groups,
          method
        )
        const predictedData = dataRows.map((row) => isPresent(row) ? discrete.predictor(Number(row[col.name])) : undefined)
        const predictedNums = predictedData.filter((v): v is number => typeof v !== 'undefined')
        derivedCols.push({
          name,
          derived: true,
          count: col.count,
          missing: col.missing,
          valid: col.valid,
          // Describe what was actually assigned instead of assuming that
          // every index in 0..groups-1 (and nothing else) occurs.
          unique: new Set(predictedNums).size,
          type: col.type,
          min: Number(min(predictedNums)),
          max: Number(max(predictedNums)),
          mean: Number(mean(predictedNums)),
          q1: quantileSeq(predictedNums, 0.25),
          q2: quantileSeq(predictedNums, 0.5),
          q3: quantileSeq(predictedNums, 0.75),
          std: Number(std(predictedNums)),
          mode: calculateMode(predictedNums),
        })
        predictedData.forEach((v, i) => {
          dataRows[i][name] = v
        })
      }
    })

    // Derived columns are listed first, followed by the columns that were passed in
    this.updatedCols = [...derivedCols, ...dataCols]
    this.updatedRows = dataRows
  }

  /** Columns after derivation (derived columns first, then the input columns) */
  updatedCols: Variable[]
  /** Rows after derivation (the same array that was passed in, mutated) */
  updatedRows: { [key: string]: unknown }[]

}

/** Discretizes a numeric variable into a fixed number of groups */
class Discrete {

  /**
   * Prepares a `predictor` that maps a raw value to a group index.
   *
   * - '等宽' (equal width): the range [min, max] is cut into `groups` intervals
   *   of equal width; the maximum falls into the last group.
   * - '等频' (equal frequency): the group is derived from the rank of the value
   *   in the numerically sorted data.
   * - '聚类分析' (cluster analysis): k-means cluster label of the value; only
   *   values present in `data` can be looked up.
   *
   * @param data raw data (missing values already removed)
   * @param groups number of groups
   * @param methed discretization method
   */
  constructor(data: number[], groups: number, methed: AllowedDiscreteMethods) {

    this.method = methed
    this.groups = groups
    // A comparator is required: the default sort compares values as strings,
    // which would order 10 before 9 and break the rank-based binning.
    this.#data = [...data].sort((a, b) => a - b)
    this.#min = min(data)
    this.#max = max(data)
    // Highest valid group index
    const lastGroup = Math.max(groups - 1, 0)
    switch (methed) {
      case '等宽': {
        this.predictor = (value: number | undefined) => {
          if (typeof value === 'undefined') return undefined
          // All observations identical: the interval width is 0, use a single group
          if (this.#range === 0) return 0
          const index = Math.floor((value - this.#min) / (this.#range / this.groups))
          // The maximum would otherwise land on index `groups`; keep indices in [0, groups - 1]
          return Math.min(Math.max(index, 0), lastGroup)
        }
        break
      }
      case '等频': {
        this.predictor = (value: number | undefined) => {
          if (typeof value === 'undefined') return undefined
          const rank = this.#data.findIndex((v) => v >= value)
          // Larger than every observation: belongs to the last group
          if (rank === -1) return lastGroup
          return Math.min(Math.floor(rank / (this.#count / this.groups)), lastGroup)
        }
        break
      }
      case '聚类分析': {
        const { clusters } = kmeans(data.map((v) => [v]), groups, {})
        // clusters[i] is the label assigned to data[i]
        this.#kmeans = new Map(clusters.map((v, i) => [data[i], v]))
        this.predictor = (value: number | undefined) => {
          if (typeof value === 'undefined') return undefined
          return this.#kmeans?.get(value)
        }
        break
      }
    }

  }

  /** Maps a raw value to its group index (`undefined` stays `undefined`) */
  predictor: (data: number | undefined) => number | undefined
  /** Discretization method */
  method: AllowedDiscreteMethods
  /** Number of groups */
  groups: number
  /** Data sorted in ascending numeric order */
  #data: number[]
  /** Smallest observation */
  #min: number
  /** Largest observation */
  #max: number
  /** Range of the data (max - min) */
  get #range() { return this.#max - this.#min }
  /** Number of observations */
  get #count() { return this.#data.length }
  /** Cluster analysis result (raw value => group) */
  #kmeans?: Map<number, number>

}
58 changes: 58 additions & 0 deletions src/lib/describe.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
/**
* @file 生成描述统计量
*/

import type { Variable } from './types'
import { min, max, mean, quantileSeq, std } from 'mathjs'
import { calculateMode } from './utils'

/** Computes descriptive statistics for every variable */
export class Describe {

  /** Columns enriched with descriptive statistics */
  updatedCols: Variable[]
  /** Rows, passed through unchanged */
  updatedRows: { [key: string]: unknown }[]

  /**
   * Computes the descriptive statistics of each column.
   *
   * Every column receives `count`, `missing`, `valid`, `unique` and `type`.
   * A column counts as interval/ratio data when it has at least one
   * non-missing value and all of its non-missing values convert to a number;
   * such a column additionally receives min, max, mean, std, quartiles and mode.
   *
   * @param dataCols data columns
   * @param dataRows data rows
   */
  constructor(
    dataCols: Variable[],
    dataRows: { [key: string]: unknown }[]
  ) {
    this.updatedRows = dataRows
    this.updatedCols = dataCols.map((col) => {
      // Raw values of this column; `undefined` marks a missing value
      const values = dataRows.map((row) => row[col.name])
      const present = values.filter((v) => v !== undefined)

      // Basic counts
      const count = values.length
      const valid = present.length
      const missing = count - valid
      const unique = new Set(present).size

      const isNumeric = present.length > 0 && present.every((v) => !isNaN(Number(v)))
      if (!isNumeric) {
        const type: '称名或等级数据' | '等距或等比数据' = '称名或等级数据'
        return { ...col, count, missing, valid, unique, type }
      }

      const type: '称名或等级数据' | '等距或等比数据' = '等距或等比数据'
      const nums = present.map((v) => Number(v))
      return {
        ...col,
        count,
        missing,
        valid,
        unique,
        type,
        min: min(nums),
        max: max(nums),
        mean: mean(nums),
        std: Number(std(nums)),
        q1: quantileSeq(nums, 0.25),
        q2: quantileSeq(nums, 0.5),
        q3: quantileSeq(nums, 0.75),
        mode: calculateMode(nums)
      }
    })
  }

}
66 changes: 0 additions & 66 deletions src/lib/discrete.ts

This file was deleted.

Loading

0 comments on commit 4b62023

Please sign in to comment.