-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
refactor: 重构变量处理模块, 采用模块化设计而为计算变量等未来功能提供支持
- Loading branch information
1 parent
c8bccca
commit 4b62023
Showing
8 changed files
with
519 additions
and
418 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,174 @@ | ||
/** | ||
* @file 处理派生变量, 包括标准化, 中心化, 离散化 | ||
*/ | ||
|
||
import type { Variable, AllowedDiscreteMethods } from './types' | ||
import { mean, std, quantileSeq, max, min } from 'mathjs' | ||
import { calculateMode } from './utils' | ||
import { kmeans } from 'ml-kmeans' | ||
|
||
/**
 * Builds derived sub-variables (standardized, centered, discretized) for
 * every non-derived column that requests them through `subVars`.
 */
export class Derive {

  /**
   * Generates the derived columns and writes their values into the rows.
   *
   * Rows are mutated in place: each derived variable is stored under a new
   * key on the existing row objects. A cell that is missing (`undefined`) in
   * the source column stays `undefined` in every derived column, so the
   * `missing` / `valid` counts copied from the source column remain accurate.
   *
   * @param dataCols column definitions; columns already flagged `derived` are skipped
   * @param dataRows data rows keyed by column name
   */
  constructor(
    dataCols: Variable[],
    dataRows: { [key: string]: unknown }[]
  ) {
    const derivedCols: Variable[] = []
    dataCols.forEach((col) => {
      if (col.derived) {
        return
      }
      if (col.subVars?.standard) {
        const name = `${col.name}_标准化`
        derivedCols.push({
          name,
          derived: true,
          count: col.count,
          missing: col.missing,
          valid: col.valid,
          unique: col.unique,
          type: col.type,
          // Order statistics are transformed with the same z-score mapping as the values
          min: Number(col.min! - col.mean!) / col.std!,
          max: Number(col.max! - col.mean!) / col.std!,
          mean: 0,
          q1: Number(col.q1! - col.mean!) / col.std!,
          q2: Number(col.q2! - col.mean!) / col.std!,
          q3: Number(col.q3! - col.mean!) / col.std!,
          std: 1,
          // The mode is a string; re-append the marker text when the source mode carries it
          mode: ((parseFloat(col.mode!) - col.mean!) / col.std!).toFixed(4) + (/皮尔逊经验公式/.test(col.mode!) ? '(皮尔逊经验公式)' : ''),
        })
        dataRows.forEach((row) => {
          const raw = row[col.name]
          // Keep missing cells missing instead of turning them into NaN
          row[name] = typeof raw === 'undefined' ? undefined : (Number(raw) - col.mean!) / col.std!
        })
      }
      if (col.subVars?.center) {
        const name = `${col.name}_中心化`
        derivedCols.push({
          name,
          derived: true,
          count: col.count,
          missing: col.missing,
          valid: col.valid,
          unique: col.unique,
          type: col.type,
          min: Number(col.min! - col.mean!),
          max: Number(col.max! - col.mean!),
          mean: 0,
          q1: Number(col.q1! - col.mean!),
          q2: Number(col.q2! - col.mean!),
          q3: Number(col.q3! - col.mean!),
          // Subtracting a constant does not change the spread
          std: col.std,
          mode: (parseFloat(col.mode!) - col.mean!).toFixed(4) + (/皮尔逊经验公式/.test(col.mode!) ? '(皮尔逊经验公式)' : ''),
        })
        dataRows.forEach((row) => {
          const raw = row[col.name]
          // Keep missing cells missing instead of turning them into NaN
          row[name] = typeof raw === 'undefined' ? undefined : Number(raw) - col.mean!
        })
      }
      if (col.subVars?.discrete) {
        const groups = col.subVars.discrete.groups
        const method = col.subVars.discrete.method
        const name = `${col.name}_${method}离散`
        // The discretizer is built from the non-missing values only
        const discrete = new Discrete(
          dataRows.filter((row) => typeof row[col.name] !== 'undefined').map((row) => Number(row[col.name])),
          groups,
          method
        )
        // One prediction per row; missing cells are passed through as undefined
        const predictedData = dataRows.map((row) => discrete.predictor(typeof row[col.name] !== 'undefined' ? Number(row[col.name]) : undefined))
        const predictedNums = predictedData.filter((v) => typeof v !== 'undefined') as number[]
        derivedCols.push({
          name,
          derived: true,
          count: col.count,
          missing: col.missing,
          valid: col.valid,
          // NOTE(review): unique/min/max assume every bin 0..groups-1 is populated — confirm
          unique: groups,
          type: col.type,
          min: 0,
          max: groups - 1,
          mean: Number(mean(predictedNums)),
          q1: quantileSeq(predictedNums, 0.25),
          q2: quantileSeq(predictedNums, 0.5),
          q3: quantileSeq(predictedNums, 0.75),
          std: Number(std(predictedNums)),
          mode: calculateMode(predictedNums),
        })
        predictedData.forEach((v, i) => {
          dataRows[i][name] = v
        })
      }
    })
    // Derived columns are listed before the original ones
    this.updatedCols = [...derivedCols, ...dataCols]
    this.updatedRows = dataRows
  }

  /** Columns after derivation (derived columns first, then the originals) */
  updatedCols: Variable[]
  /** Rows after derivation (the same array and row objects that were passed in) */
  updatedRows: { [key: string]: unknown }[]

}
|
||
/**
 * Discretizes a numeric variable into 0-based integer bins using one of the
 * supported methods (equal width, equal frequency, or k-means clustering).
 */
class Discrete {

  /**
   * Prepares the discretizer from the sample data.
   * @param data raw numeric values used to fit the bins
   * @param groups number of bins
   * @param methed discretization method (spelling kept for interface compatibility)
   * @throws Error when the method is not one of the supported ones
   */
  constructor(data: number[], groups: number, methed: AllowedDiscreteMethods) {

    this.method = methed
    this.groups = groups
    // A numeric comparator is required: the default sort compares elements as
    // strings, which would order [1, 9, 10, 100] as [1, 10, 100, 9].
    this.#data = [...data].sort((a, b) => a - b)
    this.#min = min(data)
    this.#max = max(data)
    switch (methed) {
      case '等宽':
        this.predictor = (value: number | undefined) => {
          if (typeof value === 'undefined') return undefined
          const width = this.#range / this.groups
          // Constant data has zero range; everything belongs to the first bin
          if (width === 0) return 0
          // The maximum value would land on bin `groups`; clamp it into the last bin
          return this.#clamp(Math.floor((value - this.#min) / width))
        }
        break
      case '等频':
        this.predictor = (value: number | undefined) => {
          if (typeof value === 'undefined') return undefined
          // Rank = index of the first sorted sample that is >= value
          const rank = this.#data.findIndex((v) => v >= value)
          // A value above every sample has no such index; treat it as the last rank
          const position = rank === -1 ? this.#count - 1 : rank
          return this.#clamp(Math.floor(position / (this.#count / this.groups)))
        }
        break
      case '聚类分析': {
        // One-dimensional k-means: every value becomes a single-feature point
        const { clusters } = kmeans(data.map((v) => [v]), groups, {})
        // clusters[i] is the cluster assigned to data[i]
        this.#kmeans = new Map(clusters.map((v, i) => [data[i], v]))
        this.predictor = (value: number | undefined) => {
          if (typeof value === 'undefined') return undefined
          // Only values present in the fitted data have a cluster; others yield undefined
          return this.#kmeans?.get(value)
        }
        break
      }
      default:
        throw new Error(`Unsupported discretization method: ${String(methed)}`)
    }

  }

  /** Maps a value to its bin index; `undefined` (missing) maps to `undefined` */
  predictor: (data: number | undefined) => number | undefined
  /** Discretization method */
  method: AllowedDiscreteMethods
  /** Number of bins */
  groups: number
  /** Sample data sorted in ascending numeric order */
  #data: number[]
  /** Smallest sample value */
  #min: number
  /** Largest sample value */
  #max: number
  /** Range of the sample (max - min) */
  get #range() { return this.#max - this.#min }
  /** Number of samples */
  get #count() { return this.#data.length }
  /** k-means result (raw value => cluster index) */
  #kmeans?: Map<number, number>

  /** Restricts a computed bin index to the valid interval [0, groups - 1] */
  #clamp(bin: number): number {
    return Math.min(Math.max(bin, 0), this.groups - 1)
  }

}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
/** | ||
* @file 生成描述统计量 | ||
*/ | ||
|
||
import type { Variable } from './types' | ||
import { min, max, mean, quantileSeq, std } from 'mathjs' | ||
import { calculateMode } from './utils' | ||
|
||
/** Computes descriptive statistics for every variable (column). */
export class Describe {

  /**
   * Enriches each column definition with descriptive statistics.
   *
   * A column is treated as interval/ratio data when it has at least one
   * non-missing cell and every non-missing cell converts to a number;
   * otherwise it is treated as nominal/ordinal data and only the basic
   * counts are reported.
   *
   * @param dataCols column definitions
   * @param dataRows data rows keyed by column name (stored as-is, not modified)
   */
  constructor(
    dataCols: Variable[],
    dataRows: { [key: string]: unknown }[]
  ) {
    const convertsToNumber = (cell: unknown): boolean => !Number.isNaN(Number(cell))

    this.updatedCols = dataCols.map((col): Variable => {
      // Raw cells of this column, then the subset that is not missing
      const cells = dataRows.map((row) => row[col.name])
      const present = cells.filter((cell) => cell !== undefined)

      // Basic counts shared by both variable types
      const count = cells.length
      const valid = present.length
      const missing = count - valid
      const unique = new Set(present).size
      const summary = { ...col, count, missing, valid, unique }

      // Guard: empty or non-numeric columns only get the basic counts
      if (present.length === 0 || !present.every(convertsToNumber)) {
        return { ...summary, type: '称名或等级数据' }
      }

      const nums = present.map((cell) => Number(cell))
      return {
        ...summary,
        type: '等距或等比数据',
        min: min(nums),
        max: max(nums),
        mean: mean(nums),
        std: Number(std(nums)),
        q1: quantileSeq(nums, 0.25),
        q2: quantileSeq(nums, 0.5),
        q3: quantileSeq(nums, 0.75),
        mode: calculateMode(nums)
      }
    })
    this.updatedRows = dataRows
  }

  /** Columns with their descriptive statistics filled in */
  updatedCols: Variable[]
  /** Rows, identical to the ones passed to the constructor */
  updatedRows: { [key: string]: unknown }[]

}
This file was deleted.
Oops, something went wrong.
Oops, something went wrong.