Members
WCluster
- Description:
數據主成份分析與分群
- Source:
數據主成份分析與分群
Example
async function testPCA() {
let mat = [
[40, 50, 60],
[50, 70, 60],
[80, 70, 90],
[50, 60, 80]
]
console.log('mat', mat)
// => mat [ [ 40, 50, 60 ], [ 50, 70, 60 ], [ 80, 70, 90 ], [ 50, 60, 80 ] ]
let resMat = await WCluster.PCA(mat, { nCompNIPALS: 2 })
console.log(resMat)
// => [
// [ -1.704002697669786, 0.43087564048799354 ],
// [ -0.2509699164590232, -1.1519035967558147 ],
// [ 1.9913098008410954, 0.23244503334983269 ],
// [ -0.03633718671228592, 0.48858292291798827 ]
// ]
let mat2 = [
[1040, 50, 60],
[1050, 70, 60],
[1080, 70, 90],
[1050, 60, 80]
]
console.log('mat2', mat2)
// => mat [ [ 1040, 50, 60 ], [ 1050, 70, 60 ], [ 1080, 70, 90 ], [ 1050, 60, 80 ] ]
let resMat2 = await WCluster.PCA(mat2, { nCompNIPALS: 2 })
console.log(resMat2)
// => [
// [ -1.704002697669786, 0.43087564048799354 ],
// [ -0.2509699164590232, -1.1519035967558147 ],
// [ 1.9913098008410954, 0.23244503334983269 ],
// [ -0.03633718671228592, 0.48858292291798827 ]
// ]
let mat3 = [
[11040, 50, 60],
[13050, 70, 60],
[15080, 70, 90],
[17050, 60, 80]
]
console.log('mat3', mat3)
// => mat [ [ 11040, 50, 60 ], [ 13050, 70, 60 ], [ 15080, 70, 90 ], [ 17050, 60, 80 ] ]
let resMat3 = await WCluster.PCA(mat3, { nCompNIPALS: 2 })
console.log(resMat3)
// => [
// [ -1.8599655569892897, 0.4908764271508211 ],
// [ -0.3950941652793108, -1.0977355294143445 ],
// [ 1.3430199944041517, -0.17213692267477554 ],
// [ 0.912039727864449, 0.778996024938299 ]
// ]
}
testPCA()
.catch((err) => {
console.log(err)
})
async function testCluster() {
let mode = 'k-medoids'
let mat = [
[40, 50, 60],
[50, 70, 60],
[80, 70, 90],
[50, 60, 80]
]
console.log('mat', mat)
// => mat [ [ 40, 50, 60 ], [ 50, 70, 60 ], [ 80, 70, 90 ], [ 50, 60, 80 ] ]
let resMat = await WCluster.cluster(mat, { mode, kNumber: 2, nCompNIPALS: 2 })
console.log(JSON.stringify(resMat, null, 2))
// => {
// "keys": null,
// "ginds": [
// [ 0, 1, 3 ],
// [ 2 ]
// ],
// "gmat": [
// [
// [ -1.704002697669786, 0.43087564048799354 ],
// [ -0.2509699164590232, -1.1519035967558147 ],
// [ -0.03633718671228592, 0.48858292291798827 ]
// ],
// [
// [ 1.9913098008410954, 0.23244503334983269 ]
// ]
// ],
// "gltdt": [
// [
// [ 40, 50, 60 ],
// [ 50, 70, 60 ],
// [ 50, 60, 80 ]
// ],
// [
// [ 80, 70, 90 ]
// ]
// ]
// }
let mat2 = [
[1040, 50, 60],
[1050, 70, 60],
[1080, 70, 90],
[1050, 60, 80]
]
console.log('mat2', mat2)
// => mat [ [ 1040, 50, 60 ], [ 1050, 70, 60 ], [ 1080, 70, 90 ], [ 1050, 60, 80 ] ]
let resMat2 = await WCluster.cluster(mat2, { mode, kNumber: 2, nCompNIPALS: 2 })
console.log(JSON.stringify(resMat2, null, 2))
// => {
// "keys": null,
// "ginds": [
// [ 0, 1, 3 ],
// [ 2 ]
// ],
// "gmat": [
// [
// [ -1.704002697669786, 0.43087564048799354 ],
// [ -0.2509699164590232, -1.1519035967558147 ],
// [ -0.03633718671228592, 0.48858292291798827 ]
// ],
// [
// [ 1.9913098008410954, 0.23244503334983269 ]
// ]
// ],
// "gltdt": [
// [
// [ 1040, 50, 60 ],
// [ 1050, 70, 60 ],
// [ 1050, 60, 80 ]
// ],
// [
// [ 1080, 70, 90 ]
// ]
// ]
// }
let mat3 = [
[11040, 50, 60],
[13050, 70, 60],
[15080, 70, 90],
[17050, 60, 80]
]
console.log('mat3', mat3)
// => mat [ [ 11040, 50, 60 ], [ 13050, 70, 60 ], [ 15080, 70, 90 ], [ 17050, 60, 80 ] ]
let resMat3 = await WCluster.cluster(mat3, { mode, kNumber: 2, nCompNIPALS: 2 })
console.log(JSON.stringify(resMat3, null, 2))
// => {
// "keys": null,
// "ginds": [
// [ 1, 2, 3 ],
// [ 0 ]
// ],
// "gmat": [
// [
// [ -0.3950941652793108, -1.0977355294143445 ],
// [ 1.3430199944041517, -0.17213692267477554 ],
// [ 0.912039727864449, 0.778996024938299 ]
// ],
// [
// [ -1.8599655569892897, 0.4908764271508211 ]
// ]
// ],
// "gltdt": [
// [
// [ 13050, 70, 60 ],
// [ 15080, 70, 90 ],
// [ 17050, 60, 80 ]
// ],
// [
// [ 11040, 50, 60 ]
// ]
// ]
// }
let ltdt = [
{ name: 'Cameron', a: 40, b: 50, c: 60 },
{ name: 'Buckley', a: 50, b: 70, c: 60 },
{ name: 'Paul', a: 80, b: 70, c: 90 },
{ name: 'Fawcett', a: 50, b: 60, c: 80 },
]
console.log('ltdt', ltdt)
// => ltdt [
// { name: 'Cameron', a: 40, b: 50, c: 60 },
// { name: 'Buckley', a: 50, b: 70, c: 60 },
// { name: 'Paul', a: 80, b: 70, c: 90 },
// { name: 'Fawcett', a: 50, b: 60, c: 80 }
// ]
let resLtdt = await WCluster.cluster(ltdt, { mode, kNumber: 2, nCompNIPALS: 2 })
console.log(JSON.stringify(resLtdt, null, 2))
// => {
// "keys": [ "a", "b", "c" ],
// "ginds": [
// [ 0, 1, 3 ],
// [ 2 ]
// ],
// "gmat": [
// [
// [ -1.704002697669786, 0.43087564048799354 ],
// [ -0.2509699164590232, -1.1519035967558147 ],
// [ -0.03633718671228592, 0.48858292291798827 ]
// ],
// [
// [ 1.9913098008410954, 0.23244503334983269 ]
// ]
// ],
// "gltdt": [
// [
// { "name": "Cameron", "a": 40, "b": 50, "c": 60 },
// { "name": "Buckley", "a": 50, "b": 70, "c": 60 },
// { "name": "Fawcett", "a": 50, "b": 60, "c": 80 }
// ],
// [
// { "name": "Paul", "a": 80, "b": 70, "c": 90 }
// ]
// ]
// }
}
testCluster()
.catch((err) => {
console.log(err)
})
Methods
(async) PCA(data, optopt) → {Promise}
- Description:
數據主成分分析(Principal Component Analysis, PCA)
- Source:
Example
Parameters:
| Name | Type | Attributes | Default | Description | |||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
data |
Array | 輸入數據陣列 |
|||||||||||||||||
opt |
Object |
<optional> |
{}
|
輸入設定物件,預設{} Properties
|
Returns:
回傳Promise,resolve回傳指定維數數值的陣列,reject回傳錯誤訊息
- Type
- Promise
alternating(mat, med, maxiter) → {Object}
- Description:
Run the Alternating algorithm, a k-means-style alternate optimization.
This is fairly fast (O(n²), like the FasterPAM method), but because the newly chosen medoid must cover the entire existing cluster, it tends to get stuck in worse local optima as the alternatives. Hence, it is not really recommended to use this algorithm (also known as "Alternate" in classic facility location literature, and re-invented by Park and Jun 2009).
- Source:
Parameters:
| Name | Type | Description |
|---|---|---|
mat |
* | a pairwise distance matrix (2D array or adapter object) |
med |
Array.<number> | the list of medoids (mutated in place) |
maxiter |
number | the maximum number of iterations allowed |
Returns:
- Type
- Object
arrayAdapter()
- Description:
Normalize user input into a matrix object. Idempotent. Accepts a 2D array, DenseMatrix, or LowerTriangle.
- Source:
assign_nearest(mat, med, dataArr) → {number}
- Description:
Assign each point to the nearest medoid, return total loss. Mutates dataArr[i] = index into med of nearest medoid.
- Source:
Parameters:
| Name | Type | Description |
|---|---|---|
mat |
* | already-wrapped matrix |
med |
Array.<number> | medoid point indices |
dataArr |
Array.<number> | assignment array, mutated in place |
Returns:
total loss
- Type
- number
choose_medoid_within_partition()
- Description:
Choose the best medoid within a partition. Mutates med[m]. Returns [changed, sumb].
- Source:
(async) cluster(data, optopt) → {Promise}
- Description:
針對數據陣列做分群
- Source:
Example
Parameters:
| Name | Type | Attributes | Default | Description | ||||||||||||||||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
data |
Array | 輸入數據陣列,可輸入純數據陣列(mat)或物件陣列(ltdt) |
||||||||||||||||||||||||||||||||||||||||||
opt |
Object |
<optional> |
{}
|
輸入設定物件,預設{} Properties
|
Returns:
回傳Promise,resolve回傳分群後結果物件,keys代表分群有使用到的欄位,ginds代表分群後的指標陣列,gltdt代表分群後的物件陣列,gmat代表分群後的數據陣列(usePCA為true時為PCA降維後數據,可例如取最左2欄代表x,y繪製二維分佈圖;usePCA為false時為原始數據),reject回傳錯誤訊息
- Type
- Promise
do_swap()
- Description:
Perform a single swap. Returns RAW loss.
- Source:
do_swap()
- Description:
Perform a single swap. Returns the RAW summed loss (sum of near.d).
- Source:
do_swap_k2()
- Description:
Perform a single swap, k=2. Returns RAW loss.
- Source:
dynmsc(mat, med, mink, maxiter)
- Description:
Run the DynMSC algorithm.
We begin with a maximum number of clusters, optimize the Average Medoid Silhouette, then decrease the number of clusters by one, and repeat until we have reached a minimum number of clusters. During this process, we store the solution with the highest AMS to return later.
- Source:
Parameters:
| Name | Type | Description |
|---|---|---|
mat |
object | a pairwise distance matrix |
med |
Array.<number> | the list of medoids |
mink |
number | the minimum number of clusters |
maxiter |
number | the maximum number of iterations allowed returns { loss, assi, nIter, nSwaps, meds, losses } |
fastermsc(mat, med, maxiter)
- Description:
Run the FasterMSC algorithm.
- Source:
Parameters:
| Name | Type | Description |
|---|---|---|
mat |
object | pairwise distance matrix (wrapped) |
med |
Array.<number> | the list of medoids (mutated in place) |
maxiter |
number | maximum number of iterations returns { loss, assi, nIter, nSwaps } |
fastermsc_k2()
- Description:
Special case k=2 of the FasterMSC algorithm. Returns { loss, assi, nIter, nSwaps }.
- Source:
fasterpam(mat, med, maxiter)
- Description:
Run the FasterPAM algorithm.
If used multiple times, it is better to additionally shuffle the input data, to increase randomness of the solutions found and hence increase the chance of finding a better solution.
- Source:
Parameters:
| Name | Type | Description |
|---|---|---|
mat |
a pairwise distance matrix |
|
med |
the list of medoids (mutated in place) |
|
maxiter |
the maximum number of iterations allowed |
Returns:
fastmsc(mat, med, maxiter)
- Description:
Run the FastMSC algorithm, which yields the same results as the original PAMMEDSIL.
This is faster than PAMMEDSIL, but slower than FasterMSC, and mostly of interest for academic reasons. This is the improved version, which costs O(n^2) per iteration to find the best swap.
- Source:
Parameters:
| Name | Type | Description |
|---|---|---|
mat |
object | a pairwise distance matrix |
med |
Array.<number> | the list of medoids (mutated in place) |
maxiter |
number | the maximum number of iterations allowed returns { loss, assi, nIter, nSwaps } |
fastmsc_k2()
- Description:
Special case k=2 of the FastMSC algorithm.
- Source:
fastpam1(mat, med, maxiter) → {Object}
- Description:
Run the FastPAM1 algorithm, which yields the same results as the original PAM.
This is faster than PAM, but slower than FasterPAM, and mostly of interest for academic reasons. Quality-wise, FasterPAM is not worse on average, but much faster.
This is the improved version from the journal version of the paper, which costs O(n²) per iteration to find the best swap.
- Source:
Parameters:
| Name | Type | Description |
|---|---|---|
mat |
object | a pairwise distance matrix (array or wrapped) |
med |
Array.<number> | the list of medoids (mutated in place) |
maxiter |
number | the maximum number of iterations allowed |
Returns:
- Type
- Object
find_best_swap()
- Description:
Find the best swap for object j - FastMSC version. Returns [change, b].
- Source:
find_best_swap()
- Description:
Find the best swap for object j - FastPAM version. Returns [change, b].
- Source:
find_best_swap_k2()
- Description:
Find the best swap for object j - FastMSC version, k=2. Returns [loss, b].
- Source:
find_best_swap_pam()
- Description:
Find the best swap for object j - slower PAM version. Returns [acc_best, m_best].
- Source:
find_best_swap_pammedsil()
- Description:
Find the best swap for object j. Returns [change, b].
- Source:
find_best_swap_pammedsil_k2()
- Description:
Find the best swap for object j (k=2 variant). Returns [change, b].
- Source:
find_max()
- Description:
Find the maximum (index and value) over an array of numbers.
- Source:
find_min()
- Description:
Find the minimum (index and value) over an array of numbers.
- Source:
first_k()
- Description:
Use the first k objects (0..k-1) as initial medoids.
- Source:
initial_assignment()
- Description:
Perform the initial assignment to medoids. Returns [loss, data] (data = Reco[]).
- Source:
initial_assignment()
- Description:
Perform the initial assignment to medoids. Returns [loss, data] with data = Rec[].
- Source:
initial_assignment_k2()
- Description:
Perform the initial assignment to medoids, for k=2 only. Returns [loss, assi, data] (data = [d0,d1] pairs).
- Source:
medoid_silhouette(mat, meds, samples)
- Description:
Compute the Medoid Silhouette of a clustering.
The Medoid Silhouette is an approximation to the original Silhouette where the distance to the cluster medoid is used instead of the average distance.
- Source:
Parameters:
| Name | Type | Description |
|---|---|---|
mat |
a pairwise distance matrix |
|
meds |
the medoid list |
|
samples |
whether to keep the individual samples, or not |
Returns:
pam(mat, k, maxiter)
- Description:
Run the original PAM algorithm (BUILD and SWAP).
Provided for academic reasons to see the performance difference.
- Source:
Parameters:
| Name | Type | Description |
|---|---|---|
mat |
a pairwise distance matrix |
|
k |
the number of medoids to pick |
|
maxiter |
the maximum number of iterations allowed |
Returns:
pam_build(mat, k)
- Description:
Run the original PAM BUILD algorithm.
Provided for academic reasons to see the performance difference.
- Source:
Parameters:
| Name | Type | Description |
|---|---|---|
mat |
a pairwise distance matrix |
|
k |
the number of medoids to pick |
Returns:
pam_build_initialize()
- Description:
Not exposed. Use pam_build or pam. Pushes into medsArr (numbers) and dataArr (Rec). Returns loss.
- Source:
pam_optimize()
- Description:
Main optimization function of PAM, not exposed (use pam_swap or pam). Returns [loss, assi, nIter, nSwaps].
- Source:
pam_swap(mat, med, maxiter)
- Description:
Run the original PAM SWAP algorithm (no BUILD, but given initial medoids).
Provided for academic reasons to see the performance difference.
- Source:
Parameters:
| Name | Type | Description |
|---|---|---|
mat |
a pairwise distance matrix |
|
med |
the list of medoids (mutated in place) |
|
maxiter |
the maximum number of iterations allowed |
Returns:
pammedsil(mat, k, maxiter)
- Description:
Run the original PAM BUILD algorithm combined with the PAMMEDSIL SWAP.
- Source:
Parameters:
| Name | Type | Description |
|---|---|---|
mat |
object | pairwise distance matrix |
k |
number | the number of medoids to pick |
maxiter |
number | the maximum number of iterations allowed returns { loss, assi, meds, nIter, nSwaps } |
pammedsil_build_initialize()
- Description:
Not exposed. Use pammedsil_build or pammedsil. Pushes into meds & data. Returns loss.
- Source:
pammedsil_optimize()
- Description:
Main optimization function of PAMMEDSIL, not exposed (use pammedsil_swap or pammedsil)
- Source:
pammedsil_swap(mat, med, maxiter)
- Description:
Run the original PAMMEDSIL SWAP algorithm (no initialization, but given initial medoids).
- Source:
Parameters:
| Name | Type | Description |
|---|---|---|
mat |
object | pairwise distance matrix |
med |
Array.<number> | the list of medoids (mutated in place) |
maxiter |
number | the maximum number of iterations allowed returns { loss, assi, nIter, nSwaps } |
pamsil(mat, med, maxiter) → {Object}
- Description:
Run the original PAMSIL SWAP algorithm (no BUILD, but given initial medoids).
- Source:
Parameters:
| Name | Type | Description |
|---|---|---|
mat |
* | a pairwise distance matrix |
med |
Array.<number> | the list of medoids (mutated in place) |
maxiter |
number | the maximum number of iterations allowed |
Returns:
- Type
- Object
pamsil_build_initialize() → {number}
- Description:
Not exposed. Use pamsil_build or pamsil. Standard PAM BUILD storing nearest distance per point in a plain number array
data.
- Source:
Returns:
loss
- Type
- number
pamsil_optimize()
- Description:
Main optimization function of PAMSIL, not exposed (use pamsil_swap or pamsil).
- Source:
Returns:
pamsil_swap(mat, med, maxiter) → {Object}
- Description:
Run the original PAMSIL SWAP algorithm (no BUILD, but given initial medoids).
- Source:
Parameters:
| Name | Type | Description |
|---|---|---|
mat |
* | a pairwise distance matrix |
med |
Array.<number> | the list of medoids (mutated in place) |
maxiter |
number | the maximum number of iterations allowed |
Returns:
- Type
- Object
par_fasterpam(mat, med, maxiter, rng)
- Description:
Run the FasterPAM algorithm (parallel version).
For small data sets (n<1000) it is usually faster to use the non-parallel version.
- Source:
Parameters:
| Name | Type | Description |
|---|---|---|
mat |
a pairwise distance matrix |
|
med |
the list of medoids (mutated in place) |
|
maxiter |
the maximum number of iterations allowed |
|
rng |
random number generator for shuffling the input data |
Returns:
par_silhouette(mat, assi) → {number}
- Description:
Compute the Silhouette of a strict partitional clustering (sequential JS port of parallel Rust impl).
- Source:
Parameters:
| Name | Type | Description |
|---|---|---|
mat |
object | Array | pairwise distance matrix (will be wrapped by arrayAdapter) |
assi |
Array.<number> | cluster assignment for each point |
Returns:
the average silhouette value
- Type
- number
rand_fasterpam(mat, med, maxiter, rng)
- Description:
Run the FasterPAM algorithm with additional randomization.
This increases the chance of finding a better solution when used multiple times, as it decreases the dependency on the input data order.
- Source:
Parameters:
| Name | Type | Description |
|---|---|---|
mat |
a pairwise distance matrix |
|
med |
the list of medoids (mutated in place) |
|
maxiter |
the maximum number of iterations allowed |
|
rng |
random number generator for shuffling the input data |
Returns:
random_initialization()
- Description:
Random initialization: k distinct medoid indices in [0, n).
- Source:
remove_med()
- Description:
Remove one medoid. Returns RAW loss.
- Source:
sample()
- Description:
Sample k DISTINCT indices from [0, n) using Floyd's algorithm. rng: () => float in [0,1).
- Source:
shuffle()
- Description:
Fisher-Yates shuffle of [0, n). Used by rand_fasterpam / par_fasterpam.
- Source:
silhouette(mat, assi, samples)
- Description:
Compute the Silhouette of a strict partitional clustering.
The Silhouette, proposed by Peter Rousseeuw in 1987, is a popular internal evaluation measure for clusterings.
- Source:
Parameters:
| Name | Type | Description |
|---|---|---|
mat |
a pairwise distance matrix |
|
assi |
the cluster assignment |
|
samples |
whether to keep the individual samples, or not |
Returns:
update_removal_loss()
- Description:
Update the loss when removing each medoid. Mutates loss in place.
- Source:
update_removal_loss()
- Description:
Update the loss when removing each medoid. Mutates lossArr.
- Source:
update_second_nearest()
- Description:
Update the second nearest medoid information. Called after each swap. Returns a fresh DistancePair.
- Source:
update_third_nearest()
- Description:
Update the third nearest medoid information. Called after each swap. Returns fresh DistancePair.
- Source:
update_third_nearest_without_new()
- Description:
Update the third nearest medoid information. Called after each swap. Returns a fresh DistancePair.
- Source: