Global

Members

WCluster

Description:
  • 數據主成份分析與分群

Source:

數據主成份分析與分群

Example
async function testPCA() {

    let mat = [
        [40, 50, 60],
        [50, 70, 60],
        [80, 70, 90],
        [50, 60, 80]
    ]
    console.log('mat', mat)
    // => mat [ [ 40, 50, 60 ], [ 50, 70, 60 ], [ 80, 70, 90 ], [ 50, 60, 80 ] ]

    let resMat = await WCluster.PCA(mat, { nCompNIPALS: 2 })
    console.log(resMat)
    // => [
    //   [ -1.704002697669786, 0.43087564048799354 ],
    //   [ -0.2509699164590232, -1.1519035967558147 ],
    //   [ 1.9913098008410954, 0.23244503334983269 ],
    //   [ -0.03633718671228592, 0.48858292291798827 ]
    // ]

    let mat2 = [
        [1040, 50, 60],
        [1050, 70, 60],
        [1080, 70, 90],
        [1050, 60, 80]
    ]
    console.log('mat2', mat2)
    // => mat [ [ 1040, 50, 60 ], [ 1050, 70, 60 ], [ 1080, 70, 90 ], [ 1050, 60, 80 ] ]

    let resMat2 = await WCluster.PCA(mat2, { nCompNIPALS: 2 })
    console.log(resMat2)
    // => [
    //   [ -1.704002697669786, 0.43087564048799354 ],
    //   [ -0.2509699164590232, -1.1519035967558147 ],
    //   [ 1.9913098008410954, 0.23244503334983269 ],
    //   [ -0.03633718671228592, 0.48858292291798827 ]
    // ]

    let mat3 = [
        [11040, 50, 60],
        [13050, 70, 60],
        [15080, 70, 90],
        [17050, 60, 80]
    ]
    console.log('mat3', mat3)
    // => mat [ [ 11040, 50, 60 ], [ 13050, 70, 60 ], [ 15080, 70, 90 ], [ 17050, 60, 80 ] ]

    let resMat3 = await WCluster.PCA(mat3, { nCompNIPALS: 2 })
    console.log(resMat3)
    // => [
    //   [ -1.8599655569892897, 0.4908764271508211 ],
    //   [ -0.3950941652793108, -1.0977355294143445 ],
    //   [ 1.3430199944041517, -0.17213692267477554 ],
    //   [ 0.912039727864449, 0.778996024938299 ]
    // ]

}
testPCA()
    .catch((err) => {
        console.log(err)
    })

async function testCluster() {
    let mode = 'k-medoids'

    let mat = [
        [40, 50, 60],
        [50, 70, 60],
        [80, 70, 90],
        [50, 60, 80]
    ]
    console.log('mat', mat)
    // => mat [ [ 40, 50, 60 ], [ 50, 70, 60 ], [ 80, 70, 90 ], [ 50, 60, 80 ] ]

    let resMat = await WCluster.cluster(mat, { mode, kNumber: 2, nCompNIPALS: 2 })
    console.log(JSON.stringify(resMat, null, 2))
    // => {
    //   "keys": null,
    //   "ginds": [
    //     [ 0, 1, 3 ],
    //     [ 2 ]
    //   ],
    //   "gmat": [
    //     [
    //       [ -1.704002697669786, 0.43087564048799354 ],
    //       [ -0.2509699164590232, -1.1519035967558147 ],
    //       [ -0.03633718671228592, 0.48858292291798827 ]
    //     ],
    //     [
    //       [ 1.9913098008410954, 0.23244503334983269 ]
    //     ]
    //   ],
    //   "gltdt": [
    //     [
    //       [ 40, 50, 60 ],
    //       [ 50, 70, 60 ],
    //       [ 50, 60, 80 ]
    //     ],
    //     [
    //       [ 80, 70, 90 ]
    //     ]
    //   ]
    // }

    let mat2 = [
        [1040, 50, 60],
        [1050, 70, 60],
        [1080, 70, 90],
        [1050, 60, 80]
    ]
    console.log('mat2', mat2)
    // => mat [ [ 1040, 50, 60 ], [ 1050, 70, 60 ], [ 1080, 70, 90 ], [ 1050, 60, 80 ] ]

    let resMat2 = await WCluster.cluster(mat2, { mode, kNumber: 2, nCompNIPALS: 2 })
    console.log(JSON.stringify(resMat2, null, 2))
    // => {
    //   "keys": null,
    //   "ginds": [
    //     [ 0, 1, 3 ],
    //     [ 2 ]
    //   ],
    //   "gmat": [
    //     [
    //       [ -1.704002697669786, 0.43087564048799354 ],
    //       [ -0.2509699164590232, -1.1519035967558147 ],
    //       [ -0.03633718671228592, 0.48858292291798827 ]
    //     ],
    //     [
    //       [ 1.9913098008410954, 0.23244503334983269 ]
    //     ]
    //   ],
    //   "gltdt": [
    //     [
    //       [ 1040, 50, 60 ],
    //       [ 1050, 70, 60 ],
    //       [ 1050, 60, 80 ]
    //     ],
    //     [
    //       [ 1080, 70, 90 ]
    //     ]
    //   ]
    // }

    let mat3 = [
        [11040, 50, 60],
        [13050, 70, 60],
        [15080, 70, 90],
        [17050, 60, 80]
    ]
    console.log('mat3', mat3)
    // => mat [ [ 11040, 50, 60 ], [ 13050, 70, 60 ], [ 15080, 70, 90 ], [ 17050, 60, 80 ] ]

    let resMat3 = await WCluster.cluster(mat3, { mode, kNumber: 2, nCompNIPALS: 2 })
    console.log(JSON.stringify(resMat3, null, 2))
    // => {
    //   "keys": null,
    //   "ginds": [
    //     [ 1, 2, 3 ],
    //     [ 0 ]
    //   ],
    //   "gmat": [
    //     [
    //       [ -0.3950941652793108, -1.0977355294143445 ],
    //       [ 1.3430199944041517, -0.17213692267477554 ],
    //       [ 0.912039727864449, 0.778996024938299 ]
    //     ],
    //     [
    //       [ -1.8599655569892897, 0.4908764271508211 ]
    //     ]
    //   ],
    //   "gltdt": [
    //     [
    //       [ 13050, 70, 60 ],
    //       [ 15080, 70, 90 ],
    //       [ 17050, 60, 80 ]
    //     ],
    //     [
    //       [ 11040, 50, 60 ]
    //     ]
    //   ]
    // }

    let ltdt = [
        { name: 'Cameron', a: 40, b: 50, c: 60 },
        { name: 'Buckley', a: 50, b: 70, c: 60 },
        { name: 'Paul', a: 80, b: 70, c: 90 },
        { name: 'Fawcett', a: 50, b: 60, c: 80 },
    ]
    console.log('ltdt', ltdt)
    // => ltdt [
    //     { name: 'Cameron', a: 40, b: 50, c: 60 },
    //     { name: 'Buckley', a: 50, b: 70, c: 60 },
    //     { name: 'Paul', a: 80, b: 70, c: 90 },
    //     { name: 'Fawcett', a: 50, b: 60, c: 80 }
    // ]

    let resLtdt = await WCluster.cluster(ltdt, { mode, kNumber: 2, nCompNIPALS: 2 })
    console.log(JSON.stringify(resLtdt, null, 2))
    // => {
    //   "keys": [ "a", "b", "c" ],
    //   "ginds": [
    //     [ 0, 1, 3 ],
    //     [ 2 ]
    //   ],
    //   "gmat": [
    //     [
    //       [ -1.704002697669786, 0.43087564048799354 ],
    //       [ -0.2509699164590232, -1.1519035967558147 ],
    //       [ -0.03633718671228592, 0.48858292291798827 ]
    //     ],
    //     [
    //       [ 1.9913098008410954, 0.23244503334983269 ]
    //     ]
    //   ],
    //   "gltdt": [
    //     [
    //       { "name": "Cameron", "a": 40, "b": 50, "c": 60 },
    //       { "name": "Buckley", "a": 50, "b": 70, "c": 60 },
    //       { "name": "Fawcett", "a": 50, "b": 60, "c": 80 }
    //     ],
    //     [
    //       { "name": "Paul", "a": 80, "b": 70, "c": 90 }
    //     ]
    //   ]
    // }

}
testCluster()
    .catch((err) => {
        console.log(err)
    })

Methods

(async) PCA(data, optopt) → {Promise}

Description:
  • 數據主成分分析(Principal Component Analysis, PCA)

Source:
Example
Parameters:
Name Type Attributes Default Description
data Array

輸入數據陣列

opt Object <optional>
{}

輸入設定物件,預設{}

Properties
Name Type Attributes Default Description
scale Boolean <optional>
true

輸入是否對數據正規化布林值,預設true

nCompNIPALS Number <optional>
2

輸入指定降維的維度整數,不能超過數據的維度(各列有效元素數量),預設2

Returns:

回傳Promise,resolve回傳指定維數數值的陣列,reject回傳錯誤訊息

Type
Promise

alternating(mat, med, maxiter) → {Object}

Description:
  • Run the Alternating algorithm, a k-means-style alternate optimization.

    This is fairly fast (O(n²), like the FasterPAM method), but because the newly chosen medoid must cover the entire existing cluster, it tends to get stuck in worse local optima as the alternatives. Hence, it is not really recommended to use this algorithm (also known as "Alternate" in classic facility location literature, and re-invented by Park and Jun 2009).

Source:
Parameters:
Name Type Description
mat *

a pairwise distance matrix (2D array or adapter object)

med Array.<number>

the list of medoids (mutated in place)

maxiter number

the maximum number of iterations allowed

Returns:
Type
Object

arrayAdapter()

Description:
  • Normalize user input into a matrix object. Idempotent. Accepts a 2D array, DenseMatrix, or LowerTriangle.

Source:

assign_nearest(mat, med, dataArr) → {number}

Description:
  • Assign each point to the nearest medoid, return total loss. Mutates dataArr[i] = index into med of nearest medoid.

Source:
Parameters:
Name Type Description
mat *

already-wrapped matrix

med Array.<number>

medoid point indices

dataArr Array.<number>

assignment array, mutated in place

Returns:

total loss

Type
number

choose_medoid_within_partition()

Description:
  • Choose the best medoid within a partition. Mutates med[m]. Returns [changed, sumb].

Source:

(async) cluster(data, optopt) → {Promise}

Description:
  • 針對數據陣列做分群

Source:
Example
Parameters:
Name Type Attributes Default Description
data Array

輸入數據陣列,可輸入純數據陣列(mat)或物件陣列(ltdt)

opt Object <optional>
{}

輸入設定物件,預設{}

Properties
Name Type Attributes Default Description
kNumber Number <optional>
2

輸入指定分群數整數,不能超過數據的長度,預設2

nCompNIPALS Number <optional>
2

輸入指定降維的維度整數,不能超過數據的維度(各列有效元素數量),預設2

mode String <optional>
'k-medoids'

輸入分群方法字串,可為'k-means'、'k-medoids',k-means受初始隨機群中心影響較大,預設'k-medoids'

usePCA Boolean <optional>
true

輸入分群前是否先對數據做PCA降維布林值,預設true;若使用自訂非歐距離(如Jaccard對二元向量)應設為false,以保留原始數據語意

funDist function <optional>

輸入自訂距離函數,函數接收兩數據陣列(t1,t2)並回傳兩者距離數值,僅於mode為'k-medoids'時生效,未提供時底層採用歐氏(euclidean)距離

useMethod String <optional>
'fasterPAM'

輸入k-medoids分群後端字串,僅於mode為'k-medoids'時生效,'simple'為npm套件'k-medoids'(樸素PAM),'fasterPAM'為內建./src/k-medoids之矩陣式FasterPAM(大數據較快),預設'fasterPAM'

seed Number <optional>

輸入k-means隨機初始化種子整數,僅於mode為'k-means'時生效,給定則分群結果可重現,未提供時為隨機初始化

Returns:

回傳Promise,resolve回傳分群後結果物件,keys代表分群有使用到的欄位,ginds代表分群後的指標陣列,gltdt代表分群後的物件陣列,gmat代表分群後的數據陣列(usePCA為true時為PCA降維後數據,可例如取最左2欄代表x,y繪製二維分佈圖;usePCA為false時為原始數據),reject回傳錯誤訊息

Type
Promise

do_swap()

Description:
  • Perform a single swap. Returns RAW loss.

Source:

do_swap()

Description:
  • Perform a single swap. Returns the RAW summed loss (sum of near.d).

Source:

do_swap_k2()

Description:
  • Perform a single swap, k=2. Returns RAW loss.

Source:

dynmsc(mat, med, mink, maxiter)

Description:
  • Run the DynMSC algorithm.

    We begin with a maximum number of clusters, optimize the Average Medoid Silhouette, then decrease the number of clusters by one, and repeat until we have reached a minimum number of clusters. During this process, we store the solution with the highest AMS to return later.

Source:
Parameters:
Name Type Description
mat object

a pairwise distance matrix

med Array.<number>

the list of medoids

mink number

the minimum number of clusters

maxiter number

the maximum number of iterations allowed returns { loss, assi, nIter, nSwaps, meds, losses }

fastermsc(mat, med, maxiter)

Description:
  • Run the FasterMSC algorithm.

Source:
Parameters:
Name Type Description
mat object

pairwise distance matrix (wrapped)

med Array.<number>

the list of medoids (mutated in place)

maxiter number

maximum number of iterations returns { loss, assi, nIter, nSwaps }

fastermsc_k2()

Description:
  • Special case k=2 of the FasterMSC algorithm. Returns { loss, assi, nIter, nSwaps }.

Source:

fasterpam(mat, med, maxiter)

Description:
  • Run the FasterPAM algorithm.

    If used multiple times, it is better to additionally shuffle the input data, to increase randomness of the solutions found and hence increase the chance of finding a better solution.

Source:
Parameters:
Name Type Description
mat

a pairwise distance matrix

med

the list of medoids (mutated in place)

maxiter

the maximum number of iterations allowed

Returns:

fastmsc(mat, med, maxiter)

Description:
  • Run the FastMSC algorithm, which yields the same results as the original PAMMEDSIL.

    This is faster than PAMMEDSIL, but slower than FasterMSC, and mostly of interest for academic reasons. This is the improved version, which costs O(n^2) per iteration to find the best swap.

Source:
Parameters:
Name Type Description
mat object

a pairwise distance matrix

med Array.<number>

the list of medoids (mutated in place)

maxiter number

the maximum number of iterations allowed returns { loss, assi, nIter, nSwaps }

fastmsc_k2()

Description:
  • Special case k=2 of the FastMSC algorithm.

Source:

fastpam1(mat, med, maxiter) → {Object}

Description:
  • Run the FastPAM1 algorithm, which yields the same results as the original PAM.

    This is faster than PAM, but slower than FasterPAM, and mostly of interest for academic reasons. Quality-wise, FasterPAM is not worse on average, but much faster.

    This is the improved version from the journal version of the paper, which costs O(n²) per iteration to find the best swap.

Source:
Parameters:
Name Type Description
mat object

a pairwise distance matrix (array or wrapped)

med Array.<number>

the list of medoids (mutated in place)

maxiter number

the maximum number of iterations allowed

Returns:
Type
Object

find_best_swap()

Description:
  • Find the best swap for object j - FastMSC version. Returns [change, b].

Source:

find_best_swap()

Description:
  • Find the best swap for object j - FastPAM version. Returns [change, b].

Source:

find_best_swap_k2()

Description:
  • Find the best swap for object j - FastMSC version, k=2. Returns [loss, b].

Source:

find_best_swap_pam()

Description:
  • Find the best swap for object j - slower PAM version. Returns [acc_best, m_best].

Source:

find_best_swap_pammedsil()

Description:
  • Find the best swap for object j. Returns [change, b].

Source:

find_best_swap_pammedsil_k2()

Description:
  • Find the best swap for object j (k=2 variant). Returns [change, b].

Source:

find_max()

Description:
  • Find the maximum (index and value) over an array of numbers.

Source:

find_min()

Description:
  • Find the minimum (index and value) over an array of numbers.

Source:

first_k()

Description:
  • Use the first k objects (0..k-1) as initial medoids.

Source:

initial_assignment()

Description:
  • Perform the initial assignment to medoids. Returns [loss, data] (data = Reco[]).

Source:

initial_assignment()

Description:
  • Perform the initial assignment to medoids. Returns [loss, data] with data = Rec[].

Source:

initial_assignment_k2()

Description:
  • Perform the initial assignment to medoids, for k=2 only. Returns [loss, assi, data] (data = [d0,d1] pairs).

Source:

medoid_silhouette(mat, meds, samples)

Description:
  • Compute the Medoid Silhouette of a clustering.

    The Medoid Silhouette is an approximation to the original Silhouette where the distance to the cluster medoid is used instead of the average distance.

Source:
Parameters:
Name Type Description
mat

a pairwise distance matrix

meds

the medoid list

samples

whether to keep the individual samples, or not

Returns:

pam(mat, k, maxiter)

Description:
  • Run the original PAM algorithm (BUILD and SWAP).

    Provided for academic reasons to see the performance difference.

Source:
Parameters:
Name Type Description
mat

a pairwise distance matrix

k

the number of medoids to pick

maxiter

the maximum number of iterations allowed

Returns:

pam_build(mat, k)

Description:
  • Run the original PAM BUILD algorithm.

    Provided for academic reasons to see the performance difference.

Source:
Parameters:
Name Type Description
mat

a pairwise distance matrix

k

the number of medoids to pick

Returns:

pam_build_initialize()

Description:
  • Not exposed. Use pam_build or pam. Pushes into medsArr (numbers) and dataArr (Rec). Returns loss.

Source:

pam_optimize()

Description:
  • Main optimization function of PAM, not exposed (use pam_swap or pam). Returns [loss, assi, nIter, nSwaps].

Source:

pam_swap(mat, med, maxiter)

Description:
  • Run the original PAM SWAP algorithm (no BUILD, but given initial medoids).

    Provided for academic reasons to see the performance difference.

Source:
Parameters:
Name Type Description
mat

a pairwise distance matrix

med

the list of medoids (mutated in place)

maxiter

the maximum number of iterations allowed

Returns:

pammedsil(mat, k, maxiter)

Description:
  • Run the original PAM BUILD algorithm combined with the PAMMEDSIL SWAP.

Source:
Parameters:
Name Type Description
mat object

pairwise distance matrix

k number

the number of medoids to pick

maxiter number

the maximum number of iterations allowed returns { loss, assi, meds, nIter, nSwaps }

pammedsil_build_initialize()

Description:
  • Not exposed. Use pammedsil_build or pammedsil. Pushes into meds & data. Returns loss.

Source:

pammedsil_optimize()

Description:
  • Main optimization function of PAMMEDSIL, not exposed (use pammedsil_swap or pammedsil)

Source:

pammedsil_swap(mat, med, maxiter)

Description:
  • Run the original PAMMEDSIL SWAP algorithm (no initialization, but given initial medoids).

Source:
Parameters:
Name Type Description
mat object

pairwise distance matrix

med Array.<number>

the list of medoids (mutated in place)

maxiter number

the maximum number of iterations allowed returns { loss, assi, nIter, nSwaps }

pamsil(mat, med, maxiter) → {Object}

Description:
  • Run the original PAMSIL SWAP algorithm (no BUILD, but given initial medoids).

Source:
Parameters:
Name Type Description
mat *

a pairwise distance matrix

med Array.<number>

the list of medoids (mutated in place)

maxiter number

the maximum number of iterations allowed

Returns:
Type
Object

pamsil_build_initialize() → {number}

Description:
  • Not exposed. Use pamsil_build or pamsil. Standard PAM BUILD storing nearest distance per point in a plain number array data.

Source:
Returns:

loss

Type
number

pamsil_optimize()

Description:
  • Main optimization function of PAMSIL, not exposed (use pamsil_swap or pamsil).

Source:
Returns:

pamsil_swap(mat, med, maxiter) → {Object}

Description:
  • Run the original PAMSIL SWAP algorithm (no BUILD, but given initial medoids).

Source:
Parameters:
Name Type Description
mat *

a pairwise distance matrix

med Array.<number>

the list of medoids (mutated in place)

maxiter number

the maximum number of iterations allowed

Returns:
Type
Object

par_fasterpam(mat, med, maxiter, rng)

Description:
  • Run the FasterPAM algorithm (parallel version).

    For small data sets (n<1000) it is usually faster to use the non-parallel version.

Source:
Parameters:
Name Type Description
mat

a pairwise distance matrix

med

the list of medoids (mutated in place)

maxiter

the maximum number of iterations allowed

rng

random number generator for shuffling the input data

Returns:

par_silhouette(mat, assi) → {number}

Description:
  • Compute the Silhouette of a strict partitional clustering (sequential JS port of parallel Rust impl).

Source:
Parameters:
Name Type Description
mat object | Array

pairwise distance matrix (will be wrapped by arrayAdapter)

assi Array.<number>

cluster assignment for each point

Returns:

the average silhouette value

Type
number

rand_fasterpam(mat, med, maxiter, rng)

Description:
  • Run the FasterPAM algorithm with additional randomization.

    This increases the chance of finding a better solution when used multiple times, as it decreases the dependency on the input data order.

Source:
Parameters:
Name Type Description
mat

a pairwise distance matrix

med

the list of medoids (mutated in place)

maxiter

the maximum number of iterations allowed

rng

random number generator for shuffling the input data

Returns:

random_initialization()

Description:
  • Random initialization: k distinct medoid indices in [0, n).

Source:

remove_med()

Description:
  • Remove one medoid. Returns RAW loss.

Source:

sample()

Description:
  • Sample k DISTINCT indices from [0, n) using Floyd's algorithm. rng: () => float in [0,1).

Source:

shuffle()

Description:
  • Fisher-Yates shuffle of [0, n). Used by rand_fasterpam / par_fasterpam.

Source:

silhouette(mat, assi, samples)

Description:
  • Compute the Silhouette of a strict partitional clustering.

    The Silhouette, proposed by Peter Rousseeuw in 1987, is a popular internal evaluation measure for clusterings.

Source:
Parameters:
Name Type Description
mat

a pairwise distance matrix

assi

the cluster assignment

samples

whether to keep the individual samples, or not

Returns:

update_removal_loss()

Description:
  • Update the loss when removing each medoid. Mutates loss in place.

Source:

update_removal_loss()

Description:
  • Update the loss when removing each medoid. Mutates lossArr.

Source:

update_second_nearest()

Description:
  • Update the second nearest medoid information. Called after each swap. Returns a fresh DistancePair.

Source:

update_third_nearest()

Description:
  • Update the third nearest medoid information. Called after each swap. Returns fresh DistancePair.

Source:

update_third_nearest_without_new()

Description:
  • Update the third nearest medoid information. Called after each swap. Returns a fresh DistancePair.

Source: