Day 51: kNN Classifier
The original kNN paper is: T. Cover and P. Hart. Nearest neighbor pattern classification. IEEE Transactions on Information Theory, IT-13, pages 21–27, 1967.
- Simple. There is no training process, which is why it is also called lazy learning. It is like an open-book exam: the answer is looked up in the existing data.
- Fundamental. Finding similar things is exactly how humans recognize the world; it is hard-wired into the genes of humans and other animals. Of course, it can backfire: people have mistaken a neighbor's alocasia for taro and been poisoned after stealing a bite.
- Effective. Never underestimate kNN: on many datasets it is hard to design an algorithm that beats it.
- Adaptable. It can be used for both classification and regression, and on all kinds of data.
- Extensible. Designing different distance measures can yield surprisingly good results.
- The data usually needs to be normalized; a minimal sketch is given right after this list.
- High complexity. This is kNN's most important drawback. For each test instance the complexity is $O((m+k)n)$, where $n$ is the number of training instances, $m$ is the number of conditional attributes, and $k$ is the number of neighbors. See computeNearests() in the code.
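On the normalization point above, here is a minimal min-max scaling sketch. It assumes the data lives in a weka.core.Instances field named dataset, as in the KnnClassification class below; the helper normalize() itself is a hypothetical addition, not part of the original code.

```java
// A minimal sketch of min-max normalization for the conditional attributes.
// Hypothetical helper, assuming a field "Instances dataset" as below.
public void normalize() {
	for (int j = 0; j < dataset.numAttributes() - 1; j++) {
		// Find the range of attribute j.
		double tempMin = Double.MAX_VALUE;
		double tempMax = -Double.MAX_VALUE;
		for (int i = 0; i < dataset.numInstances(); i++) {
			double tempValue = dataset.instance(i).value(j);
			if (tempValue < tempMin) {
				tempMin = tempValue;
			} // Of if
			if (tempValue > tempMax) {
				tempMax = tempValue;
			} // Of if
		} // Of for i

		// Rescale attribute j to [0, 1]. Skip constant attributes.
		if (tempMax > tempMin) {
			for (int i = 0; i < dataset.numInstances(); i++) {
				dataset.instance(i).setValue(j,
						(dataset.instance(i).value(j) - tempMin) / (tempMax - tempMin));
			} // Of for i
		} // Of if
	} // Of for j
}// Of normalize
```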
Code notes:
1. Two distance measures.
2. Random splitting of the data.
3. Flexible use of indirection: trainingSet and testingSet are integer arrays storing indices.
4. Reading an arff file; requires the weka.jar package.
5. Computing neighbors.
6. Voting.
```java
package machinelearning.knn;

import java.io.FileReader;
import java.util.Arrays;
import java.util.Random;

import weka.core.*;

/**
 * kNN classification.
 * 
 * @author Fan Min.
 */
public class KnnClassification {

	/** Manhattan distance. */
	public static final int MANHATTAN = 0;

	/** Euclidean distance. */
	public static final int EUCLIDEAN = 1;

	/** The distance measure. */
	public int distanceMeasure = EUCLIDEAN;

	/** A random instance. */
	public static final Random random = new Random();

	/** The number of neighbors. */
	int numNeighbors = 7;

	/** The whole dataset. */
	Instances dataset;

	/** The training set. Represented by the indices of the data. */
	int[] trainingSet;

	/** The testing set. Represented by the indices of the data. */
	int[] testingSet;

	/** The predictions. */
	int[] predictions;

	/**
	 * The first constructor.
	 * 
	 * @param paraFilename The arff filename.
	 */
	public KnnClassification(String paraFilename) {
		try {
			FileReader fileReader = new FileReader(paraFilename);
			dataset = new Instances(fileReader);
			// The last attribute is the decision class.
			dataset.setClassIndex(dataset.numAttributes() - 1);
			fileReader.close();
		} catch (Exception ee) {
			System.out.println("Error occurred while trying to read '" + paraFilename
					+ "' in KnnClassification constructor.\r\n" + ee);
			System.exit(0);
		} // Of try
	}// Of the first constructor

	/**
	 * Get random indices for data randomization.
	 * 
	 * @param paraLength The length of the sequence.
	 * @return An array of indices, e.g., {4, 3, 1, 5, 0, 2} with length 6.
	 */
	public static int[] getRandomIndices(int paraLength) {
		int[] resultIndices = new int[paraLength];

		// Step 1. Initialize.
		for (int i = 0; i < paraLength; i++) {
			resultIndices[i] = i;
		} // Of for i

		// Step 2. Randomly swap.
		int tempFirst, tempSecond, tempValue;
		for (int i = 0; i < paraLength; i++) {
			// Generate two random indices.
			tempFirst = random.nextInt(paraLength);
			tempSecond = random.nextInt(paraLength);

			// Swap.
			tempValue = resultIndices[tempFirst];
			resultIndices[tempFirst] = resultIndices[tempSecond];
			resultIndices[tempSecond] = tempValue;
		} // Of for i

		return resultIndices;
	}// Of getRandomIndices

	/**
	 * Split the data into training and testing parts.
	 * 
	 * @param paraTrainingFraction The fraction of the training set.
	 */
	public void splitTrainingTesting(double paraTrainingFraction) {
		int tempSize = dataset.numInstances();
		int[] tempIndices = getRandomIndices(tempSize);
		int tempTrainingSize = (int) (tempSize * paraTrainingFraction);

		trainingSet = new int[tempTrainingSize];
		testingSet = new int[tempSize - tempTrainingSize];

		for (int i = 0; i < tempTrainingSize; i++) {
			trainingSet[i] = tempIndices[i];
		} // Of for i

		for (int i = 0; i < tempSize - tempTrainingSize; i++) {
			testingSet[i] = tempIndices[tempTrainingSize + i];
		} // Of for i
	}// Of splitTrainingTesting

	/**
	 * Predict for the whole testing set. The results are stored in predictions.
	 * 
	 * @see predictions
	 */
	public void predict() {
		predictions = new int[testingSet.length];
		for (int i = 0; i < predictions.length; i++) {
			predictions[i] = predict(testingSet[i]);
		} // Of for i
	}// Of predict

	/**
	 * Predict for the given instance.
	 * 
	 * @param paraIndex The index of the given instance.
	 * @return The prediction.
	 */
	public int predict(int paraIndex) {
		int[] tempNeighbors = computeNearests(paraIndex);
		int resultPrediction = simpleVoting(tempNeighbors);

		return resultPrediction;
	}// Of predict

	/**
	 * The distance between two instances.
	 * 
	 * @param paraI The index of the first instance.
	 * @param paraJ The index of the second instance.
	 * @return The distance.
	 */
	public double distance(int paraI, int paraJ) {
		double resultDistance = 0;
		double tempDifference;
		switch (distanceMeasure) {
		case MANHATTAN:
			for (int i = 0; i < dataset.numAttributes() - 1; i++) {
				tempDifference = dataset.instance(paraI).value(i) - dataset.instance(paraJ).value(i);
				if (tempDifference < 0) {
					resultDistance -= tempDifference;
				} else {
					resultDistance += tempDifference;
				} // Of if
			} // Of for i
			break;

		case EUCLIDEAN:
			for (int i = 0; i < dataset.numAttributes() - 1; i++) {
				tempDifference = dataset.instance(paraI).value(i) - dataset.instance(paraJ).value(i);
				resultDistance += tempDifference * tempDifference;
			} // Of for i
			break;
		default:
			System.out.println("Unsupported distance measure: " + distanceMeasure);
		}// Of switch

		return resultDistance;
	}// Of distance

	/**
	 * Get the accuracy of the classifier.
	 * 
	 * @return The accuracy.
	 */
	public double getAccuracy() {
		// A double divided by an int gives another double.
		double tempCorrect = 0;
		for (int i = 0; i < predictions.length; i++) {
			if (predictions[i] == dataset.instance(testingSet[i]).classValue()) {
				tempCorrect++;
			} // Of if
		} // Of for i

		return tempCorrect / testingSet.length;
	}// Of getAccuracy

	/**
	 * Compute the nearest k neighbors. Select one neighbor in each scan. In
	 * fact we can scan only once. You may implement it by yourself.
	 * 
	 * @param paraCurrent Current instance. We are comparing it with all others.
	 * @return The indices of the nearest instances.
	 */
	public int[] computeNearests(int paraCurrent) {
		int[] resultNearests = new int[numNeighbors];
		boolean[] tempSelected = new boolean[trainingSet.length];
		double tempMinimalDistance;
		int tempMinimalIndex = 0;

		// Compute all distances to avoid redundant computation.
		double[] tempDistances = new double[trainingSet.length];
		for (int i = 0; i < trainingSet.length; i++) {
			tempDistances[i] = distance(paraCurrent, trainingSet[i]);
		} // Of for i

		// Select the nearest numNeighbors indices.
		for (int i = 0; i < numNeighbors; i++) {
			tempMinimalDistance = Double.MAX_VALUE;

			for (int j = 0; j < trainingSet.length; j++) {
				if (tempSelected[j]) {
					continue;
				} // Of if

				if (tempDistances[j] < tempMinimalDistance) {
					tempMinimalDistance = tempDistances[j];
					tempMinimalIndex = j;
				} // Of if
			} // Of for j

			resultNearests[i] = trainingSet[tempMinimalIndex];
			tempSelected[tempMinimalIndex] = true;
		} // Of for i

		System.out.println("The nearest of " + paraCurrent + " are: " + Arrays.toString(resultNearests));
		return resultNearests;
	}// Of computeNearests

	/**
	 * Voting using the instances.
	 * 
	 * @param paraNeighbors The indices of the neighbors.
	 * @return The predicted label.
	 */
	public int simpleVoting(int[] paraNeighbors) {
		int[] tempVotes = new int[dataset.numClasses()];
		for (int i = 0; i < paraNeighbors.length; i++) {
			tempVotes[(int) dataset.instance(paraNeighbors[i]).classValue()]++;
		} // Of for i

		int tempMaximalVotingIndex = 0;
		int tempMaximalVoting = 0;
		for (int i = 0; i < dataset.numClasses(); i++) {
			if (tempVotes[i] > tempMaximalVoting) {
				tempMaximalVoting = tempVotes[i];
				tempMaximalVotingIndex = i;
			} // Of if
		} // Of for i

		return tempMaximalVotingIndex;
	}// Of simpleVoting

	/**
	 * The entrance of the program.
	 * 
	 * @param args Not used now.
	 */
	public static void main(String args[]) {
		KnnClassification tempClassifier = new KnnClassification("D:/data/iris.arff");
		tempClassifier.splitTrainingTesting(0.8);
		tempClassifier.predict();
		System.out.println("The accuracy of the classifier is: " + tempClassifier.getAccuracy());
	}// Of main
}// Of class KnnClassification
```
iris.arff can be downloaded from https://github.com/FanSmale/sampledata/. If the site is hard to reach, copy the content below and save it as iris.arff.
```
@RELATION iris
@ATTRIBUTE sepallength REAL
@ATTRIBUTE sepalwidth REAL
@ATTRIBUTE petallength REAL
@ATTRIBUTE petalwidth REAL
@ATTRIBUTE class {Iris-setosa,Iris-versicolor,Iris-virginica}
@DATA
5.1,3.5,1.4,0.2,Iris-setosa
4.9,3.0,1.4,0.2,Iris-setosa
4.7,3.2,1.3,0.2,Iris-setosa
4.6,3.1,1.5,0.2,Iris-setosa
5.0,3.6,1.4,0.2,Iris-setosa
5.4,3.9,1.7,0.4,Iris-setosa
4.6,3.4,1.4,0.3,Iris-setosa
5.0,3.4,1.5,0.2,Iris-setosa
4.4,2.9,1.4,0.2,Iris-setosa
4.9,3.1,1.5,0.1,Iris-setosa
5.4,3.7,1.5,0.2,Iris-setosa
4.8,3.4,1.6,0.2,Iris-setosa
4.8,3.0,1.4,0.1,Iris-setosa
4.3,3.0,1.1,0.1,Iris-setosa
5.8,4.0,1.2,0.2,Iris-setosa
5.7,4.4,1.5,0.4,Iris-setosa
5.4,3.9,1.3,0.4,Iris-setosa
5.1,3.5,1.4,0.3,Iris-setosa
5.7,3.8,1.7,0.3,Iris-setosa
5.1,3.8,1.5,0.3,Iris-setosa
5.4,3.4,1.7,0.2,Iris-setosa
5.1,3.7,1.5,0.4,Iris-setosa
4.6,3.6,1.0,0.2,Iris-setosa
5.1,3.3,1.7,0.5,Iris-setosa
4.8,3.4,1.9,0.2,Iris-setosa
5.0,3.0,1.6,0.2,Iris-setosa
5.0,3.4,1.6,0.4,Iris-setosa
5.2,3.5,1.5,0.2,Iris-setosa
5.2,3.4,1.4,0.2,Iris-setosa
4.7,3.2,1.6,0.2,Iris-setosa
4.8,3.1,1.6,0.2,Iris-setosa
5.4,3.4,1.5,0.4,Iris-setosa
5.2,4.1,1.5,0.1,Iris-setosa
5.5,4.2,1.4,0.2,Iris-setosa
4.9,3.1,1.5,0.1,Iris-setosa
5.0,3.2,1.2,0.2,Iris-setosa
5.5,3.5,1.3,0.2,Iris-setosa
4.9,3.1,1.5,0.1,Iris-setosa
4.4,3.0,1.3,0.2,Iris-setosa
5.1,3.4,1.5,0.2,Iris-setosa
5.0,3.5,1.3,0.3,Iris-setosa
4.5,2.3,1.3,0.3,Iris-setosa
4.4,3.2,1.3,0.2,Iris-setosa
5.0,3.5,1.6,0.6,Iris-setosa
5.1,3.8,1.9,0.4,Iris-setosa
4.8,3.0,1.4,0.3,Iris-setosa
5.1,3.8,1.6,0.2,Iris-setosa
4.6,3.2,1.4,0.2,Iris-setosa
5.3,3.7,1.5,0.2,Iris-setosa
5.0,3.3,1.4,0.2,Iris-setosa
7.0,3.2,4.7,1.4,Iris-versicolor
6.4,3.2,4.5,1.5,Iris-versicolor
6.9,3.1,4.9,1.5,Iris-versicolor
5.5,2.3,4.0,1.3,Iris-versicolor
6.5,2.8,4.6,1.5,Iris-versicolor
5.7,2.8,4.5,1.3,Iris-versicolor
6.3,3.3,4.7,1.6,Iris-versicolor
4.9,2.4,3.3,1.0,Iris-versicolor
6.6,2.9,4.6,1.3,Iris-versicolor
5.2,2.7,3.9,1.4,Iris-versicolor
5.0,2.0,3.5,1.0,Iris-versicolor
5.9,3.0,4.2,1.5,Iris-versicolor
6.0,2.2,4.0,1.0,Iris-versicolor
6.1,2.9,4.7,1.4,Iris-versicolor
5.6,2.9,3.6,1.3,Iris-versicolor
6.7,3.1,4.4,1.4,Iris-versicolor
5.6,3.0,4.5,1.5,Iris-versicolor
5.8,2.7,4.1,1.0,Iris-versicolor
6.2,2.2,4.5,1.5,Iris-versicolor
5.6,2.5,3.9,1.1,Iris-versicolor
5.9,3.2,4.8,1.8,Iris-versicolor
6.1,2.8,4.0,1.3,Iris-versicolor
6.3,2.5,4.9,1.5,Iris-versicolor
6.1,2.8,4.7,1.2,Iris-versicolor
6.4,2.9,4.3,1.3,Iris-versicolor
6.6,3.0,4.4,1.4,Iris-versicolor
6.8,2.8,4.8,1.4,Iris-versicolor
6.7,3.0,5.0,1.7,Iris-versicolor
6.0,2.9,4.5,1.5,Iris-versicolor
5.7,2.6,3.5,1.0,Iris-versicolor
5.5,2.4,3.8,1.1,Iris-versicolor
5.5,2.4,3.7,1.0,Iris-versicolor
5.8,2.7,3.9,1.2,Iris-versicolor
6.0,2.7,5.1,1.6,Iris-versicolor
5.4,3.0,4.5,1.5,Iris-versicolor
6.0,3.4,4.5,1.6,Iris-versicolor
6.7,3.1,4.7,1.5,Iris-versicolor
6.3,2.3,4.4,1.3,Iris-versicolor
5.6,3.0,4.1,1.3,Iris-versicolor
5.5,2.5,4.0,1.3,Iris-versicolor
5.5,2.6,4.4,1.2,Iris-versicolor
6.1,3.0,4.6,1.4,Iris-versicolor
5.8,2.6,4.0,1.2,Iris-versicolor
5.0,2.3,3.3,1.0,Iris-versicolor
5.6,2.7,4.2,1.3,Iris-versicolor
5.7,3.0,4.2,1.2,Iris-versicolor
5.7,2.9,4.2,1.3,Iris-versicolor
6.2,2.9,4.3,1.3,Iris-versicolor
5.1,2.5,3.0,1.1,Iris-versicolor
5.7,2.8,4.1,1.3,Iris-versicolor
6.3,3.3,6.0,2.5,Iris-virginica
5.8,2.7,5.1,1.9,Iris-virginica
7.1,3.0,5.9,2.1,Iris-virginica
6.3,2.9,5.6,1.8,Iris-virginica
6.5,3.0,5.8,2.2,Iris-virginica
7.6,3.0,6.6,2.1,Iris-virginica
4.9,2.5,4.5,1.7,Iris-virginica
7.3,2.9,6.3,1.8,Iris-virginica
6.7,2.5,5.8,1.8,Iris-virginica
7.2,3.6,6.1,2.5,Iris-virginica
6.5,3.2,5.1,2.0,Iris-virginica
6.4,2.7,5.3,1.9,Iris-virginica
6.8,3.0,5.5,2.1,Iris-virginica
5.7,2.5,5.0,2.0,Iris-virginica
5.8,2.8,5.1,2.4,Iris-virginica
6.4,3.2,5.3,2.3,Iris-virginica
6.5,3.0,5.5,1.8,Iris-virginica
7.7,3.8,6.7,2.2,Iris-virginica
7.7,2.6,6.9,2.3,Iris-virginica
6.0,2.2,5.0,1.5,Iris-virginica
6.9,3.2,5.7,2.3,Iris-virginica
5.6,2.8,4.9,2.0,Iris-virginica
7.7,2.8,6.7,2.0,Iris-virginica
6.3,2.7,4.9,1.8,Iris-virginica
6.7,3.3,5.7,2.1,Iris-virginica
7.2,3.2,6.0,1.8,Iris-virginica
6.2,2.8,4.8,1.8,Iris-virginica
6.1,3.0,4.9,1.8,Iris-virginica
6.4,2.8,5.6,2.1,Iris-virginica
7.2,3.0,5.8,1.6,Iris-virginica
7.4,2.8,6.1,1.9,Iris-virginica
7.9,3.8,6.4,2.0,Iris-virginica
6.4,2.8,5.6,2.2,Iris-virginica
6.3,2.8,5.1,1.5,Iris-virginica
6.1,2.6,5.6,1.4,Iris-virginica
7.7,3.0,6.1,2.3,Iris-virginica
6.3,3.4,5.6,2.4,Iris-virginica
6.4,3.1,5.5,1.8,Iris-virginica
6.0,3.0,4.8,1.8,Iris-virginica
6.9,3.1,5.4,2.1,Iris-virginica
6.7,3.1,5.6,2.4,Iris-virginica
6.9,3.1,5.1,2.3,Iris-virginica
5.8,2.7,5.1,1.9,Iris-virginica
6.8,3.2,5.9,2.3,Iris-virginica
6.7,3.3,5.7,2.5,Iris-virginica
6.7,3.0,5.2,2.3,Iris-virginica
6.3,2.5,5.0,1.9,Iris-virginica
6.5,3.0,5.2,2.0,Iris-virginica
6.2,3.4,5.4,2.3,Iris-virginica
5.9,3.0,5.1,1.8,Iris-virginica
```
Day 52: kNN Classifier (continued)
- Re-implement computeNearests so that the $k$ neighbors are obtained with a single scan of the training set. Hint: combine the existing code with the idea of insertion sort. The time complexity is $O(kn)$, where $O(n)$ covers scanning the training set and $O(k)$ covers each insertion. A possible sketch is given after this list.
- Add a setDistanceMeasure() method.
- Add a setNumNeighbors() method.
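One possible single-scan implementation of the first exercise is sketched below. It is only a sketch under the stated hint, not the official solution: the method name computeNearestsOneScan is made up, while numNeighbors, trainingSet, and distance() are the members of the KnnClassification class above. It keeps the $k$ smallest distances sorted and inserts each new candidate insertion-sort style, giving the $O(kn)$ behavior the exercise describes.

```java
// A sketch for the single-scan exercise (assumed name, not the official
// solution). Assumes trainingSet.length >= numNeighbors.
public int[] computeNearestsOneScan(int paraCurrent) {
	int[] resultNearests = new int[numNeighbors];
	double[] tempDistances = new double[numNeighbors];
	Arrays.fill(tempDistances, Double.MAX_VALUE);

	for (int i = 0; i < trainingSet.length; i++) {
		double tempDistance = distance(paraCurrent, trainingSet[i]);
		if (tempDistance >= tempDistances[numNeighbors - 1]) {
			continue; // Not closer than the current k-th nearest neighbor.
		} // Of if

		// Insertion-sort step: shift larger distances one position right.
		int j;
		for (j = numNeighbors - 2; j >= 0 && tempDistances[j] > tempDistance; j--) {
			tempDistances[j + 1] = tempDistances[j];
			resultNearests[j + 1] = resultNearests[j];
		} // Of for j
		tempDistances[j + 1] = tempDistance;
		resultNearests[j + 1] = trainingSet[i];
	} // Of for i

	return resultNearests;
}// Of computeNearestsOneScan
```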
Day 53: kNN Classifier (continued)

- Add a weightedVoting() method in which shorter distances carry more weight. Support at least two weighting schemes; a sketch of one scheme follows this list.
- Implement leave-one-out testing.
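Two common weightings are $1/(d+\varepsilon)$ and $e^{-d}$; the sketch below uses the former. It is illustrative only: the name weightedVoting matches the exercise, but the extra paraDistances parameter and the small constant are my assumptions, not the official solution.

```java
// A sketch of distance-weighted voting (assumed signature, not the official
// solution). paraDistances[i] is the distance of neighbor i from the test
// instance; the small constant avoids division by zero when a neighbor
// coincides with the test instance.
public int weightedVoting(int[] paraNeighbors, double[] paraDistances) {
	double[] tempVotes = new double[dataset.numClasses()];
	for (int i = 0; i < paraNeighbors.length; i++) {
		double tempWeight = 1.0 / (paraDistances[i] + 1e-6);
		tempVotes[(int) dataset.instance(paraNeighbors[i]).classValue()] += tempWeight;
	} // Of for i

	int resultIndex = 0;
	for (int i = 1; i < tempVotes.length; i++) {
		if (tempVotes[i] > tempVotes[resultIndex]) {
			resultIndex = i;
		} // Of if
	} // Of for i

	return resultIndex;
}// Of weightedVoting
```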
Day 54: M-distance-Based Recommendation

A bit of self-promotion here: this is the source code of the paper Mei Zheng, Fan Min, Heng-Ru Zhang, Wen-Bin Chen, Fast recommendations with the M-distance, IEEE Access 4 (2016) 1464–1468.
- The rating table (user, item, rating) is stored in compressed form; see movielens-943u1682m.txt at https://github.com/FanSmale/sampledata/.
The first few lines of the data are:
0,0,5
0,1,3
0,2,4
0,3,3
0,4,3
0,5,5
0,6,4
…
1,0,4
1,9,2
1,12,4
Here, "0,2,4" means that user 0 rates item 2 as 4. User 1 has no ratings for items 1, 2, and so on, which means he has not watched those movies. When the numbers of users and items are large, compressed storage is a must.
- The code for a whole paper is just this little. Admittedly, the paper itself is simple: the so-called M-distance is just a distance between two users (or items) computed from their average ratings.
Time to show off a mathematical expression. Let the average rating of item $j$ be $\overline{r_{\cdot j}}$. With item-based recommendation, the set of neighbor items of item $j$ with respect to user $i$ is

$$N_{ij} = \{1 \leq j' \leq m \mid j' \neq j, r_{ij'} \neq 0, |\overline{r_{\cdot j}} - \overline{r_{\cdot j'}}| < \epsilon\} \tag{1}$$

The predicted rating of user $i$ for item $j$ is

$$p_{ij} = \frac{\sum_{j' \in N_{ij}} r_{ij'}}{|N_{ij}|} \tag{2}$$

- The neighborhood is not controlled by $k$: every item within distance radius (i.e., $\epsilon$) is a neighbor. With the M-distance, this scheme works better.
- Leave-one-out testing is used; only a highly efficient algorithm can afford this kind of evaluation.
```java
package machinelearning.knn;

import java.io.*;

/**
 * Recommendation with M-distance.
 * 
 * @author Fan Min.
 */
public class MBR {

	/** Default rating for 1-5 points. */
	public static final double DEFAULT_RATING = 3.0;

	/** The total number of users. */
	private int numUsers;

	/** The total number of items. */
	private int numItems;

	/** The total number of ratings (non-zero values). */
	private int numRatings;

	/** The predictions. */
	private double[] predictions;

	/** Compressed rating matrix. User-item-rating triples. */
	private int[][] compressedRatingMatrix;

	/** The degree of each user (how many items he has rated). */
	private int[] userDegrees;

	/** The average rating of the current user. */
	private double[] userAverageRatings;

	/** The degree of each item (how many users have rated it). */
	private int[] itemDegrees;

	/** The average rating of the current item. */
	private double[] itemAverageRatings;

	/**
	 * The first user starts from 0. If the first user has x ratings, the
	 * second user will start from x.
	 */
	private int[] userStartingIndices;

	/** Number of objects with no neighbors. */
	private int numNonNeighbors;

	/** The radius (epsilon) for determining the neighborhood. */
	private double radius;

	/**
	 * Construct the rating matrix.
	 * 
	 * @param paraFilename   the rating filename.
	 * @param paraNumUsers   number of users.
	 * @param paraNumItems   number of items.
	 * @param paraNumRatings number of ratings.
	 */
	public MBR(String paraFilename, int paraNumUsers, int paraNumItems, int paraNumRatings) throws Exception {
		// Step 1. Initialize these arrays.
		numItems = paraNumItems;
		numUsers = paraNumUsers;
		numRatings = paraNumRatings;

		userDegrees = new int[numUsers];
		userStartingIndices = new int[numUsers + 1];
		userAverageRatings = new double[numUsers];
		itemDegrees = new int[numItems];
		compressedRatingMatrix = new int[numRatings][3];
		itemAverageRatings = new double[numItems];
		predictions = new double[numRatings];

		System.out.println("Reading " + paraFilename);

		// Step 2. Read the data file.
		File tempFile = new File(paraFilename);
		if (!tempFile.exists()) {
			System.out.println("File " + paraFilename + " does not exist.");
			System.exit(0);
		} // Of if

		BufferedReader tempBufReader = new BufferedReader(new FileReader(tempFile));
		String tempString;
		String[] tempStrArray;
		int tempIndex = 0;
		userStartingIndices[0] = 0;
		userStartingIndices[numUsers] = numRatings;
		while ((tempString = tempBufReader.readLine()) != null) {
			// Each line has three values.
			tempStrArray = tempString.split(",");
			compressedRatingMatrix[tempIndex][0] = Integer.parseInt(tempStrArray[0]);
			compressedRatingMatrix[tempIndex][1] = Integer.parseInt(tempStrArray[1]);
			compressedRatingMatrix[tempIndex][2] = Integer.parseInt(tempStrArray[2]);

			userDegrees[compressedRatingMatrix[tempIndex][0]]++;
			itemDegrees[compressedRatingMatrix[tempIndex][1]]++;

			if (tempIndex > 0) {
				// Starting to read the data of a new user.
				if (compressedRatingMatrix[tempIndex][0] != compressedRatingMatrix[tempIndex - 1][0]) {
					userStartingIndices[compressedRatingMatrix[tempIndex][0]] = tempIndex;
				} // Of if
			} // Of if
			tempIndex++;
		} // Of while
		tempBufReader.close();

		double[] tempUserTotalScore = new double[numUsers];
		double[] tempItemTotalScore = new double[numItems];
		for (int i = 0; i < numRatings; i++) {
			tempUserTotalScore[compressedRatingMatrix[i][0]] += compressedRatingMatrix[i][2];
			tempItemTotalScore[compressedRatingMatrix[i][1]] += compressedRatingMatrix[i][2];
		} // Of for i

		for (int i = 0; i < numUsers; i++) {
			userAverageRatings[i] = tempUserTotalScore[i] / userDegrees[i];
		} // Of for i
		for (int i = 0; i < numItems; i++) {
			itemAverageRatings[i] = tempItemTotalScore[i] / itemDegrees[i];
		} // Of for i
	}// Of the first constructor

	/**
	 * Set the radius (epsilon).
	 * 
	 * @param paraRadius The given radius.
	 */
	public void setRadius(double paraRadius) {
		if (paraRadius > 0) {
			radius = paraRadius;
		} else {
			radius = 0.1;
		} // Of if
	}// Of setRadius

	/**
	 * Leave-one-out prediction. The predicted values are stored in
	 * predictions.
	 * 
	 * @see predictions
	 */
	public void leaveOneOutPrediction() {
		double tempItemAverageRating;
		// Make each line of the code shorter.
		int tempUser, tempItem, tempRating;
		System.out.println("\r\nLeaveOneOutPrediction for radius " + radius);

		numNonNeighbors = 0;
		for (int i = 0; i < numRatings; i++) {
			tempUser = compressedRatingMatrix[i][0];
			tempItem = compressedRatingMatrix[i][1];
			tempRating = compressedRatingMatrix[i][2];

			// Step 1. Recompute the average rating of the current item,
			// leaving out the current rating.
			tempItemAverageRating = (itemAverageRatings[tempItem] * itemDegrees[tempItem] - tempRating)
					/ (itemDegrees[tempItem] - 1);

			// Step 2. Recompute neighbors, at the same time obtain the
			// ratings of neighbors.
			int tempNeighbors = 0;
			double tempTotal = 0;
			int tempComparedItem;
			for (int j = userStartingIndices[tempUser]; j < userStartingIndices[tempUser + 1]; j++) {
				tempComparedItem = compressedRatingMatrix[j][1];
				if (tempItem == tempComparedItem) {
					continue; // Ignore itself.
				} // Of if

				if (Math.abs(tempItemAverageRating - itemAverageRatings[tempComparedItem]) < radius) {
					tempTotal += compressedRatingMatrix[j][2];
					tempNeighbors++;
				} // Of if
			} // Of for j

			// Step 3. Predict as the average value of neighbors.
			if (tempNeighbors > 0) {
				predictions[i] = tempTotal / tempNeighbors;
			} else {
				predictions[i] = DEFAULT_RATING;
				numNonNeighbors++;
			} // Of if
		} // Of for i
	}// Of leaveOneOutPrediction

	/**
	 * Compute the MAE based on the deviation of each leave-one-out.
	 */
	public double computeMAE() throws Exception {
		double tempTotalError = 0;
		for (int i = 0; i < predictions.length; i++) {
			tempTotalError += Math.abs(predictions[i] - compressedRatingMatrix[i][2]);
		} // Of for i

		return tempTotalError / predictions.length;
	}// Of computeMAE

	/**
	 * Compute the RMSE based on the deviation of each leave-one-out.
	 */
	public double computeRMSE() throws Exception {
		double tempTotalError = 0;
		for (int i = 0; i < predictions.length; i++) {
			tempTotalError += (predictions[i] - compressedRatingMatrix[i][2])
					* (predictions[i] - compressedRatingMatrix[i][2]);
		} // Of for i

		double tempAverage = tempTotalError / predictions.length;

		return Math.sqrt(tempAverage);
	}// Of computeRMSE

	/**
	 * The entrance of the program.
	 * 
	 * @param args Not used now.
	 */
	public static void main(String[] args) {
		try {
			// MovieLens 100K: 943 users, 1682 items, 100000 ratings.
			MBR tempRecommender = new MBR("D:/data/movielens-943u1682m.txt", 943, 1682, 100000);
			for (double tempRadius = 0.2; tempRadius < 0.6; tempRadius += 0.1) {
				tempRecommender.setRadius(tempRadius);

				tempRecommender.leaveOneOutPrediction();
				double tempMAE = tempRecommender.computeMAE();
				double tempRMSE = tempRecommender.computeRMSE();

				System.out.println("Radius = " + tempRadius + ", MAE = " + tempMAE + ", RMSE = " + tempRMSE
						+ ", numNonNeighbors = " + tempRecommender.numNonNeighbors);
			} // Of for tempRadius
		} catch (Exception ee) {
			System.out.println(ee);
		} // Of try
	}// Of main
}// Of class MBR
```
Day 55: M-distance-Based Recommendation (continued)
Because the triples are stored in user-major order, the loop bounds

```java
for (int j = userStartingIndices[tempUser]; j < userStartingIndices[tempUser + 1]; j++)
```

read in all rating information of tempUser. However, user-based recommendation enjoys no such convenience. To solve this, there are two options:
- Transpose the compressed matrix, exchanging the roles of users and items. This option requires extra code, but its complexity is low. Recommended; a sketch is given after this list.
- Scan the whole dataset rather than just one contiguous block. This option is easy to implement, but its complexity is high.
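A sketch of the first option follows. The names transposedRatingMatrix, itemStartingIndices, and transposeRatingMatrix() are my assumptions; numRatings, numItems, itemDegrees, and compressedRatingMatrix are the fields of the MBR class above. It is a counting-sort style scatter, running in O(numRatings + numItems).

```java
// A sketch of option 1 (assumed names): build an item-major copy of the
// triples so that all ratings of one item are contiguous.
private int[][] transposedRatingMatrix;
private int[] itemStartingIndices;

private void transposeRatingMatrix() {
	transposedRatingMatrix = new int[numRatings][];
	itemStartingIndices = new int[numItems + 1];

	// Step 1. The starting index of each item is a prefix sum of degrees.
	itemStartingIndices[0] = 0;
	for (int i = 0; i < numItems; i++) {
		itemStartingIndices[i + 1] = itemStartingIndices[i] + itemDegrees[i];
	} // Of for i

	// Step 2. Scatter each (user, item, rating) triple into its item block.
	// Rows are shared with compressedRatingMatrix, which is fine for reading.
	int[] tempOffsets = new int[numItems];
	for (int i = 0; i < numRatings; i++) {
		int tempItem = compressedRatingMatrix[i][1];
		transposedRatingMatrix[itemStartingIndices[tempItem] + tempOffsets[tempItem]] = compressedRatingMatrix[i];
		tempOffsets[tempItem]++;
	} // Of for i
}// Of transposeRatingMatrix
```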
Day 56: kMeans Clustering

kMeans is the most commonly used clustering algorithm.
- kMeans clustering should terminate when the centers converge. As a shortcut, Arrays.equals() is used here to compare the old and new cluster membership arrays.
- The dataset is iris, so the last attribute is not used. For a dataset without a decision attribute, corresponding modifications are needed.
- The data is not normalized.
- getRandomIndices() is exactly the same as the one in KnnClassification and is copied over. It really belongs in a SimpleTools.java, but since the code is short it is kept here for self-containedness.
- distance() is similar to the one in KnnClassification. Be careful not to use the decision attribute, and note the different parameters: the second parameter is a real-valued vector, because a cluster center may be virtual, with no actual instance at that point.
```java
package machinelearning.kmeans;

import java.io.FileReader;
import java.util.Arrays;
import java.util.Random;

import weka.core.Instances;

/**
 * kMeans clustering.
 * 
 * @author Fan Min.
 */
public class KMeans {

	/** Manhattan distance. */
	public static final int MANHATTAN = 0;

	/** Euclidean distance. */
	public static final int EUCLIDEAN = 1;

	/** The distance measure. */
	public int distanceMeasure = EUCLIDEAN;

	/** A random instance. */
	public static final Random random = new Random();

	/** The data. */
	Instances dataset;

	/** The number of clusters. */
	int numClusters = 2;

	/** The clusters. */
	int[][] clusters;

	/**
	 * The first constructor.
	 * 
	 * @param paraFilename The data filename.
	 */
	public KMeans(String paraFilename) {
		dataset = null;
		try {
			FileReader fileReader = new FileReader(paraFilename);
			dataset = new Instances(fileReader);
			fileReader.close();
		} catch (Exception ee) {
			System.out.println("Cannot read the file: " + paraFilename + "\r\n" + ee);
			System.exit(0);
		} // Of try
	}// Of the first constructor

	/**
	 * A setter.
	 */
	public void setNumClusters(int paraNumClusters) {
		numClusters = paraNumClusters;
	}// Of the setter

	/**
	 * Get random indices for data randomization.
	 * 
	 * @param paraLength The length of the sequence.
	 * @return An array of indices, e.g., {4, 3, 1, 5, 0, 2} with length 6.
	 */
	public static int[] getRandomIndices(int paraLength) {
		int[] resultIndices = new int[paraLength];

		// Step 1. Initialize.
		for (int i = 0; i < paraLength; i++) {
			resultIndices[i] = i;
		} // Of for i

		// Step 2. Randomly swap.
		int tempFirst, tempSecond, tempValue;
		for (int i = 0; i < paraLength; i++) {
			// Generate two random indices.
			tempFirst = random.nextInt(paraLength);
			tempSecond = random.nextInt(paraLength);

			// Swap.
			tempValue = resultIndices[tempFirst];
			resultIndices[tempFirst] = resultIndices[tempSecond];
			resultIndices[tempSecond] = tempValue;
		} // Of for i

		return resultIndices;
	}// Of getRandomIndices

	/**
	 * The distance between an instance and a point in the space.
	 * 
	 * @param paraI     The index of the instance.
	 * @param paraArray The array representing a point in the space.
	 * @return The distance.
	 */
	public double distance(int paraI, double[] paraArray) {
		double resultDistance = 0; // Bug fix: was declared as int.
		double tempDifference;
		switch (distanceMeasure) {
		case MANHATTAN:
			for (int i = 0; i < dataset.numAttributes() - 1; i++) {
				tempDifference = dataset.instance(paraI).value(i) - paraArray[i];
				if (tempDifference < 0) {
					resultDistance -= tempDifference;
				} else {
					resultDistance += tempDifference;
				} // Of if
			} // Of for i
			break;

		case EUCLIDEAN:
			for (int i = 0; i < dataset.numAttributes() - 1; i++) {
				tempDifference = dataset.instance(paraI).value(i) - paraArray[i];
				resultDistance += tempDifference * tempDifference;
			} // Of for i
			break;
		default:
			System.out.println("Unsupported distance measure: " + distanceMeasure);
		}// Of switch

		return resultDistance;
	}// Of distance

	/**
	 * Clustering.
	 */
	public void clustering() {
		int[] tempOldClusterArray = new int[dataset.numInstances()];
		tempOldClusterArray[0] = -1;
		int[] tempClusterArray = new int[dataset.numInstances()];
		Arrays.fill(tempClusterArray, 0);
		double[][] tempCenters = new double[numClusters][dataset.numAttributes() - 1];

		// Step 1. Initialize centers.
		int[] tempRandomOrders = getRandomIndices(dataset.numInstances());
		for (int i = 0; i < numClusters; i++) {
			for (int j = 0; j < tempCenters[0].length; j++) {
				tempCenters[i][j] = dataset.instance(tempRandomOrders[i]).value(j);
			} // Of for j
		} // Of for i

		int[] tempClusterLengths = null;
		while (!Arrays.equals(tempOldClusterArray, tempClusterArray)) {
			System.out.println("New loop ...");
			tempOldClusterArray = tempClusterArray;
			tempClusterArray = new int[dataset.numInstances()];

			// Step 2.1 Minimization. Assign a cluster to each instance.
			int tempNearestCenter;
			double tempNearestDistance;
			double tempDistance;

			for (int i = 0; i < dataset.numInstances(); i++) {
				tempNearestCenter = -1;
				tempNearestDistance = Double.MAX_VALUE;

				for (int j = 0; j < numClusters; j++) {
					tempDistance = distance(i, tempCenters[j]);
					if (tempNearestDistance > tempDistance) {
						tempNearestDistance = tempDistance;
						tempNearestCenter = j;
					} // Of if
				} // Of for j
				tempClusterArray[i] = tempNearestCenter;
			} // Of for i

			// Step 2.2 Mean. Find new centers.
			tempClusterLengths = new int[numClusters];
			Arrays.fill(tempClusterLengths, 0);
			double[][] tempNewCenters = new double[numClusters][dataset.numAttributes() - 1];
			for (int i = 0; i < dataset.numInstances(); i++) {
				for (int j = 0; j < tempNewCenters[0].length; j++) {
					tempNewCenters[tempClusterArray[i]][j] += dataset.instance(i).value(j);
				} // Of for j
				tempClusterLengths[tempClusterArray[i]]++;
			} // Of for i

			// Step 2.3 Now average.
			for (int i = 0; i < tempNewCenters.length; i++) {
				for (int j = 0; j < tempNewCenters[0].length; j++) {
					tempNewCenters[i][j] /= tempClusterLengths[i];
				} // Of for j
			} // Of for i

			System.out.println("Now the new centers are: " + Arrays.deepToString(tempNewCenters));
			tempCenters = tempNewCenters;
		} // Of while

		// Step 3. Form clusters.
		clusters = new int[numClusters][];
		int[] tempCounters = new int[numClusters];
		for (int i = 0; i < numClusters; i++) {
			clusters[i] = new int[tempClusterLengths[i]];
		} // Of for i

		for (int i = 0; i < tempClusterArray.length; i++) {
			clusters[tempClusterArray[i]][tempCounters[tempClusterArray[i]]] = i;
			tempCounters[tempClusterArray[i]]++;
		} // Of for i

		System.out.println("The clusters are: " + Arrays.deepToString(clusters));
	}// Of clustering

	/**
	 * Test clustering.
	 */
	public static void testClustering() {
		KMeans tempKMeans = new KMeans("D:/data/iris.arff");
		tempKMeans.setNumClusters(3);
		tempKMeans.clustering();
	}// Of testClustering

	/**
	 * A testing method.
	 */
	public static void main(String args[]) {
		testClustering();
	}// Of main
}// Of class KMeans
```
Day 57: kMeans Clustering (continued)
Day 58: NB Algorithm for Symbolic Data

Naive Bayes is an algorithm derived from the posterior probability formula. Its independence assumption looks unreliable from a mathematical standpoint, but the machine learning results are good. Before writing the program, first study the companion post "NB 算法 (包括符号型与数值型, 结合 Java 程序分析)".
- All the code is listed today, but only classification of symbolic data is studied today. You may copy only the methods related to symbolic data (start from main() and follow the call chain selectively), and copy the numerical-data code tomorrow. The code at line 421 merely tests the case where the training and testing sets differ, so copying it is unnecessary.
- You must construct a small example of your own (e.g., 10 objects, 3 conditional attributes, 2 classes) to aid understanding.
- Some background knowledge needs to be looked up.
- You need to understand the meaning of each dimension of the three-dimensional array: the conditional probabilities for all classes over all attributes on all values. Note that the three-dimensional array is not rectangular; for example, different attributes may have different numbers of values.
- The same data is used for both training and testing here. To split training and testing sets, refer to the kNN code.
- Initializing tempPseudoProbability to 0 would be wrong. It makes no difference on class-balanced datasets, but gives wrong results on imbalanced ones. I lost a 50-yuan bet over this issue, alas!
```java
package datastructure.nb;

import java.io.FileReader;
import java.util.Arrays;
import java.util.Random;

import weka.core.*;

/**
 * The Naive Bayes algorithm.
 * 
 * @author Fan Min.
 */
public class NaiveBayes {

	/**
	 * An inner class to store parameters.
	 */
	private class GaussianParameters {
		double mu;
		double sigma;

		public GaussianParameters(double paraMu, double paraSigma) {
			mu = paraMu;
			sigma = paraSigma;
		}// Of the constructor

		public String toString() {
			return "(" + mu + ", " + sigma + ")";
		}// Of toString
	}// Of GaussianParameters

	/** The data. */
	Instances dataset;

	/** The number of classes. For binary classification it is 2. */
	int numClasses;

	/** The number of instances. */
	int numInstances;

	/** The number of conditional attributes. */
	int numConditions;

	/** The prediction, including queried and predicted labels. */
	int[] predicts;

	/** Class distribution. */
	double[] classDistribution;

	/** Class distribution with Laplacian smooth. */
	double[] classDistributionLaplacian;

	/**
	 * The conditional counts for all classes over all attributes on all
	 * values.
	 */
	double[][][] conditionalCounts;

	/** The conditional probabilities with Laplacian smooth. */
	double[][][] conditionalProbabilitiesLaplacian;

	/** The Gaussian parameters. */
	GaussianParameters[][] gaussianParameters;

	/** Data type. */
	int dataType;

	/** Nominal. */
	public static final int NOMINAL = 0;

	/** Numerical. */
	public static final int NUMERICAL = 1;

	/**
	 * The constructor.
	 * 
	 * @param paraFilename The given file.
	 */
	public NaiveBayes(String paraFilename) {
		dataset = null;
		try {
			FileReader fileReader = new FileReader(paraFilename);
			dataset = new Instances(fileReader);
			fileReader.close();
		} catch (Exception ee) {
			System.out.println("Cannot read the file: " + paraFilename + "\r\n" + ee);
			System.exit(0);
		} // Of try

		dataset.setClassIndex(dataset.numAttributes() - 1);
		numConditions = dataset.numAttributes() - 1;
		numInstances = dataset.numInstances();
		numClasses = dataset.attribute(numConditions).numValues();
	}// Of the constructor

	/**
	 * The second constructor.
	 * 
	 * @param paraInstances The given dataset.
	 */
	public NaiveBayes(Instances paraInstances) {
		dataset = paraInstances;

		dataset.setClassIndex(dataset.numAttributes() - 1);
		numConditions = dataset.numAttributes() - 1;
		numInstances = dataset.numInstances();
		numClasses = dataset.attribute(numConditions).numValues();
	}// Of the second constructor

	/**
	 * Set the data type.
	 */
	public void setDataType(int paraDataType) {
		dataType = paraDataType;
	}// Of setDataType

	/**
	 * Calculate the class distribution with Laplacian smooth.
	 */
	public void calculateClassDistribution() {
		classDistribution = new double[numClasses];
		classDistributionLaplacian = new double[numClasses];

		double[] tempCounts = new double[numClasses];
		for (int i = 0; i < numInstances; i++) {
			int tempClassValue = (int) dataset.instance(i).classValue();
			tempCounts[tempClassValue]++;
		} // Of for i

		for (int i = 0; i < numClasses; i++) {
			classDistribution[i] = tempCounts[i] / numInstances;
			classDistributionLaplacian[i] = (tempCounts[i] + 1) / (numInstances + numClasses);
		} // Of for i

		System.out.println("Class distribution: " + Arrays.toString(classDistribution));
		System.out.println("Class distribution Laplacian: " + Arrays.toString(classDistributionLaplacian));
	}// Of calculateClassDistribution

	/**
	 * Calculate the conditional probabilities with Laplacian smooth. ONLY
	 * scan the dataset once. There was a simpler one, I have removed it
	 * because the time complexity is higher.
	 */
	public void calculateConditionalProbabilities() {
		conditionalCounts = new double[numClasses][numConditions][];
		conditionalProbabilitiesLaplacian = new double[numClasses][numConditions][];

		// Allocate space.
		for (int i = 0; i < numClasses; i++) {
			for (int j = 0; j < numConditions; j++) {
				int tempNumValues = (int) dataset.attribute(j).numValues();
				conditionalCounts[i][j] = new double[tempNumValues];
				conditionalProbabilitiesLaplacian[i][j] = new double[tempNumValues];
			} // Of for j
		} // Of for i

		// Count the numbers.
		int[] tempClassCounts = new int[numClasses];
		for (int i = 0; i < numInstances; i++) {
			int tempClass = (int) dataset.instance(i).classValue();
			tempClassCounts[tempClass]++;
			for (int j = 0; j < numConditions; j++) {
				int tempValue = (int) dataset.instance(i).value(j);
				conditionalCounts[tempClass][j][tempValue]++;
			} // Of for j
		} // Of for i

		// Now for the real probability with Laplacian.
		for (int i = 0; i < numClasses; i++) {
			for (int j = 0; j < numConditions; j++) {
				int tempNumValues = (int) dataset.attribute(j).numValues();
				for (int k = 0; k < tempNumValues; k++) {
					conditionalProbabilitiesLaplacian[i][j][k] = (conditionalCounts[i][j][k] + 1)
							/ (tempClassCounts[i] + tempNumValues);
					// I wrote a bug here. This is an alternative approach,
					// however its performance is better on the mushroom
					// dataset.
					// conditionalProbabilitiesLaplacian[i][j][k] =
					// (numInstances * conditionalCounts[i][j][k] + 1)
					// / (numInstances * tempClassCounts[i] + tempNumValues);
				} // Of for k
			} // Of for j
		} // Of for i

		System.out.println("Conditional probabilities: " + Arrays.deepToString(conditionalCounts));
	}// Of calculateConditionalProbabilities

	/**
	 * Calculate the Gaussian parameters for numerical attributes.
	 */
	public void calculateGaussianParameters() {
		gaussianParameters = new GaussianParameters[numClasses][numConditions];

		double[] tempValuesArray = new double[numInstances];
		int tempNumValues = 0;
		double tempSum = 0;

		for (int i = 0; i < numClasses; i++) {
			for (int j = 0; j < numConditions; j++) {
				tempSum = 0;

				// Obtain values for this class.
				tempNumValues = 0;
				for (int k = 0; k < numInstances; k++) {
					if ((int) dataset.instance(k).classValue() != i) {
						continue;
					} // Of if

					tempValuesArray[tempNumValues] = dataset.instance(k).value(j);
					tempSum += tempValuesArray[tempNumValues];
					tempNumValues++;
				} // Of for k

				// Obtain parameters.
				double tempMu = tempSum / tempNumValues;

				double tempSigma = 0;
				for (int k = 0; k < tempNumValues; k++) {
					tempSigma += (tempValuesArray[k] - tempMu) * (tempValuesArray[k] - tempMu);
				} // Of for k
				tempSigma /= tempNumValues;
				tempSigma = Math.sqrt(tempSigma);

				gaussianParameters[i][j] = new GaussianParameters(tempMu, tempSigma);
			} // Of for j
		} // Of for i

		System.out.println(Arrays.deepToString(gaussianParameters));
	}// Of calculateGaussianParameters

	/**
	 * Classify all instances, the results are stored in predicts[].
	 */
	public void classify() {
		predicts = new int[numInstances];
		for (int i = 0; i < numInstances; i++) {
			predicts[i] = classify(dataset.instance(i));
		} // Of for i
	}// Of classify

	/**
	 * Classify an instance.
	 */
	public int classify(Instance paraInstance) {
		if (dataType == NOMINAL) {
			return classifyNominal(paraInstance);
		} else if (dataType == NUMERICAL) {
			return classifyNumerical(paraInstance);
		} // Of if

		return -1;
	}// Of classify

	/**
	 * Classify an instance with nominal data.
	 */
	public int classifyNominal(Instance paraInstance) {
		// Find the biggest one.
		double tempBiggest = -10000;
		int resultBestIndex = 0;
		for (int i = 0; i < numClasses; i++) {
			double tempPseudoProbability = Math.log(classDistributionLaplacian[i]);
			for (int j = 0; j < numConditions; j++) {
				int tempAttributeValue = (int) paraInstance.value(j);

				tempPseudoProbability += Math.log(conditionalProbabilitiesLaplacian[i][j][tempAttributeValue]);
			} // Of for j

			if (tempBiggest < tempPseudoProbability) {
				tempBiggest = tempPseudoProbability;
				resultBestIndex = i;
			} // Of if
		} // Of for i

		return resultBestIndex;
	}// Of classifyNominal

	/**
	 * Classify an instance with numerical data.
	 */
	public int classifyNumerical(Instance paraInstance) {
		// Find the biggest one.
		double tempBiggest = -10000;
		int resultBestIndex = 0;

		for (int i = 0; i < numClasses; i++) {
			double tempPseudoProbability = Math.log(classDistributionLaplacian[i]);
			for (int j = 0; j < numConditions; j++) {
				double tempAttributeValue = paraInstance.value(j);
				double tempSigma = gaussianParameters[i][j].sigma;
				double tempMu = gaussianParameters[i][j].mu;

				tempPseudoProbability += -Math.log(tempSigma)
						- (tempAttributeValue - tempMu) * (tempAttributeValue - tempMu) / (2 * tempSigma * tempSigma);
			} // Of for j

			if (tempBiggest < tempPseudoProbability) {
				tempBiggest = tempPseudoProbability;
				resultBestIndex = i;
			} // Of if
		} // Of for i

		return resultBestIndex;
	}// Of classifyNumerical

	/**
	 * Compute accuracy.
	 */
	public double computeAccuracy() {
		double tempCorrect = 0;
		for (int i = 0; i < numInstances; i++) {
			if (predicts[i] == (int) dataset.instance(i).classValue()) {
				tempCorrect++;
			} // Of if
		} // Of for i

		double resultAccuracy = tempCorrect / numInstances;
		return resultAccuracy;
	}// Of computeAccuracy

	/**
	 * Test nominal data.
	 */
	public static void testNominal() {
		System.out.println("Hello, Naive Bayes. I only want to test the nominal data.");
		String tempFilename = "D:/data/mushroom.arff";

		NaiveBayes tempLearner = new NaiveBayes(tempFilename);
		tempLearner.setDataType(NOMINAL);
		tempLearner.calculateClassDistribution();
		tempLearner.calculateConditionalProbabilities();
		tempLearner.classify();

		System.out.println("The accuracy is: " + tempLearner.computeAccuracy());
	}// Of testNominal

	/**
	 * Test numerical data.
	 */
	public static void testNumerical() {
		System.out.println("Hello, Naive Bayes. I only want to test the numerical data with Gaussian assumption.");
		// String tempFilename = "D:/data/iris.arff";
		String tempFilename = "D:/data/iris-imbalance.arff";

		NaiveBayes tempLearner = new NaiveBayes(tempFilename);
		tempLearner.setDataType(NUMERICAL);
		tempLearner.calculateClassDistribution();
		tempLearner.calculateGaussianParameters();
		tempLearner.classify();

		System.out.println("The accuracy is: " + tempLearner.computeAccuracy());
	}// Of testNumerical

	/**
	 * Test this class.
	 * 
	 * @param args Not used now.
	 */
	public static void main(String[] args) {
		testNominal();
		testNumerical();
		// testNominal(0.8);
	}// Of main

	/**
	 * Get random indices for data randomization.
	 * 
	 * @param paraLength The length of the sequence.
	 * @return An array of indices, e.g., {4, 3, 1, 5, 0, 2} with length 6.
	 */
	public static int[] getRandomIndices(int paraLength) {
		Random random = new Random();
		int[] resultIndices = new int[paraLength];

		// Step 1. Initialize.
		for (int i = 0; i < paraLength; i++) {
			resultIndices[i] = i;
		} // Of for i

		// Step 2. Randomly swap.
		int tempFirst, tempSecond, tempValue;
		for (int i = 0; i < paraLength; i++) {
			// Generate two random indices.
			tempFirst = random.nextInt(paraLength);
			tempSecond = random.nextInt(paraLength);

			// Swap.
			tempValue = resultIndices[tempFirst];
			resultIndices[tempFirst] = resultIndices[tempSecond];
			resultIndices[tempSecond] = tempValue;
		} // Of for i

		return resultIndices;
	}// Of getRandomIndices

	/**
	 * Split the data into training and testing parts.
	 * 
	 * @param paraDataset          The given dataset.
	 * @param paraTrainingFraction The fraction of the training set.
	 */
	public static Instances[] splitTrainingTesting(Instances paraDataset, double paraTrainingFraction) {
		int tempSize = paraDataset.numInstances();
		int[] tempIndices = getRandomIndices(tempSize);
		int tempTrainingSize = (int) (tempSize * paraTrainingFraction);

		// Empty datasets.
		Instances tempTrainingSet = new Instances(paraDataset);
		tempTrainingSet.delete();
		Instances tempTestingSet = new Instances(tempTrainingSet);

		for (int i = 0; i < tempTrainingSize; i++) {
			tempTrainingSet.add(paraDataset.instance(tempIndices[i]));
		} // Of for i

		for (int i = 0; i < tempSize - tempTrainingSize; i++) {
			tempTestingSet.add(paraDataset.instance(tempIndices[tempTrainingSize + i]));
		} // Of for i

		tempTrainingSet.setClassIndex(tempTrainingSet.numAttributes() - 1);
		tempTestingSet.setClassIndex(tempTestingSet.numAttributes() - 1);

		Instances[] resultInstancesArray = new Instances[2];
		resultInstancesArray[0] = tempTrainingSet;
		resultInstancesArray[1] = tempTestingSet;

		return resultInstancesArray;
	}// Of splitTrainingTesting

	/**
	 * Classify all instances of the given testing set and return the
	 * accuracy.
	 */
	public double classify(Instances paraTestingSet) {
		double tempCorrect = 0;
		int[] tempPredicts = new int[paraTestingSet.numInstances()];
		for (int i = 0; i < tempPredicts.length; i++) {
			tempPredicts[i] = classify(paraTestingSet.instance(i));
			if (tempPredicts[i] == (int) paraTestingSet.instance(i).classValue()) {
				tempCorrect++;
			} // Of if
		} // Of for i

		System.out.println("" + tempCorrect + " correct over " + tempPredicts.length + " instances.");

		double resultAccuracy = tempCorrect / tempPredicts.length;
		return resultAccuracy;
	}// Of classify

	/**
	 * Test nominal data with a training/testing split.
	 */
	public static void testNominal(double paraTrainingFraction) {
		System.out.println("Hello, Naive Bayes. I only want to test the nominal data.");
		String tempFilename = "D:/data/mushroom.arff";
		// String tempFilename = "D:/data/voting.arff";

		Instances tempDataset = null;
		try {
			FileReader fileReader = new FileReader(tempFilename);
			tempDataset = new Instances(fileReader);
			fileReader.close();
		} catch (Exception ee) {
			System.out.println("Cannot read the file: " + tempFilename + "\r\n" + ee);
			System.exit(0);
		} // Of try

		Instances[] tempDatasets = splitTrainingTesting(tempDataset, paraTrainingFraction);
		NaiveBayes tempLearner = new NaiveBayes(tempDatasets[0]);
		tempLearner.setDataType(NOMINAL);
		tempLearner.calculateClassDistribution();
		tempLearner.calculateConditionalProbabilities();
		double tempAccuracy = tempLearner.classify(tempDatasets[1]);

		System.out.println("The accuracy is: " + tempAccuracy);
	}// Of testNominal
}// Of class NaiveBayes
```
Day 59: NB Algorithm for Numerical Data

- Today, add the code for handling numerical data.
- All attribute values are assumed to follow Gaussian distributions. Other assumptions could also be made.
- The probability density is used directly as a probability value in the Bayes formula.
- As can be seen, handling numerical data is no more complex than handling symbolic data. The decision rule implemented by classifyNumerical() is written out after this list.
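Concretely, with $\mu_{ij}$ and $\sigma_{ij}$ denoting the mean and standard deviation of attribute $j$ within class $c_i$, and $P(c_i)$ the Laplacian-smoothed prior, classifyNumerical() computes

$$d(\mathbf{x}) = \arg\max_{i} \left[ \log P(c_i) + \sum_{j=1}^{m} \left( -\log \sigma_{ij} - \frac{(x_j - \mu_{ij})^2}{2\sigma_{ij}^2} \right) \right]$$

The constant $-\frac{m}{2}\log(2\pi)$ from the Gaussian density is dropped, since it is identical for every class and does not affect the argmax.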
Day 60: Summary

Describe what you have learned during these 10 days, in at least 10 points.