mapreduce中怎么實現K-M類聚

發布時間：2021-08-10 11:27:07 來源：億速云 閱讀：100 作者：Leah 欄目：云計算

mapreduce中怎么實現K-M類聚，針對這個問題，這篇文章詳細介紹了相對應的分析和解答，希望可以幫助更多想解決這個問題的小伙伴找到更簡單易行的方法。

首先是map

public static class KMmap extends Mapper<LongWritable, Text, IntWritable, Text>{
        //中心集合
        //這里的聚簇集合是自己設(shè)定的    centersPath就是集合在hdfs中存放的路徑
        ArrayList<ArrayList<Double>> centers = null;
        //用k個中心
        int k = 0;
        //讀取中心
        protected void setup(Context context)throws IOException, InterruptedException {
            //getCentersFromHDFS方法就是傳入一個Path，得到一個ArrayList<ArrayList<Double>>集合
             centers = Utils.getCentersFromHDFS(context.getConfiguration().get("centersPath"),false);
             k = centers.size();
        }
         /**
          * 1.每次讀取一條要分類的條記錄與中心做對比，歸類到對應(yīng)的中心
          * 2.以中心ID為key，中心包含的記錄為value輸出(例如： 1 0.2 。  1為聚類中心的ID，0.2為靠近聚類中心的某個值)
          */
        @Override
        protected void map(LongWritable key, Text value,Context context)
                throws IOException, InterruptedException {
            ArrayList<Double> fileds = Utils.textToArray(value);
            //textToArray方法將map進(jìn)來的一行value根據(jù)“,”分割后轉(zhuǎn)化為ArrayList<Double>的集合
            int sizeOfFileds = fileds.size();
            double minDistance = 99999999;
            int centerIndex = 0;
            //依次取出k個中心點(diǎn)與當(dāng)前讀取的記錄做計算
            for(int i=0;i<k;i++){
                double currentDistance = 0;
                for(int j=0;j<sizeOfFileds;j++){
                    //centers中存放的是中心點(diǎn)集合
                    //fileds中存放的是被分析數(shù)據(jù)的每行的數(shù)據(jù)集合
                    double centerPoint = Math.abs(centers.get(i).get(j));
                    double filed = Math.abs(fileds.get(j));
                    //根據(jù)類聚的公式先對[(h-k)/(h+k)]2累加
                    currentDistance += Math.pow((centerPoint - filed) / (centerPoint + filed), 2);
                }
                //循環(huán)找出距離該記錄最接近的中心點(diǎn)的ID
                if(currentDistance<minDistance){
                    minDistance = currentDistance;
                    centerIndex = i;
                }
            }
            //以中心點(diǎn)為Key 將記錄原樣輸出
            context.write(new IntWritable(centerIndex+1), value);
        }
    }

reduce

    //利用reduce的歸并功能以中心為Key將記錄歸并到一起
    public static class KMreduce extends Reducer<IntWritable, Text, Text, Text>{

          /**
            * 1.Key為聚類中心的ID value為該中心的記錄集合
            * 2.計數(shù)所有記錄元素的平均值，求出新的中心
            */
        
        protected void reduce(IntWritable key, Iterable<Text> values,
    Context context)throws IOException, InterruptedException {
             ArrayList<ArrayList<Double>> filedsList = new ArrayList<ArrayList<Double>>();
            //依次讀取記錄集，每行為一個ArrayList<Double>
             for(Iterator<Text> it = values.iterator();it.hasNext();){
                 ArrayList<Double> tempList = Utils.textToArray(it.next());
                 filedsList.add(tempList);
             }
             //計算新的中心
             //每行的元素個數(shù)
             int filedSize = filedsList.get(0).size();
             double[] avg = new double[filedSize];
             for(int i=0;i<filedSize;i++){
                //求每列的平均值
                 double sum = 0;
                 int size = filedsList.size();
                 for(int j=0;j<size;j++){
                     sum += filedsList.get(j).get(i);
                 }
                 avg[i] = sum/size;
             }
            context.write(new Text(""), new Text((Arrays.toString(avg).replace("[", "").replace("]", ""))));
        }
    }

最后是其中所用到的util類，主要是提供一些讀取文件和操作字符串的方法

public class Utils {
    
    //讀取中心文件的數(shù)據(jù)
    public static ArrayList<ArrayList<Double>> getCentersFromHDFS(String centersPath,boolean isDirectory)
                    throws IOException{
        ArrayList<ArrayList<Double>> result = new ArrayList<ArrayList<Double>>();
        Path path = new Path(centersPath);
        Configuration conf = new Configuration();
                  
        FileSystem fileSystem = path.getFileSystem(conf);
        
        if(isDirectory){    
            FileStatus[] listFile = fileSystem.listStatus(path);
            for (int i = 0; i < listFile.length; i++) {
                result.addAll(getCentersFromHDFS(listFile[i].getPath().toString(),false));
                }
            return result;
        }
        FSDataInputStream fsis = fileSystem.open(path);
        LineReader lineReader = new LineReader(fsis, conf);
        Text line = new Text();
          while(lineReader.readLine(line) > 0){
                      ArrayList<Double> tempList = textToArray(line);
                          result.add(tempList);
                      }
                      lineReader.close();
            return result;
    }
    
    //刪掉文件
     public static void deletePath(String pathStr) throws IOException{
                Configuration conf = new Configuration();
                Path path = new Path(pathStr);
                FileSystem hdfs = path.getFileSystem(conf);
                hdfs.delete(path ,true);
              }
     
     
     public static ArrayList<Double> textToArray(Text text){
          ArrayList<Double> list = new ArrayList<Double>();
          String[] fileds = text.toString().split("\t");
          for(int i=0;i<fileds.length;i++){
              list.add(Double.parseDouble(fileds[i]));
          }
          return list;
      }
     public static boolean compareCenters(String centerPath,String newPath) throws IOException{
                  
                  List<ArrayList<Double>> oldCenters = Utils.getCentersFromHDFS(centerPath,false);
                  List<ArrayList<Double>> newCenters = Utils.getCentersFromHDFS(newPath,true);
                  
                    int size = oldCenters.size();
                    int fildSize = oldCenters.get(0).size();
                    double distance = 0;
                    for(int i=0;i<size;i++){
                        for(int j=0;j<fildSize;j++){
                            double t1 = Math.abs(oldCenters.get(i).get(j));
                            double t2 = Math.abs(newCenters.get(i).get(j));
                            distance += Math.pow((t1 - t2) / (t1 + t2), 2);
                        }
                    }
                   
                   if(distance == 0.0){
                       //刪掉新的中心文件以便最后依次歸類輸出
                        Utils.deletePath(newPath);
                        return true;
                    }else{
                        //先清空中心文件，將新的中心文件復(fù)制到中心文件中，再刪掉中心文件
                        
                        Configuration conf = new Configuration();
                        Path outPath = new Path(centerPath);
                        FileSystem fileSystem = outPath.getFileSystem(conf);
                        
                        FSDataOutputStream overWrite = fileSystem.create(outPath,true);
                        overWrite.writeChars("");
                        overWrite.close();
                        Path inPath = new Path(newPath);
                        FileStatus[] listFiles = fileSystem.listStatus(inPath);
                        for (int i = 0; i < listFiles.length; i++) {                
                            FSDataOutputStream out = fileSystem.create(outPath);
                            FSDataInputStream in = fileSystem.open(listFiles[i].getPath());
                            IOUtils.copyBytes(in, out, 4096, true);
                        }
                        //刪掉新的中心文件以便第二次任務(wù)運(yùn)行輸出
                        Utils.deletePath(newPath);
                    }
                    return false;
                }
     
}

關于mapreduce中怎么實現K-M類聚問題的解答就分享到這里了，希望以上內容可以對大家有一定的幫助，如果你還有很多疑惑沒有解開，可以關注億速云行業資訊頻道了解更多相關知識。

向AI問一下細節

mapreduce中怎么實(shí)現(xiàn)K-M類聚

猜你喜歡

最新資訊

相關推薦

相關標簽