#! https://zhuanlan.zhihu.com/p/381295624

title: Java HDFS: Counting the Occurrences of Each Word
date: 2021-06-16 11:32:04
tags:

  • Hadoop
  • Java

Java HDFS: Counting the Occurrences of Each Word

Create a Java Maven project

  • Update pom.xml to add the Hadoop dependencies
    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-mapreduce-client-core -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-mapreduce-client-core</artifactId>
      <version>3.3.0</version>
    </dependency>

    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-client -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>3.3.0</version>
    </dependency>
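
    If the dependencies resolve correctly, the Hadoop classes are on the classpath. A minimal sanity check, purely illustrative and not part of the project files above, prints the Hadoop version reported by the client libraries:

    package com.meekou;

    import org.apache.hadoop.util.VersionInfo;

    public class HadoopVersionCheck {
        public static void main(String[] args) {
            // Should print 3.3.0 if the hadoop-client dependency resolved correctly.
            System.out.println("Hadoop version: " + VersionInfo.getVersion());
        }
    }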
  • Add WordsCountMapper.java
    package com.meekou;

    import java.io.IOException;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;

    public class WordsCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);

        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            // Each input value is one line of text; split it into words and emit (word, 1).
            String line = value.toString();
            String[] words = line.split(" ");
            for (String word : words) {
                context.write(new Text(word), one);
            }
        }
    }
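
    The mapper above splits on a single space, so consecutive spaces or tabs would produce empty "words". A slightly more defensive variant of the map body, offered only as a sketch rather than part of the original code, splits on any run of whitespace and skips blank tokens:

    // Inside map(): split on any run of whitespace and ignore empty tokens.
    String[] words = value.toString().trim().split("\\s+");
    for (String word : words) {
        if (!word.isEmpty()) {
            context.write(new Text(word), one);
        }
    }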

  • Add WordsCountReducer.java
    package com.meekou;

    import java.io.IOException;

    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Reducer;

    public class WordsCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> counts,
                Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
            // Sum the 1s emitted by the mapper for this word and write (word, total).
            int total = 0;
            for (IntWritable val : counts) {
                total += val.get();
            }
            context.write(key, new IntWritable(total));
        }
    }
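
    Because the reduce step is a simple sum (associative and commutative), the same class can also be registered as a combiner so partial sums are computed on the map side before the shuffle. This is optional; a sketch of the one-line addition to the job setup in App.java (shown in the next step):

    // In the driver, after setReducerClass:
    job.setCombinerClass(WordsCountReducer.class);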

  • Update App.java
    package com.meekou;

    import java.io.IOException;
    import java.net.URL;
    import java.net.URLDecoder;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

    /**
     * Driver for the word count job.
     */
    public final class App {
        private App() {
        }

        /**
         * Configures and runs the MapReduce job that counts each word in wordcount.txt.
         * @param args The arguments of the program.
         * @throws IOException
         * @throws InterruptedException
         * @throws ClassNotFoundException
         */
        public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
            try {
                System.out.println("Words Count Start!");
                // On Windows, hadoop.home.dir can be pointed at a local Hadoop distribution if needed.
                //System.setProperty("hadoop.home.dir", "C:/Users/xx/Meekou/Meekou.hadoop/hadoop-3.2.2.tar/hadoop-3.2.2");
                // Use the wordcount.txt bundled under src/main/resources as the job input.
                URL url = App.class.getClassLoader().getResource("wordcount.txt");
                Path inputPath = new Path(URLDecoder.decode(url.getFile(), "UTF-8"));
                Configuration config = new Configuration();
                Job job = Job.getInstance(config, "calculatewordscount");
                job.setJarByClass(App.class);
                job.setMapperClass(WordsCountMapper.class);
                job.setReducerClass(WordsCountReducer.class);
                job.setOutputKeyClass(Text.class);
                job.setOutputValueClass(IntWritable.class);
                FileInputFormat.addInputPath(job, inputPath);
                // Write the output to a "result" folder next to the input file; it must not already exist.
                FileOutputFormat.setOutputPath(job, new Path(inputPath.getParent().toString() + "/result"));
                boolean completed = job.waitForCompletion(true);
                System.out.println(completed ? "Words Count Complete!" : "Words Count Failed!");
            } catch (Exception e) {
                System.out.println(e);
            }
        }
    }
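
    The driver above reads its input from the bundled resource file, which is convenient for a local test run. To run the same job on a cluster with HDFS paths passed on the command line, a sketch of an alternative main could look like the following (the args[0]/args[1] convention is an assumption, not part of the original article):

    public static void main(String[] args) throws Exception {
        Configuration config = new Configuration();
        Job job = Job.getInstance(config, "calculatewordscount");
        job.setJarByClass(App.class);
        job.setMapperClass(WordsCountMapper.class);
        job.setCombinerClass(WordsCountReducer.class);
        job.setReducerClass(WordsCountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // args[0] = HDFS input path, args[1] = HDFS output path (must not exist yet).
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Exit with a non-zero status if the job fails so scripts can detect it.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }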
  • Create a wordcount.txt file under the resources folder with the following content

    tom jack mary
    rose anly billo anly
    billo mary zoor
    zoor poly
  • Run the project and check the result
    anly 2
    billo 2
    jack 1
    mary 2
    poly 1
    rose 1
    tom 1
    zoor 2
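
    The counts above come from the part-r-00000 file inside the result folder. To read that output programmatically instead of opening the file by hand, a minimal sketch using the Hadoop FileSystem API (the result-directory argument is an assumption matching the driver above):

    package com.meekou;

    import java.io.BufferedReader;
    import java.io.InputStreamReader;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class ReadResult {
        public static void main(String[] args) throws Exception {
            Configuration config = new Configuration();
            // Works for the local file system here; on a cluster, fs.defaultFS would point at HDFS.
            Path output = new Path(args[0] + "/part-r-00000");
            FileSystem fs = output.getFileSystem(config);
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(output)))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    System.out.println(line); // each line is "word<TAB>count"
                }
            }
        }
    }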