#! https://zhuanlan.zhihu.com/p/381295624

title: Java HDFS: Counting the Occurrences of Each Word
date: 2021-06-16 11:32:04
tags:

  • Hadoop
  • Java

Java HDFS: Counting the Occurrences of Each Word

Create a Java Maven project

  • Update pom.xml to add the Hadoop dependencies
    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-mapreduce-client-core -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-mapreduce-client-core</artifactId>
      <version>3.3.0</version>
    </dependency>

    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-client -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>3.3.0</version>
    </dependency>
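
    If the dependencies resolve correctly, the Hadoop classes are on the classpath. A minimal sanity check, purely illustrative and not part of the project files above, prints the Hadoop version reported by the client libraries:

    package com.meekou;

    import org.apache.hadoop.util.VersionInfo;

    public class HadoopVersionCheck {
        public static void main(String[] args) {
            // Should print 3.3.0 if the hadoop-client dependency resolved correctly.
            System.out.println("Hadoop version: " + VersionInfo.getVersion());
        }
    }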
  • Add WordsCountMapper.java
    package com.meekou;

    import java.io.IOException;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;

    public class WordsCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);

        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            // Each input value is one line of text; split it into words and emit (word, 1).
            String line = value.toString();
            String[] words = line.split(" ");
            for (String word : words) {
                context.write(new Text(word), one);
            }
        }
    }
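
    The mapper above splits on a single space, so consecutive spaces or tabs would produce empty "words". A slightly more defensive variant of the map body, offered only as a sketch rather than part of the original code, splits on any run of whitespace and skips blank tokens:

    // Inside map(): split on any run of whitespace and ignore empty tokens.
    String[] words = value.toString().trim().split("\\s+");
    for (String word : words) {
        if (!word.isEmpty()) {
            context.write(new Text(word), one);
        }
    }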

  • Add WordsCountReducer.java
    package com.meekou;

    import java.io.IOException;

    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Reducer;

    public class WordsCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> counts,
                Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
            // Sum the 1s emitted by the mapper for this word and write (word, total).
            int total = 0;
            for (IntWritable val : counts) {
                total += val.get();
            }
            context.write(key, new IntWritable(total));
        }
    }
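
    Because the reduce step is a simple sum (associative and commutative), the same class can also be registered as a combiner so partial sums are computed on the map side before the shuffle. This is optional; a sketch of the one-line addition to the job setup in App.java (shown in the next step):

    // In the driver, after setReducerClass:
    job.setCombinerClass(WordsCountReducer.class);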

  • Update App.java
    package com.meekou;

    import java.io.IOException;
    import java.net.URL;
    import java.net.URLDecoder;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

    /**
     * Driver for the word count job.
     */
    public final class App {
        private App() {
        }

        /**
         * Configures and runs the MapReduce job that counts each word in wordcount.txt.
         * @param args The arguments of the program.
         * @throws IOException
         * @throws InterruptedException
         * @throws ClassNotFoundException
         */
        public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
            try {
                System.out.println("Words Count Start!");
                // On Windows, hadoop.home.dir can be pointed at a local Hadoop distribution if needed.
                //System.setProperty("hadoop.home.dir", "C:/Users/xx/Meekou/Meekou.hadoop/hadoop-3.2.2.tar/hadoop-3.2.2");
                // Use the wordcount.txt bundled under src/main/resources as the job input.
                URL url = App.class.getClassLoader().getResource("wordcount.txt");
                Path inputPath = new Path(URLDecoder.decode(url.getFile(), "UTF-8"));
                Configuration config = new Configuration();
                Job job = Job.getInstance(config, "calculatewordscount");
                job.setJarByClass(App.class);
                job.setMapperClass(WordsCountMapper.class);
                job.setReducerClass(WordsCountReducer.class);
                job.setOutputKeyClass(Text.class);
                job.setOutputValueClass(IntWritable.class);
                FileInputFormat.addInputPath(job, inputPath);
                // Write the output to a "result" folder next to the input file; it must not already exist.
                FileOutputFormat.setOutputPath(job, new Path(inputPath.getParent().toString() + "/result"));
                boolean completed = job.waitForCompletion(true);
                System.out.println(completed ? "Words Count Complete!" : "Words Count Failed!");
            } catch (Exception e) {
                System.out.println(e);
            }
        }
    }
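
    The driver above reads its input from the bundled resource file, which is convenient for a local test run. To run the same job on a cluster with HDFS paths passed on the command line, a sketch of an alternative main could look like the following (the args[0]/args[1] convention is an assumption, not part of the original article):

    public static void main(String[] args) throws Exception {
        Configuration config = new Configuration();
        Job job = Job.getInstance(config, "calculatewordscount");
        job.setJarByClass(App.class);
        job.setMapperClass(WordsCountMapper.class);
        job.setCombinerClass(WordsCountReducer.class);
        job.setReducerClass(WordsCountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // args[0] = HDFS input path, args[1] = HDFS output path (must not exist yet).
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Exit with a non-zero status if the job fails so scripts can detect it.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }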
  • Create a wordcount.txt file under the resources folder with the following content

    tom jack mary
    rose anly billo anly
    billo mary zoor
    zoor poly
  • Run the project and check the result
    anly 2
    billo 2
    jack 1
    mary 2
    poly 1
    rose 1
    tom 1
    zoor 2
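
    The counts above come from the part-r-00000 file inside the result folder. To read that output programmatically instead of opening the file by hand, a minimal sketch using the Hadoop FileSystem API (the result-directory argument is an assumption matching the driver above):

    package com.meekou;

    import java.io.BufferedReader;
    import java.io.InputStreamReader;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class ReadResult {
        public static void main(String[] args) throws Exception {
            Configuration config = new Configuration();
            // Works for the local file system here; on a cluster, fs.defaultFS would point at HDFS.
            Path output = new Path(args[0] + "/part-r-00000");
            FileSystem fs = output.getFileSystem(config);
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(output)))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    System.out.println(line); // each line is "word<TAB>count"
                }
            }
        }
    }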