Lab3_BigData-MapReduce
Lab3_BigData-MapReduce
MapReduce
Objective:
The objective of this lab is to implement a basic Word Count program using Hadoop MapReduce.
Students will go through the process of setting up a Hadoop project, defining dependencies, writing
Mapper and Reducer classes, running the job, and verifying the results.
Prerequisites:
− Java Development Environment: Ensure that you have Java installed on your machine, and the
Java development environment is set up.
− Apache Maven: Maven should be installed to manage the project build and
dependencies. Participants should have a basic understanding of Maven.
− Hadoop Installation: A Hadoop cluster or a local Hadoop installation should be available.
Hadoop binaries and configurations should be properly set up.
− Text Editor or IDE: Choose a text editor or integrated development environment (IDE) for
editing code and managing the project.
− Basic Understanding of Hadoop MapReduce: Participants should have a basic understanding of
the MapReduce programming model and its key components such as Mapper, Reducer, and
the overall workflow.
Note :
− Adjust paths based on your specific project setup.
− Ensure that you have the necessary permissions to perform the operations.
Lab Tasks:
1. Open your Java IDE and create a Maven project “WordCount”
2. Open the pom.xml and add the following dependencies
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- modelVersion is mandatory in every POM -->
<modelVersion>4.0.0</modelVersion>
<groupId>org.codenouhayla</groupId>
<artifactId>WordCount</artifactId>
<version>1.0-SNAPSHOT</version>
<properties>
<maven.compiler.source>8</maven.compiler.source>
<maven.compiler.target>8</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<dependencies>
<!-- Core Hadoop classes (Configuration, Writable types, file system) -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>3.2.2</version>
</dependency>
<!-- MapReduce client API (Mapper, Reducer, JobConf, JobClient) -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-core</artifactId>
<version>3.2.2</version>
</dependency>
</dependencies>
</project>
4. Create the WC_Mapper class and add the following code
package org.codenouhayla;
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

/**
 * Map phase of the word count job: splits each input line into tokens
 * and emits (token, 1) for every token found.
 */
public class WC_Mapper extends MapReduceBase
        implements Mapper<LongWritable, Text, Text, IntWritable> {

    // Constant count of 1 emitted for each word; reused to avoid allocation per token.
    private static final IntWritable one = new IntWritable(1);
    // Reusable output key holder (Hadoop Writables are mutable and may be recycled).
    private final Text word = new Text();

    /**
     * @param key      byte offset of the line in the input split (unused)
     * @param value    one line of input text
     * @param output   collector that receives (word, 1) pairs
     * @param reporter progress reporter (unused)
     */
    @Override
    public void map(LongWritable key,
                    Text value,
                    OutputCollector<Text, IntWritable> output,
                    Reporter reporter) throws IOException {
        // Default StringTokenizer splits on whitespace.
        StringTokenizer tokenizer = new StringTokenizer(value.toString());
        while (tokenizer.hasMoreTokens()) {
            word.set(tokenizer.nextToken());
            output.collect(word, one);
        }
    }
}
5. Create the WC_Reducer class and add the following code
package org.codenouhayla;
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;

/**
 * Reduce phase of the word count job: sums all the 1s emitted by the
 * mapper for a given word and emits (word, total).
 */
public class WC_Reducer extends MapReduceBase
        implements Reducer<Text, IntWritable, Text, IntWritable> {

    /**
     * @param key      the word
     * @param values   all counts emitted for this word by the mappers/combiners
     * @param output   collector that receives the final (word, total) pair
     * @param reporter progress reporter (unused)
     */
    @Override
    public void reduce(Text key,
                       Iterator<IntWritable> values,
                       OutputCollector<Text, IntWritable> output,
                       Reporter reporter) throws IOException {
        int sum = 0;
        while (values.hasNext()) {
            sum += values.next().get();
        }
        // Emit the aggregated count for this word.
        output.collect(key, new IntWritable(sum));
    }
}
6. Create the WC_Runner class and add the following code
package org.codenouhayla;
import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

/**
 * Driver for the word count job: configures the mapper, combiner and
 * reducer, wires up the input/output paths from the command line, and
 * submits the job to the cluster.
 */
public class WC_Runner {
    public static void main(String[] args) throws IOException {
        // Require both the input and output path arguments.
        if (args.length < 2) {
            System.err.println("Usage: WC_Runner <input path> <output path>");
            System.exit(-1);
        }

        JobConf jobConf = new JobConf(WC_Runner.class);
        jobConf.setJobName("WordCount");

        // Map, combine and reduce stages. The reducer doubles as the
        // combiner because summing counts is associative and commutative.
        jobConf.setMapperClass(WC_Mapper.class);
        jobConf.setCombinerClass(WC_Reducer.class);
        jobConf.setReducerClass(WC_Reducer.class);

        // The job's final output key/value types: (word, count).
        jobConf.setOutputKeyClass(Text.class);
        jobConf.setOutputValueClass(IntWritable.class);

        // Plain text in, plain text out.
        jobConf.setInputFormat(TextInputFormat.class);
        jobConf.setOutputFormat(TextOutputFormat.class);

        FileInputFormat.setInputPaths(jobConf, new Path(args[0]));
        FileOutputFormat.setOutputPath(jobConf, new Path(args[1]));

        // Submit the job and block until it finishes.
        JobClient.runJob(jobConf);
    }
}