Introducing MapReduce | 75
/** Specify the input and output locations to use for this MapReduce
job. These two arguments are passed on the command line at run
time: argument 0 is the HDFS input path and argument 1 is the HDFS
output path, respectively.
**/
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
/** Submit the job and wait for it to finish. The argument specifies
whether to print progress information to output. (true means to do
so.)
**/
job.waitForCompletion(true);
}
}
2. Map Class
/** Mapper for word count.
 * The base class Mapper is parameterized by
<in key type, in value type, out key type, out value type>. Thus,
this mapper takes (Text key, Text value) pairs and outputs (Text key,
LongWritable value) pairs. The input keys are assumed to be identifiers
for documents, i.e. the file offset number, which are ignored, and
the values to be the content of the documents. The output keys are words
found within each document, and the output values are the number of
times a word appeared within a document.
*/
public class WordCountMap extends Mapper<Text, Text, Text,
LongWritable> {
/** Regex pattern to find words (alphanumeric + _). */
final static Pattern WORD_PATTERN = Pattern.compile("\\w+");
/** Constant 1 as a LongWritable value. */
private final static LongWritable ONE = new LongWritable(1L);
/** Text object to store a word to write to output. */
private Text word = new Text();
/** Actual map function. Takes one document's text and emits key-value
pairs for each word found in the document.
@param key Document identifier (ignored), i.e. the file offset number.
@param value Text of the current document.
M04 Big Data Simplified XXXX 01.indd 75 5/10/2019 9:58:19 AM