大数据 MapReduce基础实战

devtools/2024/11/29 7:45:27/

一、关于此次实践

1、实战简介

MapReduce是Hadoop的核心功能之一,掌握它对学习Hadoop至关重要。Hadoop Map/Reduce是一个使用简易的软件框架,基于它写出来的应用程序能够运行在由上千个商用机器组成的大型集群上,并以一种可靠容错的方式并行处理上T级别的数据集。

本章我们来通过几个示例来学习MapReduce的用法。

2、全部任务

二、实践详解

1、第 1 关:成绩统计

命令行
touch file01
echo Hello World Bye World
cat file01
echo Hello World Bye World >file01
cat file01
touch file02
echo Hello Hadoop Goodbye Hadoop >file02
cat file02
start-dfs.sh
hadoop fs -mkdir /usr
hadoop fs -mkdir /usr/input
hadoop fs -ls /usr/output
hadoop fs -ls /
hadoop fs -ls /usr
hadoop fs -put file01 /usr/input
hadoop fs -put file02 /usr/input
hadoop fs -ls /usr/input
import java.io.IOException;
import java.util.StringTokenizer;import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;public class WordCount {/********** Begin **********///Mapper函数public static class TokenizerMapper extends Mapper<LongWritable, Text, Text, IntWritable> {private final static IntWritable one = new IntWritable(1);private Text word = new Text();private int maxValue = 0;public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {StringTokenizer itr = new StringTokenizer(value.toString(),"\n");while (itr.hasMoreTokens()) {String[] str = itr.nextToken().split(" ");String name = str[0];one.set(Integer.parseInt(str[1]));word.set(name);context.write(word,one);}//context.write(word,one);}}public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {private IntWritable result = new IntWritable();public void reduce(Text key, Iterable<IntWritable> values, Context context)throws IOException, InterruptedException {int maxAge = 0;int age = 0;for (IntWritable intWritable : values) {maxAge = Math.max(maxAge, intWritable.get());}result.set(maxAge);context.write(key, result);}}public static void main(String[] args) throws Exception {Configuration conf = new Configuration();Job job = new Job(conf, "word count");job.setJarByClass(WordCount.class);job.setMapperClass(TokenizerMapper.class);job.setCombinerClass(IntSumReducer.class);job.setReducerClass(IntSumReducer.class);job.setOutputKeyClass(Text.class);job.setOutputValueClass(IntWritable.class);String inputfile = "/user/test/input";String outputFile = "/user/test/output/";FileInputFormat.addInputPath(job, new Path(inputfile));FileOutputFormat.setOutputPath(job, new Path(outputFile));job.waitForCompletion(true);/********** End **********/}
}

2、第 2 关:文件内容合并去重

import java.io.IOException;import java.util.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;public class Merge {/*** @param args* 对A,B两个文件进行合并,并剔除其中重复的内容,得到一个新的输出文件C*///在这重载map函数,直接将输入中的value复制到输出数据的key上 注意在map方法中要抛出异常:throws IOException,InterruptedExceptionpublic static class Map  extends Mapper<Object, Text, Text, Text>{/********** Begin **********/public void map(Object key, Text value, Context content) throws IOException, InterruptedException {  Text text1 = new Text();Text text2 = new Text();StringTokenizer itr = new StringTokenizer(value.toString());while (itr.hasMoreTokens()) {text1.set(itr.nextToken());text2.set(itr.nextToken());content.write(text1, text2);}}  /********** End **********/} //在这重载reduce函数,直接将输入中的key复制到输出数据的key上  注意在reduce方法上要抛出异常:throws IOException,InterruptedExceptionpublic static class  Reduce extends Reducer<Text, Text, Text, Text> {/********** Begin **********/public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {Set<String> set = new TreeSet<String>();for(Text tex : values){set.add(tex.toString());}for(String tex : set){context.write(key, new Text(tex));}}  /********** End **********/}public static void main(String[] args) throws Exception{// TODO Auto-generated method stubConfiguration conf = new Configuration();conf.set("fs.default.name","hdfs://localhost:9000");Job job = Job.getInstance(conf,"Merge and duplicate removal");job.setJarByClass(Merge.class);job.setMapperClass(Map.class);job.setCombinerClass(Reduce.class);job.setReducerClass(Reduce.class);job.setOutputKeyClass(Text.class);job.setOutputValueClass(Text.class);String inputPath = "/user/tmp/input/";  //在这里设置输入路径String outputPath = "/user/tmp/output/";  //在这里设置输出路径FileInputFormat.addInputPath(job, new Path(inputPath));FileOutputFormat.setOutputPath(job, new Path(outputPath));System.exit(job.waitForCompletion(true) ? 0 : 1);}}

3、第 3 关:信息挖掘 - 挖掘父子关系

import java.io.IOException;
import java.util.*;import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;public class simple_data_mining {public static int time = 0;/*** @param args* 输入一个child-parent的表格* 输出一个体现grandchild-grandparent关系的表格*///Map将输入文件按照空格分割成child和parent,然后正序输出一次作为右表,反序输出一次作为左表,需要注意的是在输出的value中必须加上左右表区别标志public static class Map extends Mapper<Object, Text, Text, Text>{public void map(Object key, Text value, Context context) throws IOException,InterruptedException{/********** Begin **********/String line = value.toString();String[] childAndParent = line.split(" ");List<String> list = new ArrayList<>(2);for (String childOrParent : childAndParent) {if (!"".equals(childOrParent)) {list.add(childOrParent);} } if (!"child".equals(list.get(0))) {String childName = list.get(0);String parentName = list.get(1);String relationType = "1";context.write(new Text(parentName), new Text(relationType + "+"+ childName + "+" + parentName));relationType = "2";context.write(new Text(childName), new Text(relationType + "+"+ childName + "+" + parentName));}/********** End **********/}}public static class Reduce extends Reducer<Text, Text, Text, Text>{public void reduce(Text key, Iterable<Text> values,Context context) throws IOException,InterruptedException{/********** Begin **********///输出表头if (time == 0) {context.write(new Text("grand_child"), new Text("grand_parent"));time++;}//获取value-list中value的child
List<String> grandChild = new ArrayList<>();//获取value-list中value的parentList<String> grandParent = new ArrayList<>();//左表,取出child放入grand_childfor (Text text : values) {String s = text.toString();String[] relation = s.split("\\+");String relationType = relation[0];String childName = relation[1];String parentName = relation[2];if ("1".equals(relationType)) {grandChild.add(childName);} else {grandParent.add(parentName);}}//右表,取出parent放入grand_parentint grandParentNum = grandParent.size();int grandChildNum = grandChild.size();if (grandParentNum != 0 && grandChildNum != 0) {for (int m = 0; m < grandChildNum; m++) {for (int n = 0; n < grandParentNum; n++) {//输出结果context.write(new Text(grandChild.get(m)), new Text(grandParent.get(n)));}}}/********** End **********/}}public static void main(String[] args) throws Exception{// TODO Auto-generated method stubConfiguration conf = new Configuration();Job job = Job.getInstance(conf,"Single table join");job.setJarByClass(simple_data_mining.class);job.setMapperClass(Map.class);job.setReducerClass(Reduce.class);job.setOutputKeyClass(Text.class);job.setOutputValueClass(Text.class);String inputPath = "/user/reduce/input";   //设置输入路径String outputPath = "/user/reduce/output";   //设置输出路径FileInputFormat.addInputPath(job, new Path(inputPath));FileOutputFormat.setOutputPath(job, new Path(outputPath));System.exit(job.waitForCompletion(true) ? 0 : 1);}
}


http://www.ppmy.cn/devtools/137864.html

相关文章

定长子串中元音的最大数目

力扣链接:1456. 定长子串中元音的最大数目 - 力扣&#xff08;LeetCode&#xff09; 给你字符串 s 和整数 k 。 请返回字符串 s 中长度为 k 的单个子字符串中可能包含的最大元音字母数。 英文中的 元音字母 为&#xff08;a, e, i, o, u&#xff09;。 示例1: 输入&#xf…

分布式MQTT代理中使用布隆过滤器管理通配符主题

论文标题&#xff1a;Wildcard Topic Management using Bloom Filter in Distributed MQTT Brokers 中文标题&#xff1a;分布式MQTT代理中使用布隆过滤器管理通配符主题 作者信息&#xff1a; Ryohei Banno&#xff0c;Hitotsubashi University, Graduate School of Social…

计算机网络的发展

目录 起源与早期发展 ARPANET与TCP/IP协议的诞生 互联网的诞生与普及 高速互联网与无线网络的兴起 移动互联网与云计算的崛起 物联网、区块链与自动驾驶技术的兴起 起源与早期发展 计算机网络的雏形最早可以追溯到20世纪60年代&#xff0c;主要是为了共享大型计算资源。当…

力扣2978. 对称坐标

一、数据 2978. 对称坐标 表&#xff1a; Coordinates ------------------- | Column Name | Type | ------------------- | X | int | | Y | int | ------------------- 每一行包括 X 和 Y&#xff0c;都是整数。表格可能包含重复值。如果两个坐标 (…

共享售卖机语音芯片方案选型:WTN6020引领智能化交互新风尚

在共享经济蓬勃发展的今天&#xff0c;共享售卖机作为便捷购物的新形式&#xff0c;正逐步渗透到人们生活的各个角落。为了提升用户体验&#xff0c;增强设备的智能化和互动性&#xff0c;增加共享售卖机的语音功能就显得尤为重要。 共享售卖机语音方案选型&#xff1a; WTN602…

微知-lspci访问到指定的PCIe设备的几种方式?(lspci -s bus;lspci -d devices)

通过bdf号查看 -s &#xff08;bus&#xff09; lspci -s 03:00.0通过vendor id或者device id等设备查看 -d &#xff08;device&#xff09; lspci -d 15b3: #这里是vendor号&#xff0c;所以在前面 lspci -d :1021 #这里是设备号&#xff0c;所以要:在前vendorid和deviceid…

Xilinx Blockset Gateway In 和Gateway out模块使用及参数配置

目录 一、Gateway InSimulink数据到System Generator数据的转换Gateway BlocksBlock Parameters&#xff08;模块参数&#xff09;Basic选项卡参数Implementation选项卡参数 二、Gateway OutGateway BlocksBlock Parameters&#xff08;模块参数&#xff09;Basic选项卡参数Imp…

软文实战技巧:如何利用媒体平台资源提升品牌影响力?

在数字化时代&#xff0c;媒体资源不仅是品牌传播的重要工具&#xff0c;更是提升品牌影响力的关键所在。借助媒体的力量&#xff0c;品牌可以更广泛地触达目标受众&#xff0c;更精准地传递品牌理念&#xff0c;更有效地与消费者建立情感连接。 而软文作为一种低成本、高效率…