Windows10
Ubuntu Kylin 16.04
Java8
Hadoop-2.7.1
Hive1.2.2
IDEA 2020.2.3
Pycharm 2021.1.3
Eclipse3.8
1.1 統計四大一線城市房價的最值:通過MapReduce對最值、排序、TopN、自定義分區排序、二次排序、自定義類、占比等8個方面進行統計分析
二手房房價的最值是體現一個城市經濟的重要因素,也是顧客購買的衡量因素之一。
Driver端:
/**
 * Driver for the "MaxMinTotalPriceByCity" MapReduce job.
 *
 * <p>Wires {@code MaxMinTotalPriceByCityMapper} and {@code MaxMinTotalPriceByCityReducer}
 * into one job that reads {@code datas/tb_house.txt} and writes its result to
 * {@code MapReduce/out/MaxMinTotalPriceByCity}.
 */
public class MaxMinTotalPriceByCityDriver {

    /**
     * Configures the job, submits it and blocks until it finishes.
     *
     * <p>The JVM exits with status 0 when the job succeeds and 1 when it fails, so a
     * failed job is visible to the calling shell or scheduler.
     *
     * @param args ignored; the input and output paths are fixed
     * @throws IOException            propagated from job setup or submission
     * @throws ClassNotFoundException propagated from {@code Job.waitForCompletion}
     * @throws InterruptedException   propagated from {@code Job.waitForCompletion}
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "MaxMinTotalPriceByCity");
        job.setJarByClass(MaxMinTotalPriceByCityDriver.class);

        job.setMapperClass(MaxMinTotalPriceByCityMapper.class);
        job.setReducerClass(MaxMinTotalPriceByCityReducer.class);

        // The mapper emits (Text, IntWritable); the reducer emits (Text, Text).
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.setInputPaths(job, new Path("datas/tb_house.txt"));
        FileOutputFormat.setOutputPath(job, new Path("MapReduce/out/MaxMinTotalPriceByCity"));

        // Report the job outcome through the process exit code instead of discarding it.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
public class MaxMinTotalPriceByCityMapper extends Mapper
Reducer端:
public class MaxMinTotalPriceByCityReducer extends Reducer{
@Override
protected void reduce(Text key, Iterablevalues, Context context) throws IOException, InterruptedException {
ListtotalList = new ArrayList();
Iteratoriterator = values.iterator();
while (iterator.hasNext()) {
totalList.add(iterator.next().get());
}
Collections.sort(totalList);
int max = totalList.get(totalList.size() - 1);
int min = totalList.get(0);
Text outv = new Text();
outv.set("房子總價(jià)大、小值分別為:" + String.valueOf(max) + "萬元," + String.valueOf(min) + "萬元");
context.write(key, outv);
}
}
結果:
二手房的數量是了解房子基本情況的維度之一,數量的多少在一定程度上體現了房子的受歡迎度。
Driver端:
/**
 * Driver for the "HouseCntByCity" MapReduce job.
 *
 * <p>Runs {@code HouseCntByCityMapper} and {@code HouseCntByCityReducer} against the
 * HDFS instance at {@code hdfs://node01:9000}, partitioning the map output with
 * {@code CityPartitioner} across four reduce tasks.
 */
public class HouseCntByCityDriver {

    /** Input path used when no paths are given on the command line. */
    private static final String DEFAULT_INPUT = "/input/datas/tb_house.txt";

    /** Output path used when no paths are given on the command line. */
    private static final String DEFAULT_OUTPUT = "/output/HouseCntByCity";

    /**
     * Configures the job, submits it and blocks until it finishes. The JVM exits with
     * status 0 on success and 1 on failure.
     *
     * @param args optional {@code <input> <output>} pair; with any other argument count
     *             the built-in default paths are used
     * @throws Exception propagated from job setup, submission or execution
     */
    public static void main(String[] args) throws Exception {
        // Honour explicit paths instead of unconditionally overwriting them.
        boolean pathsGiven = args != null && args.length == 2;
        String input = pathsGiven ? args[0] : DEFAULT_INPUT;
        String output = pathsGiven ? args[1] : DEFAULT_OUTPUT;

        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://node01:9000");

        Job job = Job.getInstance(conf, "HouseCntByCity");
        job.setJarByClass(HouseCntByCityDriver.class);

        job.setMapperClass(HouseCntByCityMapper.class);
        job.setReducerClass(HouseCntByCityReducer.class);

        // Both the map output and the final output are (Text, IntWritable).
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // NOTE(review): four reduce tasks presumably match the partitions produced by
        // CityPartitioner (one per city) — confirm against that class.
        job.setPartitionerClass(CityPartitioner.class);
        job.setNumReduceTasks(4);

        FileInputFormat.addInputPath(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));

        // Report the job outcome through the process exit code instead of discarding it.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
Mapper端:
public class HouseCntByCityMapper extends Mapper
Reducer端:
public class HouseCntByCityReducer extends Reducer{
@Override
protected void reduce(Text key, Iterablevalues, Context context) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) sum += val.get();
context.write(key, new IntWritable(sum));
}
}
二手房的信息發布時間是了解房子基本情況的維度之一,在一定程度上,顧客傾向于最新的房源信息。
Driver端:
public class AcessHousePubTimeSortDriver {
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration()
Job job = Job.getInstance(conf, "AcessHousePubTimeSort");
job.setJarByClass(AcessHousePubTimeSortDriver.class);
job.setMapperClass(AcessHousePubTimeSortMapper.class);
job.setReducerClass(AcessHousePubTimeSortReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.setInputPaths(job, new Path("datas/tb_house.txt"));
FileOutputFormat.setOutputPath(job, new Path("MapReduce/out/AcessHousePubTimeSort"));
job.waitForCompletion(true);
}
}
Mapper端:
public class AcessHousePubTimeSortMapper extends Mapper
Reducer端:
public class AcessHousePubTimeSortReducer extends Reducer{
@Override
protected void reduce(Text key, Iterablevalues, Context context) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) sum += val.get();
context.write(key, new IntWritable(sum));
}
}
TopN是MapReduce分析最常見且必不可少的一個例子。
Driver端:
/**
 * Driver for the "TotalPriceTop5ByCity" MapReduce job.
 *
 * <p>Runs {@code TotalPriceTop5ByCityMapper} and {@code TotalPriceTop5ByCityReducer}
 * with a single reduce task.
 */
public class TotalPriceTop5ByCityDriver {

    /** Input path used when the program is started without arguments. */
    private static final String DEFAULT_INPUT = "datas/tb_house.txt";

    /** Output path used when the program is started without arguments. */
    private static final String DEFAULT_OUTPUT = "MapReduce/out/TotalPriceTop5ByCity";

    /**
     * Configures the job, submits it and blocks until it finishes. The JVM exits with
     * status 0 on success, 1 on job failure and 2 on a malformed command line.
     *
     * @param args either empty (the default paths are used) or exactly
     *             {@code <in> <out>}
     * @throws Exception propagated from job setup, submission or execution
     */
    public static void main(String[] args) throws Exception {
        // Fall back to the defaults only when nothing was supplied; previously the
        // arguments were always overwritten, which made the usage check unreachable.
        if (args == null || args.length == 0) {
            args = new String[] { DEFAULT_INPUT, DEFAULT_OUTPUT };
        }
        if (args.length != 2) {
            System.err.println("Usage: TotalPriceTop5ByCity <in> <out>");
            System.exit(2);
        }

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(TotalPriceTop5ByCityDriver.class);

        job.setMapperClass(TotalPriceTop5ByCityMapper.class);
        job.setReducerClass(TotalPriceTop5ByCityReducer.class);

        // The mapper emits (Text, IntWritable); the reducer emits (Text, Text).
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setNumReduceTasks(1);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
Mapper端:
public class TotalPriceTop5ByCityMapper extends Mapper
Reducer端:
public class TotalPriceTop5ByCityReducer extends Reducer{
private Text outv = new Text();
private int len = 0;
@Override
protected void reduce(Text key, Iterablevalues, Context context) throws IOException, InterruptedException {
ListtotalPriceList = new ArrayList();
Iteratoriterator = values.iterator();
while (iterator.hasNext()) {
totalPriceList.add(iterator.next().get());
}
Collections.sort(totalPriceList);
int size = totalPriceList.size();
String top5Str = "二手房總價(jià)Top5:";
for (int i = 1; i<= 5; i++) {
if (i == 5) {
top5Str += totalPriceList.get(size - i) + "萬元";
} else {
top5Str += totalPriceList.get(size - i) + "萬元, ";
}
}
outv.set(String.valueOf(top5Str));
context.write(key, outv);
}
}
自定義分區全排序可以實現不同于以往的排序方式,展示效果與默認全排序相比可以體現出一定的差別。
public class TotalOrderingPartition extends Configured implements Tool {
static class SimpleMapper extends Mapper
…
…
…
某些時候按照一個字段的排序方式并不能讓我們滿意,二次排序則是解決這個問題的一個方法。
Driver端:
Mapper端:
Reducer端:
某些字段通過MapReduce不可以直接統計得到,這時采用自定義類的方式便可以做到。
自定義類:
/**
 * Writable value object pairing a textual {@code info} field with an integer count.
 *
 * <p>{@link #toString()} treats {@code info} as {@code "<city>-<position>"} and renders
 * the bean as {@code city#[position]#cnt}.
 *
 * <p>Both fields are serialized, {@code info} first and {@code cnt} second; the order in
 * {@link #write(DataOutput)} and {@link #readFields(DataInput)} must stay in sync.
 */
public class HouseCntByPositionTopListBean implements Writable {

    private Text info = new Text();
    private IntWritable cnt = new IntWritable();

    /** @return the combined info text */
    public Text getInfo() {
        return info;
    }

    /** @param info the combined info text */
    public void setInfo(Text info) {
        this.info = info;
    }

    /** @return the count */
    public IntWritable getCnt() {
        return cnt;
    }

    /** @param cnt the count */
    public void setCnt(IntWritable cnt) {
        this.cnt = cnt;
    }

    /**
     * Restores both fields from {@code in}, in the order they were written.
     *
     * @param in the stream to read from
     * @throws IOException if reading fails
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        // Fresh objects are used so the method also works after a setter stored null.
        Text restoredInfo = new Text();
        restoredInfo.readFields(in);
        this.info = restoredInfo;
        this.cnt = new IntWritable(in.readInt());
    }

    /**
     * Serializes {@code info} followed by {@code cnt}.
     *
     * <p>Previously only {@code cnt} was written, so {@code info} was lost whenever the
     * bean was serialized and read back.
     *
     * @param out the stream to write to
     * @throws IOException if writing fails
     */
    @Override
    public void write(DataOutput out) throws IOException {
        info.write(out);
        out.writeInt(cnt.get());
    }

    /**
     * Renders the bean as {@code city#[position]#cnt}, splitting {@code info} at its
     * first {@code '-'}. When {@code info} contains no {@code '-'}, the whole text is
     * used as the city and the position is left empty.
     */
    @Override
    public String toString() {
        String infoStr = info.toString();
        int idx = infoStr.indexOf("-");

        String city;
        String position;
        if (idx < 0) {
            // No separator: avoid substring(0, -1), which would throw.
            city = infoStr;
            position = "";
        } else {
            city = infoStr.substring(0, idx);
            position = infoStr.substring(idx + 1);
        }
        return city + "#" + "[" + position + "]" + "#" + cnt;
    }
}
Driver端:
Mapper端:
Reducer端:
占比分析同樣是MapReduce統計分析的一大常用方式。
Driver端:
/**
 * Driver for the "TagRatioByCity" MapReduce job.
 *
 * <p>Wires {@code TagRatioByCityMapper} and {@code TagRatioByCityReducer} into one job.
 */
public class TagRatioByCityDriver {

    /** Input path used when no paths are given on the command line. */
    private static final String DEFAULT_INPUT = "datas/tb_house.txt";

    /** Output path used when no paths are given on the command line. */
    private static final String DEFAULT_OUTPUT = "MapReduce/out/TagRatioByCity";

    /**
     * Configures the job, submits it and blocks until it finishes. The JVM exits with
     * status 0 on success and 1 on failure.
     *
     * @param args optional {@code <input> <output>} pair; with any other argument count
     *             the built-in default paths are used
     * @throws IOException            propagated from job setup or submission
     * @throws ClassNotFoundException propagated from {@code Job.waitForCompletion}
     * @throws InterruptedException   propagated from {@code Job.waitForCompletion}
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Honour explicit paths instead of unconditionally overwriting them.
        boolean pathsGiven = args != null && args.length == 2;
        String input = pathsGiven ? args[0] : DEFAULT_INPUT;
        String output = pathsGiven ? args[1] : DEFAULT_OUTPUT;

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(TagRatioByCityDriver.class);

        // The mapper emits (Text, IntWritable); the reducer emits (Text, Text).
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.setMapperClass(TagRatioByCityMapper.class);
        job.setReducerClass(TagRatioByCityReducer.class);

        FileInputFormat.addInputPath(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));

        // Report the job outcome through the process exit code instead of discarding it.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
Mapper端:
public class TagRatioByCityMapper extends Mapper
Reducer端:
public class TagRatioByCityReducer extends Reducer{
private Text outv = new Text();
private int sum = 0;
@Override
protected void reduce(Text key, Iterablevalues, Context context) throws IOException, InterruptedException {
DecimalFormat df = new DecimalFormat("0.00");
int cnt = 0;
for (IntWritable value : values) {
cnt += value.get();
}
String s = key.toString();
String format = "";
if (s.contains("上海")) {
sum = 2995;
format = df.format((double) cnt / sum * 100) + "%";
} else if (s.contains("北京")) {
sum = 2972;
format = df.format((double) cnt / sum * 100) + "%";
} else if (s.contains("廣州")) {
sum = 2699;
format = df.format((double) cnt / sum * 100) + "%";
} else {
sum = 2982;
format = df.format((double) cnt / sum * 100) + "%";
}
outv.set(format);
context.write(key, outv);
}
}
tp
2、數據及源代碼Github
Gitee
MapReduce統計分析過程需要比較細心,「根據二手房信息發布時間排序統計」這個涉及到Java中日期類SimpleDateFormat和Date的使用,需要慢慢調試得出結果;統計最值和占比的難度并不高,主要在于統計要計算的類別的數量和總數量,最后二者相除即可;二次排序和自定義類難度較高,但一步一步來還是可以實現的。
結束!
你是否還在尋找穩定的海外服務器提供商?創新互聯www.cdcxhl.cn海外機房具備T級流量清洗系統配攻擊溯源,準確流量調度確保服務器高可用性,企業級服務器適合批量采購,新人活動首月15元起,快前往官網查看詳情吧