在有些情况下,我们需要把不同类别的数据分别输出到不同的指定路径,这时就需要自定义(重写)OutputFormat。
测试数据(部分):
1374609798.19 1374609798.20 1374609798.20 1374609798.51 110 5 8615103869897 460029934830160 3559380454939260 2 460 0 14443 15406 10.184.49.172 220.181.112.82 55351 80 6 cmnet 1 221.177.233.5 221.177.217.145 221.177.233.6 221.177.217.155 mobads-logs.baidu.com http://mobads-logs.baidu.com/ad.log?url2=nH0drHn_PjRsrasvnWc3PHnvQjczrjc_nW0sQjc_nasYmW6kmhubn7qWTZc_PAc3nyFhuj0_RLK-mv-9U7P8whqzRy-dTv9GQZP4UyFGmy3_FBmkPBmhn0&extra2=nj0snjDsnj0snj0snj0snisznjD1njTzPj0Ynj0sn0&rnd=93528145.000000 NOKIA6120ci/UCWEB8.9.0.253/28/999 GET 200 1217 366 1 3 0 0 1 3 0 0 0 0 http://mobads-logs.baidu.com/ad.log?url2=nH0drHn_PjRsrasvnWc3PHnvQjczrjc_nW0sQjc_nasYmW6kmhubn7qWTZc_PAc3nyFhuj0_RLK-mv-9U7P8whqzRy-dTv9GQZP4UyFGmy3_FBmkPBmhn0&extra2=nj0snjDsnj0snj0snj0snisznjD1njTzPj0Ynj0sn0&rnd=93528145.000000 5903897840525807627 5903904035903819787 5915956
1374609778.91 1374609779.15 1374609779.15 1374609779.15 134 591 8615103869897 460029934830160 3559380454939260 2 460 0 14443 15406 10.184.49.172 111.13.12.15 55390 80 6 cmnet 1 221.177.233.5 221.177.217.145 221.177.233.6 221.177.217.155 m.baidu.com http://m.baidu.com/bd_page_type=1/pu=sz%40240%5F320%2Cta%40middle%5F%5F3%2E1%5F1%5F8%2E9%2Cusm%400/uid=DF457D5FC05AAC3ECA12096D8BBFB663/t=wap/w=0_10_%E8%89%B2%E7%9C%AF%E7%9C%AF/ssid=0/from=2001a/l=0/tc?func=nextp&pi=2&m=128&pn=11&src=http%3A%2F%2Fwww%2E0901s%2Ecom%2Farticlelist%2F%3F23%2D7%2Ehtml NOKIA6120ci/UCWEB8.9.0.253/28/999 GET 200 1693 4080 4 6 0 0 4 6 0 0 0 0 http://m.baidu.com/bd_page_type=1/pu=sz%40240%5F320%2Cta%40middle%5F%5F3%2E1%5F1%5F8%2E9%2Cusm%400/uid=DF457D5FC05AAC3ECA12096D8BBFB663/t=wap/w=0_10_%E8%89%B2%E7%9C%AF%E7%9C%AF/ssid=0/from=2001a/l=0/tc?func=nextp&pi=2&m=128&pn=11&src=http%3A%2F%2Fwww%2E09 5903897840525807627 5903904043347443723 5915956
1374609732.67 1374609741.82 1374609741.82 1374609741.82 134 591 8615103869897 460029934830160 3559380454939260 2 460 0 14443 15406 10.184.49.172 111.13.12.15 46666 80 6 cmnet 1 221.177.233.5 221.177.217.145 221.177.233.6 221.177.217.155 m.baidu.com http://m.baidu.com/bd_page_type=1/pu=sz%40240%5F320%2Cta%40middle%5F%5F3%2E1%5F1%5F8%2E9%2Cusm%400/uid=DF457D5FC05AAC3ECA12096D8BBFB663/t=wap/w=0_10_%E8%89%B2%E7%9C%AF%E7%9C%AF/ssid=0/from=2001a/l=0/tc?pn=11&m=128&src=www%2E0901s%2Ecom%2Farticle%2F%3F5616%2Ehtml NOKIA6120ci/UCWEB8.9.0.253/28/999 GET 200 1614 1091 3 3 0 0 3 3 0 0 0 0 http://m.baidu.com/bd_page_type=1/pu=sz%40240%5F320%2Cta%40middle%5F%5F3%2E1%5F1%5F8%2E9%2Cusm%400/uid=DF457D5FC05AAC3ECA12096D8BBFB663/t=wap/w=0_10_%E8%89%B2%E7%9C%AF%E7%9C%AF/ssid=0/from=2001a/l=0/tc?pn=11&m=128&src=www%2E0901s%2Ecom%2Farticle%2F%3F5616 5903897840525807627 5903903844056297483 5915956
1374609771.39 1374609771.57 1374609771.57 1374609771.57 134 591 8615103869897 460029934830160 3559380454939260 2 460 0 14443 15406 10.184.49.172 111.13.12.15 40314 80 6 cmnet 1 221.177.233.5 221.177.217.145 221.177.233.6 221.177.217.155 m.baidu.com http://m.baidu.com/bd_page_type=1/pu=sz%40240%5F320%2Cta%40middle%5F%5F3%2E1%5F1%5F8%2E9%2Cusm%400/uid=DF457D5FC05AAC3ECA12096D8BBFB663/t=wap/w=0_10_%E8%89%B2%E7%9C%AF%E7%9C%AF/ssid=0/from=2001a/l=0/tc?pn=11&m=128&src=www%2E0901s%2Ecom%2Farticlelist%2F%3F23%2D7%2Ehtml NOKIA6120ci/UCWEB8.9.0.253/28/999 GET 200 1660 5832 4 7 0 0 4 7 0 0 0 0 http://m.baidu.com/bd_page_type=1/pu=sz%40240%5F320%2Cta%40middle%5F%5F3%2E1%5F1%5F8%2E9%2Cusm%400/uid=DF457D5FC05AAC3ECA12096D8BBFB663/t=wap/w=0_10_%E8%89%B2%E7%9C%AF%E7%9C%AF/ssid=0/from=2001a/l=0/tc?pn=11&m=128&src=www%2E0901s%2Ecom%2Farticlelist%2F%3F 5903897840525807627 5903904010625413131 5915956
1374609798.85 1374609798.86 1374609798.86 1374609799.03 110 5 8615103869897 460029934830160 3559380454939260 2 460 0 14443 15406 10.184.49.172 220.181.112.82 40887 80 6 cmnet 1 221.177.233.5 221.177.217.145 221.177.233.6 221.177.217.155 mobads-logs.baidu.com http://mobads-logs.baidu.com/ad.log?url2=nHDLP1n_PHD4PBsvn16vnWcvQjcvPHR_nH0sQjc_nasYmW6kmhubn7qWTZc_PAc3nyFhuj0_RLK-mv-9U7P8whqzRy-dTv9GQZP4UyFGmy3_FBmkPBmhn0&extra2=nj0snjDsnj0snj0snj0snisznjD1njTzPj0Ynj0sn0&rnd=56776839.000000 NOKIA6120ci/UCWEB8.9.0.253/28/999 GET 200 1217 366 1 3 0 0 1 3 0 0 0 0 http://mobads-logs.baidu.com/ad.log?url2=nHDLP1n_PHD4PBsvn16vnWcvQjcvPHR_nH0sQjc_nasYmW6kmhubn7qWTZc_PAc3nyFhuj0_RLK-mv-9U7P8whqzRy-dTv9GQZP4UyFGmy3_FBmkPBmhn0&extra2=nj0snjDsnj0snj0snj0snisznjD1njTzPj0Ynj0sn0&rnd=56776839.000000 5903897840525807627 5903904128756236299 5915956
1374609777.43 1374609777.44 1374609777.44 1374609777.61 110 5 8615103869897 460029934830160 3559380454939260 2 460 0 14443 15406 10.184.49.172 220.181.112.82 44427 80 6 cmnet 1 221.177.233.5 221.177.217.145 221.177.233.6 221.177.217.155 mobads-logs.baidu.com http://mobads-logs.baidu.com/ad.log?url2=nHDYnHc_Pjb3rasvnHbYPjcsQjcdPH0_nW0sQjc_QHfdnHRsrjDsPzsYmW6kmhubn7qWTZc_PAc3nyFhuj0_n1csnDFjnYR4nDmkwjKarDNAPH0LwWFAnYRswRujPjn_TL-Vmh-9UBYkQA4Epv-9QHmknWKWpimhnHmhFWcvPBmh&__mobads_ta=mLwzrW0_mywJIgPYrW00&__mobads_qk=51eee16b6e10202ef7e2968dc4708590193811c6&exp_id=gd,zl,&extra2=nj0snjDsnj0snj0snj0snisznjD1njTzPj0Ynjcdnf&rnd=609569073 NOKIA6120ci/UCWEB8.9.0.253/28/999 GET 200 1365 366 1 3 0 0 1 3 0 0 0 0 http://mobads-logs.baidu.com/ad.log?url2=nHDYnHc_Pjb3rasvnHbYPjcsQjcdPH0_nW0sQjc_QHfdnHRsrjDsPzsYmW6kmhubn7qWTZc_PAc3nyFhuj0_n1csnDFjnYR4nDmkwjKarDNAPH0LwWFAnYRswRujPjn_TL-Vmh-9UBYkQA4Epv-9QHmknWKWpimhnHmhFWcvPBmh&__mobads_ta=mLwzrW0_mywJIgPYrW00&__mobads 5903897840525807627 5903904035903823883 5915956
1374609776.77 1374609776.78 1374609776.78 1374609777.07 110 5 8615103869897 460029934830160 3559380454939260 2 460 0 14443 15406 10.184.49.172 220.181.112.82 55105 80 6 cmnet 1 221.177.233.5 221.177.217.145 221.177.233.6 221.177.217.155 mobads-logs.baidu.com http://mobads-logs.baidu.com/ad.log?url2=nH0vnH6_PjRknasvnWc3PHnvQjczrjf_nW0sQjc_QHfdnHRsrjDsPzsYmW6kmhubn7qWTZc_PAc3nyFhuj0_n1csnDFjnYR4nDmkwjKarDNAPH0LwWFAnYRswRujPjn_TL-Vmh-9UBYkQA4Epv-9QHmknWKWpimhnHmhFWcvPBmh&__mobads_ta=mLwzrW0_mywJIgPYrW00&__mobads_qk=51eee16b6e10202ef7e2968dc4708590193811c6&exp_id=gd,zl,&extra2=nj0snjDsnj0snj0snj0snisznjD1njTzPj0Ynjcdnf&rnd=1685946172 NOKIA6120ci/UCWEB8.9.0.253/28/999 GET 200 1366 366 1 3 0 0 1 3 0 0 0 0 http://mobads-logs.baidu.com/ad.log?url2=nH0vnH6_PjRknasvnWc3PHnvQjczrjf_nW0sQjc_QHfdnHRsrjDsPzsYmW6kmhubn7qWTZc_PAc3nyFhuj0_n1csnDFjnYR4nDmkwjKarDNAPH0LwWFAnYRswRujPjn_TL-Vmh-9UBYkQA4Epv-9QHmknWKWpimhnHmhFWcvPBmh&__mobads_ta=mLwzrW0_mywJIgPYrW00&__mobads 5903897840525807627 5903903694712119307 5915956
1374609806.06 1374609806.10 1374609806.10 1374609807.38 110 4 8613526051568 460003760137902 8674910129582223 2 460 0 14254 2844 10.88.83.12 10.0.0.172 40793 80 6 cmwap 1 221.177.217.135 221.177.217.145 221.177.217.136 221.177.217.149 rc.dxsvr.com http://rc.dxsvr.com/get?dv=1.4.5&is=460003760137902&model=8150&op=46000&lp=1&locale=zh_CN&pkg=com.dianxinos.dxbb&net=2&tk=GVTQONILweFpMcxchEcj4g==&h=800&w=480&v=5003&ie=867491012958222&lc=D6PogbkVGkUYW1fJ&sdk=10&dpi=240&rv=1.1 Apache-HttpClient/UNAVAILABLE (java 1.4) GET 200 520 443 3 2 0 0 3 2 0 0 0 0 http://rc.dxsvr.com/get?dv=1.4.5&is=460003760137902&model=8150&op=46000&lp=1&locale=zh_CN&pkg=com.dianxinos.dxbb&net=2&tk=GVTQONILweFpMcxchEcj4g==&h=800&w=480&v=5003&ie=867491012958222&lc=D6PogbkVGkUYW1fJ&sdk=10&dpi=240&rv=1.1 5903899536962572299 5903904150061305867 5926819
1374609808.54 1374609808.61 1374609808.61 1374609809.38 110 362 8613592017377 460003796093791 8606460252897478 2 460 0 18737 26732 10.88.114.208 10.0.0.172 45268 80 6 cmwap 1 221.177.156.71 221.177.217.145 221.177.156.71 221.177.217.150 bufferfly.mqsng.qq.com http://bufferfly.mqsng.qq.com/analytics/upload POST 200 600 535 4 2 0 0 4 2 0 0 0 0 http://bufferfly.mqsng.qq.com/analytics/upload 5903903713112543243 5903904168439287819 5967156
1
数据库(部分)里的数据:
INSERT INTO `url_rule` VALUES ('http://timg01.baidu-1img.cn/timg?imagewise_list&size=w100&quality=60&sec=1374609621&di=66db0b184da63c1c76d87f8b243d07c9&src=http://i3.baidu.com/it/u=1069655089,3312248484&fm=21&gp=0.jpg', 'somecontent');
INSERT INTO `url_rule` VALUES ('http://timg01.baidu-2img.cn/timg?tc&size=w304&sec=1374609877&di=5da5f6be8f6aa1605fe035eacaafc235&imgtype=0&quality=80&src=http%3A%2F%2Fpics%2Ewajiw%2Ecom%2Fimg%5F02%2Fa%5F8%2F235097135%5F1%5F0%2Ejpg', 'somecontent');
INSERT INTO `url_rule` VALUES ('http://timg01.baidu-2img.cn/timg?tc&size=w304&sec=1374609877&di=b4d535480eb486257dbb05a0b9c986f3&imgtype=0&quality=80&src=http%3A%2F%2Fpics%2Ewajiw%2Ecom%2Fimg%5F02%2Fa%5F8%2F775906177%5F1%5F0%2Ejpg', 'somecontent');
INSERT INTO `url_rule` VALUES ('http://timg01.baidu-2img.cn/timg?tc&size=w304&sec=1374609877&di=da5b8374285c90c693ff1916e42b12f4&imgtype=0&quality=80&src=http%3A%2F%2Fpics%2Ewajiw%2Ecom%2Fimg%5F02%2Fa%5F8%2F518671053%5F1%5F0%2Ejpg', 'somecontent');
INSERT INTO `url_rule` VALUES ('http://wap.wapreach.com/upload/view/2013/06/20130621194639803.png', 'somecontent');
INSERT INTO `url_rule` VALUES ('http://www.17caifu.com/BestAugury/bazi/Images/2007_Q&A_job.gif', 'somecontent');
INSERT INTO `url_rule` VALUES ('http://house60.3g.qq.com/g/s?sid=AULxRr6syQFZz-P3wHdxoEZG&3G_UIN=1324479092&saveURL=0&aid=home_self&g_f=595', 'somecontent');
INSERT INTO `url_rule` VALUES ('http://m1.baidu.com/bd_page_type=1/pu=sz%40224%5F220%2Cta%40middle%5F%5F%5F%5F%2Cusm%400/uid=CBF7BD93377D6EDDE086B9855274FEA8/t=wap/w=0_10_Www%2ECcc36%2ECom/ssid=0/from=643e/l=0/tc?pn=11&m=0&src=www%2Eccc36%2Ecom%2Fqiangjian%2F20130118%2F133645%2Ehtml', 'somecontent');
INSERT INTO `url_rule` VALUES ('http://mobads-logs.baidu.com:80/ad.log?url2=nHD3n1R_PHcvrisdPWRLPWn4QjcvrHD_nW0sQjc_QHndPWmLPH63PisYmW6kmhubn7qWTZc_PAc3nyFhuj0_wbfdn1bvPW0Yf1RkwHuDrHNjfWKDn16vnYRknWcLfRn_TL-Vmh-9UBYkQA4Epv-9QHRzn1nhFWDvFBmzPHnhFBfb&__mobads_ta=mLwzrW0_mywJIgPYrW00&__mobads_qk=51eee0fcf01684baf7e2968db627d5d410ae827f&exp_id=gd,zl,&extra2=nj0snjDsnj0snj0snj0snisznjD1njTzPj0YnjDsn0&rnd=1386322966', 'somecontent');
INSERT INTO `url_rule` VALUES ('http://r3.sinaimg.cn/10170/2013/0723/8d/3/52441965/360x532x75x0.jpg', 'somecontent');
INSERT INTO `url_rule` VALUES ('http://r3.sinaimg.cn/10170/2013/0723/b6/e/53444068/360x532x75x0.jpg', 'somecontent');
代码:
重写OutputFormat:
package com.zsy.mr.logenhance;
import java.io.IOException;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Output format that splits records between two fixed HDFS files: records whose text
 * contains the marker {@code "NeedGrabBag"} go to the URL file, all others go to the
 * enhanced-log file.
 *
 * <p>NOTE(review): both destination paths are hard-coded and opened with
 * {@code FileSystem.create(Path)}, which overwrites an existing file by default. If the
 * job runs more than one task, the tasks would presumably clobber each other's output —
 * confirm the job is only ever run with a single task, or derive per-task file names.
 */
public class LogEnhanceOutPutFormat extends FileOutputFormat<Text, NullWritable> {

    /**
     * Opens both destination files and returns a writer that dispatches records to them.
     *
     * @param job the task attempt context supplying the Hadoop configuration
     * @return a record writer bound to the two opened output streams
     * @throws IOException if either output file cannot be created
     */
    @Override
    public RecordWriter<Text, NullWritable> getRecordWriter(TaskAttemptContext job)
            throws IOException, InterruptedException {
        FileSystem fs = FileSystem.get(job.getConfiguration());
        Path enHancePath = new Path("hdfs://hadoop01:9000/logenhance/echancelog/log.data");
        Path urlPath = new Path("hdfs://hadoop01:9000/logenhance/NeedGrabBag/url.data");

        FSDataOutputStream enHanceOs = fs.create(enHancePath);
        FSDataOutputStream urlOs;
        try {
            urlOs = fs.create(urlPath);
        } catch (IOException e) {
            // Do not leak the first stream when the second file cannot be created.
            try {
                enHanceOs.close();
            } catch (IOException ignored) {
                // The creation failure is the error worth reporting; drop the secondary one.
            }
            throw e;
        }
        return new LogEnhanceRecordWriter(enHanceOs, urlOs);
    }

    /** Writes each record to one of two streams depending on its content. */
    static class LogEnhanceRecordWriter extends RecordWriter<Text, NullWritable> {
        FSDataOutputStream enHanceOs = null;
        FSDataOutputStream urlOs = null;

        /** Creates a writer with no streams attached; {@link #write} must not be called on it. */
        public LogEnhanceRecordWriter() {
        }

        /**
         * @param enHanceOs stream receiving enhanced log records
         * @param urlOs     stream receiving records marked {@code "NeedGrabBag"}
         */
        public LogEnhanceRecordWriter(FSDataOutputStream enHanceOs, FSDataOutputStream urlOs) {
            this.enHanceOs = enHanceOs;
            this.urlOs = urlOs;
        }

        /**
         * Writes the key's bytes verbatim (no separator is added) to the stream selected by
         * its content; the value is ignored.
         *
         * @param key   the record text; routed to the URL stream if it contains "NeedGrabBag"
         * @param value unused
         * @throws IOException if the underlying stream write fails
         */
        @Override
        public void write(Text key, NullWritable value) throws IOException, InterruptedException {
            FSDataOutputStream target = key.toString().contains("NeedGrabBag") ? urlOs : enHanceOs;
            // Text stores UTF-8; write its backing bytes directly instead of re-encoding the
            // String with the platform default charset. Only the first getLength() bytes of
            // the backing array are valid.
            target.write(key.getBytes(), 0, key.getLength());
        }

        /**
         * Closes both streams. The second stream is closed even when closing the first fails.
         *
         * @param context unused
         * @throws IOException if closing either stream fails
         */
        @Override
        public void close(TaskAttemptContext context) throws IOException, InterruptedException {
            try {
                if (urlOs != null) {
                    urlOs.close();
                }
            } finally {
                if (enHanceOs != null) {
                    enHanceOs.close();
                }
            }
        }
    }
}
dbutils代码:
package com.zsy.mr.utils;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.HashMap;
import java.util.Map;
/**
 * JDBC helpers for loading the URL rule table from MySQL.
 */
public class DBUtils {

    /**
     * Opens a connection to the rule database using the hard-coded driver, URL and
     * credentials below.
     *
     * @return an open connection, or {@code null} if the driver class cannot be loaded or
     *         the connection attempt fails (the stack trace is printed in either case)
     */
    public static Connection getConn() {
        String driver = "com.mysql.jdbc.Driver";
        String url = "jdbc:mysql://192.168.31.11:3306/mytest";
        String username = "root";
        String password = "123456";
        Connection conn = null;
        try {
            Class.forName(driver); // load the JDBC driver through the class loader
            conn = DriverManager.getConnection(url, username, password);
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return conn;
    }

    /**
     * Reads every row of {@code url_rule} into a map of url to content.
     *
     * <p>This method is best-effort: if no connection can be obtained, or the query fails,
     * the problem is reported on stderr and whatever was read so far (possibly nothing) is
     * returned. It never returns {@code null}.
     *
     * @return map from the {@code url} column to the {@code content} column
     */
    public static Map<String, String> getUrlRuleData() {
        Map<String, String> result = new HashMap<String, String>(128);
        Connection connection = getConn();
        if (connection == null) {
            // getConn() already printed the cause; avoid a NullPointerException below.
            System.err.println("DBUtils: no database connection, returning empty url rule map");
            return result;
        }
        Statement statement = null;
        ResultSet resultSet = null;
        try {
            statement = connection.createStatement();
            String sql = "select url,content from url_rule";
            resultSet = statement.executeQuery(sql);
            while (resultSet.next()) {
                result.put(resultSet.getString(1), resultSet.getString(2));
            }
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            // Release in reverse order of acquisition; each close is independent of the others.
            closeQuietly(resultSet);
            closeQuietly(statement);
            closeQuietly(connection);
        }
        return result;
    }

    /** Closes the result set if non-null, printing (not propagating) any failure. */
    private static void closeQuietly(ResultSet resultSet) {
        if (resultSet != null) {
            try {
                resultSet.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
    }

    /** Closes the statement if non-null, printing (not propagating) any failure. */
    private static void closeQuietly(Statement statement) {
        if (statement != null) {
            try {
                statement.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
    }

    /** Closes the connection if non-null, printing (not propagating) any failure. */
    private static void closeQuietly(Connection connection) {
        if (connection != null) {
            try {
                connection.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
    }
}
mr代码:
package com.zsy.mr.logenhance;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import com.zsy.mr.utils.DBUtils;
/**
 * Map-only job that enhances log lines with content looked up by URL, and emits the URLs
 * that have no known content so they can be processed later.
 */
public class LogEnhance {

    /**
     * For each input line, looks up the URL field in the rule map. Lines with known content
     * are emitted with the content appended; otherwise the URL is emitted tagged with
     * "NeedGrabBag". Lines with too few fields are counted and skipped.
     */
    static class LogenhanceMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
        /** Zero-based index of the URL field in a whitespace-split log line. */
        private static final int URL_FIELD_INDEX = 26;

        Map<String, String> ruleMap = new HashMap<String, String>();
        Text k = new Text();
        NullWritable v = NullWritable.get();

        /** Loads the url-to-content rules from the database once per task. */
        @Override
        protected void setup(Mapper<LongWritable, Text, Text, NullWritable>.Context context)
                throws IOException, InterruptedException {
            ruleMap = DBUtils.getUrlRuleData();
        }

        /**
         * Emits one record per well-formed line. Each emitted key ends with a newline
         * because it is built that way here.
         *
         * @param key     input key, unused
         * @param value   one raw log line
         * @param context job context used for counters and output
         * @throws IOException          if writing the output record fails
         * @throws InterruptedException if the write is interrupted
         */
        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, NullWritable>.Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] strs = StringUtils.split(line);
            // Count malformed lines explicitly instead of catching every exception, so that
            // genuine write failures are not swallowed and miscounted as bad input.
            if (strs == null || strs.length <= URL_FIELD_INDEX) {
                context.getCounter("feifa", "feifaline").increment(1);
                return;
            }
            String url = strs[URL_FIELD_INDEX];
            String content = ruleMap.get(url);
            // No content for this URL: emit it as pending; otherwise emit the enhanced line.
            if (StringUtils.isBlank(content)) {
                k.set(url + "\t" + "NeedGrabBag" + "\n");
            } else {
                k.set(line + "\t" + content + "\n");
            }
            context.write(k, v);
        }
    }

    /**
     * Configures and runs the job.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} is the job output path
     * @throws Exception if job setup or execution fails
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            System.err.println("Usage: LogEnhance <input path> <output path>");
            System.exit(2);
        }
        Configuration conf = new Configuration();
        /*
         * conf.set("mapreduce.framework.name", "yarn");
         * conf.set("yarn.resourcemanager.hostname", "hadoop01");
         */
        Job job = Job.getInstance(conf);
        job.setJarByClass(LogEnhance.class);
        // Mapper class for this job; there is no reducer.
        job.setMapperClass(LogenhanceMapper.class);
        // Final output key/value types (also the map output types, since the job is map-only).
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        // A custom OutputFormat controls where each kind of record is written.
        job.setOutputFormatClass(LogEnhanceOutPutFormat.class);
        // Input directory of the job.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        // The output path is still required even with the custom OutputFormat, because the
        // _SUCCESS marker file is written there.
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Map-only job: no reduce tasks.
        job.setNumReduceTasks(0);
        // waitForCompletion is used rather than submit() so the job result is available.
        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}
结果:
url.data:
log.data:
outputPath下文件:
以上就是Hadoop自定义OutputFormat的全部内容。
转载自原文链接, 如需删除请联系管理员。
原文链接:hadoop入门8:自定义OutputFormat,根据需求数据输出不同的路径,转载请注明来源!