传参关键代码:
// Load the job parameters from the configuration file.
// This must happen before the Job is created (the Job is expected to take its
// own copy of the Configuration at construction -- see the Hadoop Job docs).
conf.addResource("hadoop-bigdata.xml");

// Read the raw pattern strings; fall back to an empty string when a key is absent.
filterUrl = conf.get("FilterUrlString", "");
keepUrl = conf.get("KeepUrlString", "");

// Re-publish the values under the keys that are later read back via
// context.getConfiguration().get(...).
conf.set("KeepUrl", keepUrl);
conf.set("FilterUrl", filterUrl);
//获取参数
String fstr=context.getConfiguration().get("FilterUrl");
String kstr=context.getConfiguration().get("KeepUrl");
package org.apache.hadoop.examples;import java.io.IOException;import java.util.StringTokenizer;import java.util.regex.Matcher;import java.util.regex.Pattern;import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Mapper;import org.apache.hadoop.mapreduce.Reducer;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;public class FilterUrl { public static class FilterUrlMap extends Mapper
需要从配置文件获取的参数:
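<property>
  <name>KeepUrlString</name>
  <value>anjueke.com|soufun.com</value>
</property>
<property>
  <name>FilterUrlString</name>
  <value>.js|.jpg|.jpeg|.gif|.png|.css|error.html</value>
</property>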