windows 本地构建hadoop-spark运行环境(hadoop-2.6, spark2.0)
            
            
                    
                        标签:问题   source   evel   http   group   udf   cti   lease   .sql   
- 下载hadoop
 
- http://hadoop.apache.org/releases.html --> http://mirrors.tuna.tsinghua.edu.cn/apache/hadoop/common/hadoop-2.6.5/hadoop-2.6.5.tar.gz
 
- 安装hadoop,配置HADOOP_HOME, 把${HADOOP_HOME}/bin放到path
 
- 下载spark
 
- http://spark.apache.org/downloads.html --> https://d3kbcqa49mib13.cloudfront.net/spark-2.0.2-bin-hadoop2.6.tgz 注意与hadoop版本匹配
 
- 安装,配置SPARK_HOME,把${SPARK_HOME}/bin放到path
 
- 在运行spark程序时,会报找不到 winutils.exe
 
- 下载 https://github.com/srccodes/hadoop-common-2.2.0-bin.git ,把仓库中 bin 目录下的文件(winutils.exe 等)放到${HADOOP_HOME}/bin下
 
- 运行时设置本地运行即可
 
- spark样例:
 
 
LocalSparkContext.scala
 
import org.apache.spark.{SparkConf, SparkContext}
import org.scalatest._

/**
 * Mixin that provides each ScalaTest suite with a shared local SparkContext.
 *
 * A `local[2]` context is created once before the suite's tests run and
 * stopped once after they all finish. `@transient` keeps the context out of
 * any accidental serialization of the suite instance.
 */
trait LocalSparkContext extends BeforeAndAfterAll {
    self: Suite =>

    // Shared context; assigned in beforeAll, cleared in afterAll.
    @transient var sc: SparkContext = _

    override def beforeAll(): Unit = {
        // Run any earlier stacked traits' setup first (ScalaTest convention).
        super.beforeAll()
        val conf = new SparkConf()
                .setMaster("local[2]")  // two local worker threads
                .setAppName("test")
        sc = new SparkContext(conf)
    }

    override def afterAll(): Unit = {
        try {
            if (sc != null) {
                sc.stop()
            }
            // Drop the reference so a stopped context is never reused.
            sc = null
        } finally {
            // Always let stacked traits tear down, even if stop() throws.
            super.afterAll()
        }
    }
}
 
 
 
SparkWCSuit.scala
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.util.LongAccumulator
import org.scalatest.FunSuite
import tool.LocalSparkContext
import algos.{MergedPCtr, PCtrUtils}

/** Word-count smoke test running on the local SparkContext provided by [[tool.LocalSparkContext]]. */
class SparkWCSuit extends FunSuite with LocalSparkContext {

    // RDD word count: "a","b","b" -> (a,1), (b,2)
    test("test rdd wc") {
        sc.setLogLevel("ERROR")
        val words = sc.makeRDD(Seq("a", "b", "b"))
        val counts = words
                .map(w => (w, 1))
                .reduceByKey(_ + _)
                .collect()
                .sorted
        assert(counts === Array(("a", 1), ("b", 2)))
    }
}
 
 
 
build.sbt
name := "doc_rank"

version := "1.0"

scalaVersion := "2.10.5"

// Spark artifacts must match both the Scala binary version (_2.10) and the
// Hadoop version the cluster uses (here: Spark 2.0.2 built for Hadoop 2.6).
libraryDependencies += "org.apache.spark" % "spark-core_2.10" % "2.0.2"

libraryDependencies += "org.apache.spark" % "spark-mllib_2.10" % "2.0.2"

libraryDependencies += "commons-cli" % "commons-cli" % "1.2"

// Breeze numerical libraries (%% appends the Scala binary version).
libraryDependencies ++= Seq(
    "org.scalanlp" %% "breeze" % "0.11.2",
    "org.scalanlp" %% "breeze-natives" % "0.11.2",
    "org.scalanlp" %% "breeze-viz" % "0.11.2"
)

// CDH-packaged Hadoop/HBase artifacts; resolved from the Cloudera repo below.
libraryDependencies ++= Seq(
    "org.apache.hadoop" % "hadoop-core" % "2.6.0-mr1-cdh5.4.4",
    "org.apache.hbase" % "hbase-client" % "1.0.0-cdh5.4.4",
    "org.apache.hbase" % "hbase-common" % "1.0.0-cdh5.4.4",
    "org.apache.hbase" % "hbase-server" % "1.0.0-cdh5.4.4",
    "org.apache.hbase" % "hbase-protocol" % "1.0.0-cdh5.4.4"
)

// NOTE: the stray trailing ';' after each URL in the original (HTML-escape
// residue) was removed — inside Seq(...) it is a syntax error.
resolvers += "Akka Repository" at "http://repo.akka.io/releases/"

resolvers += "cloudera-repo-releases" at "https://repository.cloudera.com/artifactory/repo/"

resolvers ++= Seq(
    "Sonatype Snapshots" at "https://oss.sonatype.org/content/repositories/snapshots/",
    "Sonatype Releases" at "https://oss.sonatype.org/content/repositories/releases/"
)
 
 
 
 
- hadoop样例
 
        
目录结构:
src/
├── main
│   ├── java
│   │   ├── io
│   │   │   └── longwind
│   │   │       └── mapreduce
│   │   │           ├── main
│   │   │           │   └── Main.java
│   │   │           ├── mapreduce
│   │   │           │   └── InfoidUniquer.java
│   │   │           └── utils
│   │   │               ├── Constant.java
│   │   │               └── HadoopUtils.java
│   │   └── org
│   │       └── apache
│   │           └── hadoop
│   │               ├── io
│   │               │   └── nativeio
│   │               │       └── NativeIO.java
│   │               └── mapred
│   │                   ├── ClientCache.java
│   │                   ├── ClientServiceDelegate.java
│   │                   ├── NotRunningJob.java
│   │                   ├── ResourceMgrDelegate.java
│   │                   ├── YarnClientProtocolProvider.java
│   │                   └── YARNRunner.java
│   └── resources
│       └── log4j.properties
└── test
    ├── java
    │   └── test
    └── resources
        └── log4j.properties
 
pom.xml中关键依赖
org.apache.hadoop
hadoop-common
2.6.0-cdh5.4.4
 
org.apache.hadoop
hadoop-mapreduce-client-core
2.6.0-cdh5.4.4
 
org.apache.hadoop
hadoop-mapreduce-client-common
2.6.0-cdh5.4.4
 
代码方面:
上面目录结构显示的org.apache.hadoop.* 那些是从hadoop源码包里拷出来的,注意是2.6.0-cdh5.4.4版本的
程序运行起来报错access0,如果是NativeIO.java 那应该是权限问题,需要手动修改NativeIO.java 中的
 
/**
 * Patched NativeIO.access for local Windows debugging: unconditionally grants
 * access instead of delegating to the native access0 check, working around the
 * "access0" UnsatisfiedLink/permission error. For local development only —
 * this disables the real file-permission check.
 */
public static boolean access(String path, AccessRight desiredAccess)throws IOException {
    return true;// after the patch: always grant access
    //return access0(path, desiredAccess.accessRight());// original (pre-patch) native check
}
 
这样,就能在windows本地,轻松进行hadoop, spark开发调试了,顺便吐槽一下mrunit不是很给力,问题一般是版本,包冲突,权限。
 
参考:
- 平野大荒 http://www.cnblogs.com/tq03/p/5101916.html --windows上的mapreduce运行环境
 
- 在前进的路上 http://blog.csdn.net/congcong68/article/details/42043093 -- access0 问题解决
 
- xuweimdm http://blog.csdn.net/u011513853/article/details/52865076 -- spark在windows上
 
 
windows 本地构建hadoop-spark运行环境(hadoop-2.6, spark2.0)
标签:问题   source   evel   http   group   udf   cti   lease   .sql   
原文地址:http://www.cnblogs.com/longwind09/p/7681102.html
                    
             
            
            
            
            
            
                                
评论