# Build and deployment of Spark 2 on Docker
These are the steps to follow to simulate an Apache Spark cluster on a single computer.
## Prerequisites
1. Install Docker CE: `https://docs.docker.com/engine/installation/`
2. Install Docker Compose: `https://docs.docker.com/compose/install/`
3. About 2 GB of RAM available for this cluster
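To verify the first two prerequisites, both commands below should print a version string (the exact versions will differ):
```
docker --version
docker-compose --version
```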
## Building the Spark image
```
docker build spark-2 --tag spark-2:2.1.0
```
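To check that the build succeeded, list the image (the output columns depend on your Docker version):
```
# The spark-2 repository should show a 2.1.0 tag
docker images spark-2
```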
## Cluster creation and start (1 master, 1 worker)
```
docker-compose up
```
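If you would rather keep the shell free, the same cluster can be started in detached mode, for example:
```
# Start the containers in the background instead of streaming their logs
docker-compose up -d
# Both spark-master and spark-worker-1 should be listed as Up
docker-compose ps
```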
## Word-count example on generated data
Open a new shell and execute these commands:
```
# On the host: find the master container's ID and open a shell in it
export mastercont=`docker ps | grep spark-master | cut -f1 -d' '`
docker exec -ti ${mastercont} /bin/bash
# Inside the container: start the Python Spark shell
pyspark
# At the pyspark prompt: run the word-count script, then leave the shell
execfile('/root/wc.py')
exit()
# Back in the container shell: return to the host
exit
```
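Alternatively, the script can be submitted non-interactively from the host; a sketch, assuming the revised `wc.py` shown below (which creates its own `SparkContext` when run outside the pyspark shell):
```
# Submit the word-count job to the cluster without opening a shell
docker exec ${mastercont} spark-submit --master spark://spark-master:7077 /root/wc.py
```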
To look at the computation tasks and stages while an application is running, point your browser to: `http://localhost:4040`
To look at the cluster and its workers, point your browser to: `http://localhost:8080`
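A command-line check of the master UI is also possible (assuming `curl` is installed on the host):
```
# An HTTP 200 response means the master web UI is up
curl -sI http://localhost:8080
```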
## Interactive Spark session with the Python shell
```
# On the host: find the master container's ID and open a shell in it
export mastercont=`docker ps | grep spark-master | cut -f1 -d' '`
docker exec -ti ${mastercont} /bin/bash
# Inside the container: start the interactive Python Spark shell
pyspark
```
(Execute `exit()` to leave the shell.)
## Cluster stop and restart
```
docker-compose stop
docker-compose start
```
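To remove the cluster altogether (the containers and the network, but not the image), run:
```
# Stop and delete everything created by docker-compose up
docker-compose down
```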
version: "2.1"
services:
spark-master:
image: spark-2:2.1.0
command: bin/spark-class org.apache.spark.deploy.master.Master -h spark-master
networks:
sparknet:
ipv4_address: "173.17.2.2"
hostname: spark-master
environment:
MASTER: spark://spark-master:7077
SPARK_CONF_DIR: /conf
SPARK_PUBLIC_DNS: 173.17.2.2
expose:
- 7001
- 7002
- 7003
- 7004
- 7005
- 7006
- 7077
ports:
- 4040:4040
- 6066:6066
- 7077:7077
- 8080:8080
volumes:
- ./conf/spark-master:/conf
- ./data:/tmp/data
spark-worker-1:
image: spark-2:2.1.0
command: bin/spark-class org.apache.spark.deploy.worker.Worker spark://spark-master:7077
networks:
sparknet:
ipv4_address: "173.17.2.3"
hostname: spark-worker-1
environment:
SPARK_CONF_DIR: /conf
SPARK_PUBLIC_DNS: 173.17.2.3
SPARK_WORKER_CORES: 2
SPARK_WORKER_MEMORY: 1g
SPARK_WORKER_PORT: 8881
SPARK_WORKER_WEBUI_PORT: 8081
links:
- spark-master
expose:
- 7012
- 7013
- 7014
- 7015
- 7016
ports:
- 8081:8081
volumes:
- ./conf/spark-worker-1:/conf
- ./data:/tmp/data
networks:
sparknet:
driver: bridge
driver_opts:
com.docker.network.enable_ipv6: "false"
ipam:
driver: default
config:
- subnet: "173.17.2.0/24"
gateway: "173.17.2.1"
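A quick way to confirm that the worker has registered with the master is to search the Compose logs from the directory holding `docker-compose.yml` (the exact wording of the log line may vary with the Spark version):
```
# Look for the worker's registration message
docker-compose logs spark-worker-1 | grep -i registered
```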
## spark-2/Dockerfile
```
FROM ubuntu:17.10

RUN apt-get update
RUN apt-get install -y curl software-properties-common apt-utils
RUN apt-get install -y openjdk-8-jdk
RUN java -version

ARG SPARK_ARCHIVE=http://d3kbcqa49mib13.cloudfront.net/spark-2.1.0-bin-hadoop2.7.tgz
RUN curl -s $SPARK_ARCHIVE | tar -xz -C /usr/local/

RUN apt-get install -y python

ENV SPARK_HOME /usr/local/spark-2.1.0-bin-hadoop2.7
ENV PATH $PATH:$SPARK_HOME/bin

# Word-count script and data
COPY wc.py /root/
COPY wc.txt /root/

EXPOSE 4040 6066 7077 8080

WORKDIR $SPARK_HOME
```
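To sanity-check the image on its own, Spark's version banner can be printed without starting a cluster, for example:
```
# Run a throwaway container and print the Spark version (should report 2.1.0)
docker run --rm spark-2:2.1.0 spark-submit --version
```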
## spark-2/wc.py
```
from pyspark import SparkConf, SparkContext
import re

# Inside the pyspark shell `sc` already exists; create a SparkContext
# only when the script runs standalone (e.g. via spark-submit)
try:
    sc
except NameError:
    sc = SparkContext(conf=SparkConf().setAppName("wordcount"))

# Split the text into words, pair each word with 1, then sum per word
words = re.split(r'\W+', open("/root/wc.txt").read())
wordsRDD = sc.parallelize(words)
countsRDD = wordsRDD.map(lambda w: (w, 1)).reduceByKey(lambda a, b: a + b)
wc = countsRDD.collect()
print(wc)
```