Add hive-testbench #61

Open · wants to merge 3 commits into main
22 changes: 14 additions & 8 deletions hive/docker/Dockerfile
@@ -42,16 +42,9 @@ RUN apt-get -q update \
&& apt-get -q install -y --no-install-recommends \
keychain ssh openssh-server sudo

# Install python and packages dependencies
RUN apt-get -q update \
&& apt-get -q install -y --no-install-recommends \
python3-pip python3-venv python3-wheel vim

RUN pip3 install pandas sqlparse fastparquet pyarrow thrift

# install build essential for compiling within docker
RUN apt-get -q update \
&& apt-get -q install -y build-essential maven autoconf automake libtool vim
&& apt-get -q install -y build-essential maven autoconf automake libtool vim git unzip

# Install protobuf 2.5.0 for tez
ADD /protobuf-2.5.0.tar.gz /tmp
@@ -65,6 +58,19 @@ RUN cd /tmp/tez \
&& mkdir -p ${TEZ_HOME} \
&& tar -xvf tez-dist/target/tez-0.9.2-minimal.tar.gz -C ${TEZ_HOME}

# Install Python and package dependencies
RUN apt-get -q update \
&& apt-get -q install -y \
python3-dev python3-pip python3-venv python3-wheel \
libsasl2-dev libsasl2-2 libsasl2-modules-gssapi-mit

RUN pip3 install pandas sqlparse fastparquet pyarrow thrift thrift_sasl sasl pyhive

# Build hive-testbench
ADD /hive-testbench /tmp/hive-testbench
RUN cd /tmp/hive-testbench \
&& ./tpcds-build.sh

# Fix multiple SLF4J binding conflict.
RUN rm ${TEZ_HOME}/lib/slf4j-log4j12-1.7.10.jar
RUN rm ${HIVE_HOME}/lib/log4j-slf4j-impl-2.6.2.jar
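The libsasl2 libraries plus the thrift_sasl, sasl, and pyhive packages added above are what let Python clients reach the HiveServer2 instance that run_services.sh now starts. A minimal sketch of exercising that stack, not part of the PR; the container name (taken from DOCKER_NAME in build.sh), port 10000 (the HiveServer2 default), and the username are all assumptions:

# Sketch only: check the SASL-enabled Python Hive client stack baked into the image.
docker exec -i qflock-hive-2.3.8 python3 - <<'EOF'
from pyhive import hive  # PLAIN/SASL transport comes from thrift_sasl + sasl
conn = hive.Connection(host="localhost", port=10000, username="hive")
cur = conn.cursor()
cur.execute("SHOW DATABASES")
print(cur.fetchall())
EOF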
26 changes: 21 additions & 5 deletions hive/docker/build.sh
@@ -9,6 +9,10 @@ DOCKER_DIR=${ROOT_DIR}
DOCKER_FILE="${DOCKER_DIR}/Dockerfile"
DOCKER_NAME=qflock-hive-2.3.8

USER_NAME=${SUDO_USER:=$USER}
USER_ID=$(id -u "${USER_NAME}")
GROUP_ID=$(id -g "${USER_NAME}")

# Download Hadoop
ENV_HADOOP_VERSION=2.7.2
if [ ! -f ${DOCKER_DIR}/hadoop-${ENV_HADOOP_VERSION}.tar.gz ]
@@ -35,9 +39,10 @@ ENV_TEZ_VERSION=0.9.2
rm -rf tez
echo "Download tez source code and switch to branch-0.9.2"
git clone https://github.com/apache/tez.git
cd tez
pushd tez
git switch branch-0.9.2
git apply ../qflock-tez.patch
popd

# Download protobuf 2.5.0 required by tez
ENV_PROTOBUF_VERSION=2.5.0
@@ -47,13 +52,23 @@ then
curl -L https://github.com/google/protobuf/releases/download/v${ENV_PROTOBUF_VERSION}/protobuf-${ENV_PROTOBUF_VERSION}.tar.gz --output ${DOCKER_DIR}/protobuf-${ENV_PROTOBUF_VERSION}.tar.gz
fi

# Download hive-testbench
rm -rf hive-testbench
echo "Download hive-testbench for tpcds"
git clone https://github.com/hortonworks/hive-testbench.git
pushd hive-testbench
git switch hive14
# Remove a patch that is already applied, so it is not detected as a reverse patch
rm tpcds-gen/patches/all/tpcds_misspelled_header_guard.patch
git apply ../tpcds.patch
# Work around tpcds_kit.zip no longer being available for download;
# the kit must be requested from tpc.org
cp ../tpcds_kit.zip tpcds-gen/tpcds_kit.zip
popd

DOCKER_CMD="docker build -t ${DOCKER_NAME} --build-arg HADOOP_VERSION -f $DOCKER_FILE $DOCKER_DIR"
eval "$DOCKER_CMD"

USER_NAME=${SUDO_USER:=$USER}
USER_ID=$(id -u "${USER_NAME}")
GROUP_ID=$(id -g "${USER_NAME}")

# Set the home directory in the Docker container.
DOCKER_HOME_DIR=${DOCKER_HOME_DIR:-/home/${USER_NAME}}

@@ -70,6 +85,7 @@ WORKDIR "${DOCKER_HOME_DIR}"
RUN ssh-keygen -t rsa -P '' -f ~/.ssh/id_rsa
RUN cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
RUN chmod 0600 ~/.ssh/authorized_keys
RUN sudo chown -R ${USER_NAME}:${USER_NAME} /tmp/hive-testbench
UserSpecificDocker

popd
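Since the Hortonworks S3 download of TPCDS_Tools.zip is gone, build.sh relies on a locally staged tpcds_kit.zip (committed alongside this change). If that copy ever needs to be refreshed with a kit requested from tpc.org, a hedged sketch of the staging step (the source path is an assumption):

# Sketch only: stage a freshly requested TPC-DS kit where build.sh expects it,
# then rerun build.sh as usual.
cp /path/to/TPCDS_Tools.zip hive/docker/tpcds_kit.zip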
9 changes: 9 additions & 0 deletions hive/docker/run_services.sh
@@ -49,9 +49,18 @@ sleep 1
# show databases;
# show tables from tpcds;

sleep 1
echo "Start hiveserver2 ..."
nohup $HIVE_HOME/bin/hive --service hiveserver2 &>/tmp/hs2.log &

#echo "Generate tpcds test data for hive testbench ..."
cd /tmp
#./tpcds-setup.sh 2

echo "HADOOP_READY"
echo "HADOOP_READY" > /opt/volume/status/HADOOP_STATE
echo "RUNNING_MODE $RUNNING_MODE"
echo "please go inside hive docker and go to /tmp/hive-testbench and run ./tpcds-setup.sh <scale_size> to generate tpcds data"

if [ "$RUNNING_MODE" = "daemon" ]; then
sleep infinity
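For reference, a hedged sketch of the manual step the echo above asks for, run from the host once HADOOP_READY appears in the container status (the container name and scale factor 2 are assumptions; FORMAT=parquet is exported by start_hive.sh below, which hive-testbench's tpcds-setup.sh is expected to honor):

# Sketch only: generate TPC-DS data inside the running hive container.
docker exec -it qflock-hive-2.3.8 bash -c 'cd /tmp/hive-testbench && ./tpcds-setup.sh 2'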
15 changes: 15 additions & 0 deletions hive/docker/tpcds.patch
@@ -0,0 +1,15 @@
diff --git a/tpcds-gen/Makefile b/tpcds-gen/Makefile
index f61c2b2..bc8cb7d 100644
--- a/tpcds-gen/Makefile
+++ b/tpcds-gen/Makefile
@@ -9,8 +9,8 @@ target/tpcds_kit.zip: tpcds_kit.zip
cp tpcds_kit.zip target/tpcds_kit.zip

tpcds_kit.zip:
- curl http://dev.hortonworks.com.s3.amazonaws.com/hive-testbench/tpcds/README
- curl --output tpcds_kit.zip http://dev.hortonworks.com.s3.amazonaws.com/hive-testbench/tpcds/TPCDS_Tools.zip
+# curl http://dev.hortonworks.com.s3.amazonaws.com/hive-testbench/tpcds/README
+# curl --output tpcds_kit.zip http://dev.hortonworks.com.s3.amazonaws.com/hive-testbench/tpcds/TPCDS_Tools.zip

target/lib/dsdgen.jar: target/tools/dsdgen
cd target/; mkdir -p lib/; ( jar cvf lib/dsdgen.jar tools/ || gjar cvf lib/dsdgen.jar tools/ )
Binary file added hive/docker/tpcds_kit.zip
Binary file not shown.
4 changes: 4 additions & 0 deletions hive/start_hive.sh
@@ -4,7 +4,9 @@ set -e # exit on error
pushd "$(dirname "$0")" # connect to root

ROOT_DIR=$(pwd)
PARENT_DIR=$(dirname ${ROOT_DIR})
echo "ROOT_DIR ${ROOT_DIR}"
echo "PARENT_DIR ${PARENT_DIR}"

if [ -z "$1" ]
then
@@ -64,6 +66,7 @@ HADOOP_ROOT_LOGGER=WARN,DRFA

DOCKER_RUN="docker run --rm=true ${DOCKER_IT} \
-v ${ROOT_DIR}/data:/data \
-v ${PARENT_DIR}/benchmark:/opt/benchmark \
-v ${ROOT_DIR}/volume/namenode:/opt/volume/namenode \
-v ${ROOT_DIR}/volume/datanode0:/opt/volume/datanode \
-v ${ROOT_DIR}/volume/metastore:/opt/volume/metastore \
@@ -94,6 +97,7 @@ DOCKER_RUN="docker run --rm=true ${DOCKER_IT} \
-e RUNNING_MODE=${RUNNING_MODE} \
-e HADOOP_ROOT_LOGGER=${HADOOP_ROOT_LOGGER} \
-e HIVE_AUX_JARS_PATH=${HIVE_HOME}/lib \
-e FORMAT=parquet \
--network qflock-net-${DC} \
--name qflock-hive-${HIVE_VERSION} --hostname qflock-hive \
qflock-hive-${HIVE_VERSION}-${USER_NAME} ${CMD}"
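As a closing sanity check, again only a hedged sketch: once tpcds-setup.sh has finished, the generated databases should be visible over JDBC from inside the container (the container name is an assumption; $HIVE_HOME is resolved from the container's own environment and 10000 is the HiveServer2 default port):

# Sketch only: list databases through HiveServer2 after data generation.
docker exec -it qflock-hive-2.3.8 bash -c '"$HIVE_HOME"/bin/beeline -u jdbc:hive2://localhost:10000 -e "show databases;"'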