Merge pull request #589 from argonne-lcf/feature/Cerebras_R_2.4.0_updates

Feature/cerebras r 2.4.0 updates
wcarnold1010 authored Dec 20, 2024
2 parents 334b606 + 9262374 commit 978f4ff
Showing 3 changed files with 47 additions and 58 deletions.
9 changes: 5 additions & 4 deletions docs/ai-testbed/cerebras/customizing-environment.md
#### To make a PyTorch virtual environment for Cerebras

```console
mkdir ~/R_2.4.0
cd ~/R_2.4.0
# Note: "deactivate" does not actually work in scripts.
deactivate
rm -r venv_cerebras_pt
/software/cerebras/python3.8/bin/python3.8 -m venv venv_cerebras_pt
source venv_cerebras_pt/bin/activate
pip install --upgrade pip
pip install cerebras_pytorch==2.4.0
pip install --editable git+https://github.com/Cerebras/modelzoo#egg=cerebras_modelzoo 'murmurhash==1.0.10' 'thinc==8.2.2' 'cymem<2.0.10'
```
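As a quick sanity check that the venv mechanics behave as expected, the create-activate-deactivate cycle can be exercised with a throwaway environment. This is a generic sketch: `demo_venv` is a hypothetical name, not the Cerebras environment, and `--without-pip` only keeps the demo light.

```console
# Throwaway venv to illustrate the cycle; not the Cerebras wheel install
python3 -m venv --without-pip demo_venv
. demo_venv/bin/activate
python -c 'import sys; print(sys.prefix)'   # prints a path ending in demo_venv
deactivate
```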

<!--- No longer any TensorFlow wheel
To activate a virtual environment:

```console
source ~/R_2.4.0/venv_cerebras_pt/bin/activate
```

To deactivate a virtual environment, run `deactivate` in the shell session where it was activated.
60 changes: 30 additions & 30 deletions docs/ai-testbed/cerebras/example-programs.md
Make a working directory and local copies of the Cerebras **modelzoo** and **anl_shared** repositories, if not previously done, as follows.

```bash
mkdir ~/R_2.4.0
cd ~/R_2.4.0
git clone https://github.com/Cerebras/modelzoo.git
cd modelzoo
git tag
git checkout Release_2.4.0
```
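`git tag` lists the available release tags; checking one out leaves the working tree in detached-HEAD state at that release. The mechanics can be seen in isolation on a throwaway local repository (all names below are illustrative, not part of the modelzoo workflow):

```console
# Hypothetical throwaway repo demonstrating tag + checkout mechanics
git init -q tagdemo
git -C tagdemo -c user.name=demo -c user.email=demo@example.com \
    commit -q --allow-empty -m "release commit"
git -C tagdemo tag Release_2.4.0
git -C tagdemo checkout -q Release_2.4.0   # detached HEAD at the tag
git -C tagdemo describe --tags
```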
<!---
cp -r /software/cerebras/model_zoo/anl_shared/ ~/R_2.4.0/anl_shared
--->

<!---
To run Unet with the <a href="https://www.kaggle.com/c/severstal-steel-defect-de
First, source a Cerebras PyTorch virtual environment and make sure that requirements are installed.
```console
source ~/R_2.4.0/venv_cerebras_pt/bin/activate
pip install -r ~/R_2.4.0/modelzoo/requirements.txt
```
Then
```console
cd ~/R_2.4.0/modelzoo/src/cerebras/modelzoo/models/nlp/bert
cp /software/cerebras/dataset/severstal-steel-defect-detection/params_severstal_binary_rawds.yaml configs/params_severstal_binary_rawds.yaml
export MODEL_DIR=model_dir_unet
if [ -d "$MODEL_DIR" ]; then rm -Rf $MODEL_DIR; fi
python run.py CSX --job_labels name=unet_pt --params configs/params_severstal_binary_rawds.yaml --model_dir $MODEL_DIR --mode train --mount_dirs /home/ /software --python_paths /home/$(whoami)/R_2.4.0/modelzoo/ --compile_dir $(whoami) |& tee mytest.log
```
--->

The BraggNN model has two versions:<br>
```console
TODO
cd ~/R_2.4.0/anl_shared/braggnn/tf
# This yaml has a correct path to a BraggNN dataset
cp /software/cerebras/dataset/BraggN/params_bragg_nonlocal_sampleds.yaml configs/params_bragg_nonlocal_sampleds.yaml
export MODEL_DIR=model_dir_braggnn
source /software/cerebras/venvs/venv_cerebras_pt/bin/activate
# or your personal venv
--->
```console
source ~/R_2.4.0/venv_cerebras_pt/bin/activate
pip install -r ~/R_2.4.0/modelzoo/requirements.txt
```

Then

```console
cd ~/R_2.4.0/modelzoo/src/cerebras/modelzoo/models/nlp/bert
cp /software/cerebras/dataset/bert_large/bert_large_MSL128_sampleds.yaml configs/bert_large_MSL128_sampleds.yaml
export MODEL_DIR=model_dir_bert_large_pytorch
if [ -d "$MODEL_DIR" ]; then rm -Rf $MODEL_DIR; fi
python run.py CSX --job_labels name=bert_pt --params configs/bert_large_MSL128_sampleds.yaml --num_workers_per_csx=1 --mode train --model_dir $MODEL_DIR --mount_dirs /home/ /software/ --python_paths /home/$(whoami)/R_2.4.0/modelzoo/src --compile_dir $(whoami) |& tee mytest.log
```
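The trailing `|& tee mytest.log` is bash shorthand for `2>&1 | tee mytest.log`: both stdout and stderr stream to the terminal while also being saved to the log. A Cerebras-free sketch using the portable spelling:

```console
# Stand-in command producing output on both streams
( echo "progress line"; echo "warning line" 1>&2 ) 2>&1 | tee run_demo.log
grep -c "line" run_demo.log   # both lines landed in the log -> 2
```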
Note: the vocabulary file referenced in `/software/cerebras/dataset/bert_large/bert_large_MSL128_sampleds.yaml` is the same as the one at `/home/$(whoami)/R_2.4.0/modelzoo/modelzoo/transformers/vocab/google_research_uncased_L-12_H-768_A-12.txt`.
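An identical-copy claim like the note above can be spot-checked with `cmp`. Sketched here on throwaway files, since the actual paths exist only on the cluster:

```console
printf 'token_a\ntoken_b\n' > vocab_copy1.txt
printf 'token_a\ntoken_b\n' > vocab_copy2.txt
# -s suppresses output; exit status 0 means byte-for-byte identical
cmp -s vocab_copy1.txt vocab_copy2.txt && echo "files are identical"
```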

The last parts of the output should resemble the following; messages about CUDA, which should be ignored, are not shown.

This PyTorch GPT-J 6B parameter pretraining sample uses 2 CS2s.
First, source a Cerebras PyTorch virtual environment and make sure that the requirements are installed:

```console
source ~/R_2.4.0/venv_cerebras_pt/bin/activate
pip install -r ~/R_2.4.0/modelzoo/requirements.txt
```

Then

```console
cd ~/R_2.4.0/modelzoo/src/cerebras/modelzoo/models/nlp/gptj
cp /software/cerebras/dataset/gptj/params_gptj_6B_sampleds.yaml configs/params_gptj_6B_sampleds.yaml
export MODEL_DIR=model_dir_gptj
if [ -d "$MODEL_DIR" ]; then rm -Rf $MODEL_DIR; fi
python run.py CSX --job_labels name=gptj_pt --params configs/params_gptj_6B_sampleds.yaml --num_csx=2 --mode train --model_dir $MODEL_DIR --mount_dirs /home/ /software --python_paths /home/$(whoami)/R_2.4.0/modelzoo/src --compile_dir $(whoami) |& tee mytest.log
```

The last parts of the output should resemble the following:
2023-11-29 21:14:30,142 INFO: Processed 24000 sample(s) in 910.883781998 seconds.
```
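Throughput summaries like the `Processed 24000 sample(s)` line above can be recovered from a saved log with `grep`; sketched here on a synthetic log, since `mytest.log` exists only after a run:

```console
# Synthetic stand-in for mytest.log
printf 'INFO: step 10\nINFO: step 20\nINFO: Processed 24000 sample(s) in 910.88 seconds.\n' > demo_run.log
grep 'Processed' demo_run.log
```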

## Llama2-7B
The Cerebras Llama2-7B model implementation can be found at `modelzoo/src/cerebras/modelzoo/models/nlp/llama` and its overview at [https://github.com/Cerebras/modelzoo/blob/main/src/cerebras/modelzoo/models/nlp/llama/README.md#configs-included-for-this-model](https://github.com/Cerebras/modelzoo/blob/main/src/cerebras/modelzoo/models/nlp/llama/README.md#configs-included-for-this-model). This setup uses a subset of Pile data (preprocessed at path /software/datasets/llama_data_32K/) to train with a 32K vocab size.


First, source a Cerebras PyTorch virtual environment and make sure that the requirements are installed:
```bash
source ~/R_2.4.0/venv_cerebras_pt/bin/activate
pip install -r ~/R_2.4.0/modelzoo/requirements.txt
```
Instructions for training:
```bash
cd ~/R_2.4.0/modelzoo/src/cerebras/modelzoo/models/nlp/llama
cp /software/cerebras/dataset/params_llama2_7b.yaml configs/params_llama2_7b.yaml
export MODEL_DIR=model_dir_llama2_7b
if [ -d "$MODEL_DIR" ]; then rm -Rf $MODEL_DIR; fi
python run.py CSX --job_labels name=llama2_7b --params configs/params_llama2_7b.yaml --num_csx=1 --mode train --model_dir $MODEL_DIR --mount_dirs /projects /home/ /software --python_paths /home/$(whoami)/R_2.4.0/modelzoo/src --compile_dir $(whoami) |& tee mytest.log
```

Sample output:
The Cerebras ESM-2 model implementation can be found at `modelzoo/src/cerebras/m
First, source a Cerebras PyTorch virtual environment and make sure that the requirements are installed:
```bash
source ~/R_2.4.0/venv_cerebras_pt/bin/activate
pip install -r ~/R_2.4.0/modelzoo/requirements.txt
```
Instructions for training (for 400 steps):
```bash
cd ~/R_2.4.0/modelzoo/src/cerebras/modelzoo/models/nlp/esm2
export MODEL_DIR=model_dir_esm2
if [ -d "$MODEL_DIR" ]; then rm -Rf $MODEL_DIR; fi
cp /software/cerebras/dataset/ESM-2/params_esm2_t12_35M_UR50D_modified.yaml configs/params_esm2_t12_35M_UR50D_modified.yaml
python run.py CSX --job_labels name=esm2_t12_35m --params configs/params_esm2_t12_35M_UR50D_modified.yaml --num_csx=1 --mode train --model_dir $MODEL_DIR --mount_dirs /home/$(whoami)/ /software --python_paths /home/$(whoami)/R_2.4.0/modelzoo/src --compile_dir /$(whoami) |& tee mytest.log
```
Sample output
36 changes: 12 additions & 24 deletions docs/ai-testbed/cerebras/running-a-model-or-program.md
Follow these instructions to compile and train the `fc_mnist` PyTorch sample. Th

First, make a virtual environment for Cerebras for PyTorch.
See [Customizing Environments](./customizing-environment.md) for the procedures for making PyTorch virtual environments for Cerebras.
If an environment is made in ```~/R_2.4.0/```, it would be activated as follows:
```console
source ~/R_2.4.0/venv_cerebras_pt/bin/activate
```

### Clone the Cerebras modelzoo

```console
mkdir ~/R_2.4.0
cd ~/R_2.4.0
git clone https://github.com/Cerebras/modelzoo.git
cd modelzoo
git tag
git checkout Release_2.4.0
```
## Running a PyTorch sample

### Activate your PyTorch virtual environment, install modelzoo requirements, and change to the working directory

```console
source ~/R_2.4.0/venv_cerebras_pt/bin/activate
pip install -r ~/R_2.4.0/modelzoo/requirements.txt
cd ~/R_2.4.0/modelzoo/src/cerebras/modelzoo/models/nlp/gpt3
```

Next, copy a sample config file. This is for a small GPT-3 model, modified to use a preprocessed dataset and to run for fewer steps.

```console
cp /software/cerebras/dataset/OWT/Pytorch/111m_modified.yaml configs/Cerebras_GPT/111m_modified.yaml
```


### Running a sample PyTorch training job

To run the sample:

```console
export MODEL_DIR=model_dir_gpt3_111m
# deletion of the model_dir is only needed if sample has been previously run
if [ -d "$MODEL_DIR" ]; then rm -Rf $MODEL_DIR; fi
python run.py CSX --job_labels name=gpt3_111m --params configs/Cerebras_GPT/111m_modified.yaml --num_csx=1 --mode train --model_dir $MODEL_DIR --mount_dirs /home/ /software --python_paths /home/$(whoami)/R_2.4.0/modelzoo/src --compile_dir $(whoami) |& tee mytest.log
```
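The `if [ -d "$MODEL_DIR" ] ...` guard in these recipes is an idempotent cleanup, so a rerun always starts from an empty model directory. The same pattern in isolation, with a hypothetical directory name:

```console
export MODEL_DIR=model_dir_demo
mkdir -p "$MODEL_DIR"                                # simulate a previous run
if [ -d "$MODEL_DIR" ]; then rm -Rf $MODEL_DIR; fi   # same guard as above
[ -d "$MODEL_DIR" ] || echo "clean slate"
```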

A successful training run should finish with output resembling the following:
