From 33f2c4f8eedb6ee39eeb8940c993f313de4f55b7 Mon Sep 17 00:00:00 2001
From: Jiawei
Date: Thu, 14 Mar 2024 22:25:03 -0700
Subject: [PATCH] update readme and pip package

---
 README.md            | 56 ++++++++++++++++++++++++++++++++++++++++++++
 exp_requirements.txt | 14 +++++++++++
 requirements.txt     | 15 ++----------
 3 files changed, 72 insertions(+), 13 deletions(-)
 create mode 100755 exp_requirements.txt

diff --git a/README.md b/README.md
index 436cb9a..0c0d08b 100644
--- a/README.md
+++ b/README.md
@@ -9,17 +9,50 @@ As a gradient projection method, GaLore is independent of the choice of optimize
 
 Image 2
 
+## News
+Thanks everyone for your interest in GaLore!
+**We are working on the official release of GaLore.** In the meantime, please feel free to try the pre-release version and send us your feedback. Currently, the pre-release version (e.g., the GaLore optimizers) should provide a decent memory reduction and an accurate simulation of the GaLore algorithm.
+
+The official release of GaLore will include:
+
+1. Per-layer weight updates for multi-GPU training (DDP and FSDP) (working with [PyTorch](https://pytorch.org/)).
+2. Memory-efficient low-rank gradient accumulation (working with [PyTorch](https://pytorch.org/)).
+3. An optimized `GaLoreAdamW8bit` (working with [bitsandbytes](https://github.com/TimDettmers/bitsandbytes)).
+
+We are grateful to the community members who have been actively integrating GaLore into different platforms, including [HuggingFace](https://github.com/huggingface/transformers/pull/29588), [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory), and [Axolotl](https://github.com/OpenAccess-AI-Collective/axolotl/pull/1370). Join our Slack workspace [GaLore-Social](https://join.slack.com/t/galore-social/shared_invite/zt-2ev152px0-DguuQ5WRTLQjtq2C88HBvQ) to engage in discussions with us.
+
+## Discussion [(GaLore-Social)](https://join.slack.com/t/galore-social/shared_invite/zt-2ev152px0-DguuQ5WRTLQjtq2C88HBvQ)
+
+We welcome discussion, questions, and feedback on GaLore. Please join our Slack workspace [GaLore-Social](https://join.slack.com/t/galore-social/shared_invite/zt-2ev152px0-DguuQ5WRTLQjtq2C88HBvQ) to connect with us and the community.
+
 ## Installation
 
+### Install the GaLore optimizer
+Install from pip:
 ```bash
+pip install galore-torch
+```
+
+or install from source:
+
+```bash
+git clone git@github.com:jiaweizzhao/GaLore.git
+cd GaLore
 pip install -e .
 ```
+
+### Install experiment dependencies
+
+```bash
+pip install -r exp_requirements.txt
+```
 
 ## Usage
 
+### Save optimizer memory using GaLore optimizers
+
 ```python
 from galore_torch import GaLoreAdamW, GaLoreAdamW8bit, GaLoreAdafactor
 # define param groups as galore_params and non_galore_params
@@ -27,6 +60,29 @@ param_groups = [{'params': non_galore_params},
                 {'params': galore_params, 'rank': 128, 'update_proj_gap': 200, 'scale': 0.25, 'proj_type': 'std'}]
 optimizer = GaLoreAdamW(param_groups, lr=0.01)
 ```
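+How the parameters are split into `galore_params` and `non_galore_params` is left to the user; GaLore only needs the 2D weight matrices you want to project to be placed in the `galore_params` group. A minimal sketch is shown below (the `"attn"`/`"mlp"` module-name filter is only an illustrative assumption, so adapt it to your model):
+
+```python
+import torch.nn as nn
+
+# collect the 2D weights of attention/MLP linear layers for low-rank projection
+# (the module-name filter below is illustrative, not the only possible choice)
+galore_params = []
+for module_name, module in model.named_modules():
+    if isinstance(module, nn.Linear) and ("attn" in module_name or "mlp" in module_name):
+        galore_params.append(module.weight)
+
+# keep every other trainable parameter in the regular (non-GaLore) group
+galore_param_ids = {id(p) for p in galore_params}
+non_galore_params = [p for p in model.parameters() if id(p) not in galore_param_ids]
+```
+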
+### Save weight gradient memory using per-layer weight updates
+
+We use `register_post_accumulate_grad_hook`, provided by [PyTorch](https://pytorch.org/tutorials/intermediate/optimizer_step_in_backward_tutorial.html), to enable per-layer weight updates. An example is shown below:
+
+```python
+# define an optimizer for each parameter p, and store them in optimizer_dict
+optimizer_dict = {}
+for p in model.parameters():
+    if p.requires_grad:
+        optimizer_dict[p] = GaLoreAdamW([{'params': p, 'rank': 128, 'update_proj_gap': 200, 'scale': 0.25, 'proj_type': 'std'}], lr=0.01)
+
+# define a hook function to update the parameter p during the backward pass
+def optimizer_hook(p):
+    if p.grad is None:
+        return
+    optimizer_dict[p].step()
+    optimizer_dict[p].zero_grad()
+
+# register the hook onto every trainable parameter
+for p in model.parameters():
+    if p.requires_grad:
+        p.register_post_accumulate_grad_hook(optimizer_hook)
+```
+More details can be found in [torchrun_main.py](https://github.com/jiaweizzhao/GaLore/blob/a6bc1650984b1c090a4e108d7c0e3109ee7ad844/torchrun_main.py#L334).
 
 ## Benchmark 1: Pre-Training LLaMA on C4 dataset
 `torchrun_main.py` is the main script for training LLaMA models on C4 with GaLore. Our benchmark scripts for various sizes of models are in `scripts/benchmark_c4` folder.
diff --git a/exp_requirements.txt b/exp_requirements.txt
new file mode 100755
index 0000000..3e2d695
--- /dev/null
+++ b/exp_requirements.txt
@@ -0,0 +1,14 @@
+torch
+transformers==4.31.0
+tokenizers
+datasets
+peft
+wandb
+loguru
+nvitop
+lion-pytorch
+matplotlib
+bitsandbytes
+scipy
+scikit-learn
+evaluate
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 3e2d695..c60a4fc 100755
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,14 +1,3 @@
 torch
-transformers==4.31.0
-tokenizers
-datasets
-peft
-wandb
-loguru
-nvitop
-lion-pytorch
-matplotlib
-bitsandbytes
-scipy
-scikit-learn
-evaluate
\ No newline at end of file
+transformers
+bitsandbytes
\ No newline at end of file