rotary positions under all circumstances complete
lucidrains committed Feb 20, 2024
1 parent eb9a1de commit bafbe46
Showing 4 changed files with 22 additions and 15 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -65,8 +65,8 @@ $ python assert.py
 - [x] move flash attention back to key / value column traversal on outer loop and save on ring communication
 - [x] backwards
 - [x] forwards
+- [x] fix rotary positions for striped ring attention when flash buckets > 1
 
-- [ ] fix rotary positions for striped ring attention when flash buckets > 1
 - [ ] option to auto-decide ring sequence size based on world size
 - [ ] allow for finely specifying how to distribute sharding of batch and sequence, depending on world size
 - [ ] allow for variable ring passes per layer, for <a href="https://arxiv.org/abs/2007.03356">local -> global attention</a> in ring transformer as one goes up the layers.
4 changes: 2 additions & 2 deletions assert.py
@@ -40,7 +40,7 @@ def start(
         ring_attn = True,
         striped_ring_attn = striped_ring_attn,
         ring_seq_size = ceil(seq_len / world_size),
-        bucket_size = ceil(seq_len / world_size),
+        bucket_size = ceil(seq_len / world_size / 2),
     )
 
     flash_attention_net = RingTransformer(
@@ -114,7 +114,7 @@ def start(
     batch_size_var_len = False
     use_cuda = False
     causal = True
-    striped_ring_attn = True
+    striped_ring_attn = False
 
     assert not use_cuda or torch.cuda.device_count() <= world_size
 
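Halving bucket_size relative to ring_seq_size is what puts the test into the flash buckets > 1 regime this commit targets. A minimal sketch of the arithmetic, with seq_len and world_size chosen only for illustration (they are not taken from the test file):

from math import ceil

# hypothetical test dimensions, chosen only to illustrate the bucket count
seq_len = 256
world_size = 8

ring_seq_size = ceil(seq_len / world_size)      # 32 tokens held by each rank
bucket_size = ceil(seq_len / world_size / 2)    # 16 tokens per flash attention bucket

buckets = ring_seq_size // bucket_size          # 2 flash buckets per rank, i.e. buckets > 1
print(ring_seq_size, bucket_size, buckets)      # 32 16 2

With the old bucket_size equal to ring_seq_size, each rank held exactly one bucket, so the buckets > 1 rotary path was never exercised by the assert script.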
29 changes: 18 additions & 11 deletions ring_attention_pytorch/ring_attention.py
@@ -5,7 +5,7 @@
 import torch.nn.functional as F
 from torch.nn import Module, ModuleList
 
-from einops import rearrange
+from einops import rearrange, repeat
 
 from ring_attention_pytorch.ring import (
     all_ring_pass,
@@ -375,20 +375,27 @@ def forward(
         # rotary positions
         # taking into account ring and striping
 
-        maybe_chunk_seq_len = x.shape[-1]
-
-        pos = torch.arange(maybe_chunk_seq_len, device = device)
+        pos = None
+        curr_seq_len = x.shape[-1]
 
         if auto_shard_seq:
             if self.striped_ring_attn:
-                ring_stride = get_world_size()
-                ring_offset = 1
-            else:
-                ring_stride = 1
-                ring_offset = maybe_chunk_seq_len
+                buckets = self.ring_seq_size // self.bucket_size
+                ring_stride = get_world_size() * buckets
+                ring_offset = buckets
+
+                pos = torch.arange(curr_seq_len // buckets, device = device)
+                pos = repeat(pos, 'n -> n b', b = buckets)
 
-            pos *= ring_stride
-            pos += ring_offset * get_rank()
+                pos = pos * ring_stride
+                pos += torch.arange(buckets, device = device) + (get_rank() * buckets)
+                pos = rearrange(pos, 'n b -> (b n)')
+
+            else:
+                pos = torch.arange(curr_seq_len, device = device)
+                pos += curr_seq_len * get_rank()
+        else:
+            pos = torch.arange(curr_seq_len, device = device)
 
         rotary_emb = self.rotary_emb(pos)
 
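To make the new striped branch easier to follow, here is a standalone sketch of the position indices it produces for a single rank. The concrete numbers (world_size, rank, ring_seq_size, bucket_size) are made up for illustration, and plain integers stand in for get_world_size() / get_rank():

import torch
from einops import rearrange, repeat

# hypothetical setup: 2 ranks, 2 flash buckets of 4 tokens each per rank
world_size = 2
rank = 1
ring_seq_size = 8
bucket_size = 4
curr_seq_len = ring_seq_size

buckets = ring_seq_size // bucket_size             # 2
ring_stride = world_size * buckets                 # 4: consecutive stripes are this far apart globally

pos = torch.arange(curr_seq_len // buckets)        # positions within one bucket: 0..3
pos = repeat(pos, 'n -> n b', b = buckets)         # one column per flash bucket

pos = pos * ring_stride                            # stretch by the global stripe stride
pos += torch.arange(buckets) + (rank * buckets)    # offset each bucket by its global stripe index
pos = rearrange(pos, 'n b -> (b n)')               # lay the buckets out contiguously

print(pos)   # tensor([ 2,  6, 10, 14,  3,  7, 11, 15])

Rank 0 would get tensor([ 0,  4,  8, 12,  1,  5,  9, 13]); together the two ranks cover positions 0..15 exactly once, which is presumably the property the rotary fix is after when striping with more than one flash bucket per rank.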
2 changes: 1 addition & 1 deletion setup.py
@@ -3,7 +3,7 @@
 setup(
   name = 'ring-attention-pytorch',
   packages = find_packages(exclude=[]),
-  version = '0.1.11',
+  version = '0.1.12',
   license='MIT',
   description = 'Ring Attention - Pytorch',
   author = 'Phil Wang',
