[Kernel][Model] Varlen prefill + Prefill chunking support for mamba kernels and Jamba model (#8533)

2024-09-30 00:35:58 +03:00 · 2024-09-30 00:35:58 +03:00 · f13a07b1f8
commit f13a07b1f8
parent 6c9ba48fde
13 changed files with 1176 additions and 894 deletions
--- a/csrc/mamba/causal_conv1d/causal_conv1d.cu
+++ b/csrc/mamba/causal_conv1d/causal_conv1d.cu
@ -39,8 +39,6 @@
 template<typename input_t, typename weight_t>
 void causal_conv1d_fwd_cuda(ConvParamsBase &params, cudaStream_t stream);
 template <typename input_t, typename weight_t>
 void causal_conv1d_channellast_fwd_cuda(ConvParamsBase &params, cudaStream_t stream);
 template<typename input_t, typename weight_t>
 void causal_conv1d_update_cuda(ConvParamsBase &params, cudaStream_t stream);
@ -55,8 +53,11 @@ void set_conv_params_fwd(ConvParamsBase &params,
                         const at::Tensor x,
                         const at::Tensor weight,
                         const at::Tensor out,
-                         void* bias_ptr,
+                         const c10::optional<at::Tensor>& bias,
-                         bool silu_activation) {
+                         bool silu_activation,
                         const c10::optional<at::Tensor>& query_start_loc = std::nullopt,
                         const c10::optional<at::Tensor>& cache_indices = std::nullopt,
                         const c10::optional<at::Tensor>& has_initial_state = std::nullopt) {
    // Reset the parameters
    memset(&params, 0, sizeof(params));
@ -71,26 +72,31 @@ void set_conv_params_fwd(ConvParamsBase &params,
    // Set the pointers and strides.
    params.x_ptr = x.data_ptr();
    params.weight_ptr = weight.data_ptr();
-    params.bias_ptr = bias_ptr;
+    params.bias_ptr = bias.has_value() ? bias.value().data_ptr() : nullptr;
    params.out_ptr = out.data_ptr();
    // All stride are in elements, not bytes.
-    params.x_batch_stride = x.stride(0);
+    params.query_start_loc_ptr = query_start_loc.has_value() ? query_start_loc.value().data_ptr() : nullptr;
-    params.x_c_stride = x.stride(1);
+    params.cache_indices_ptr = cache_indices.has_value() ? cache_indices.value().data_ptr() : nullptr;
-    params.x_l_stride = x.stride(-1);
+    params.has_initial_state_ptr = has_initial_state.has_value() ? has_initial_state.value().data_ptr() : nullptr;
    const bool varlen = params.query_start_loc_ptr != nullptr;
    params.x_batch_stride = x.stride(varlen ? 1 : 0);
    params.x_c_stride = x.stride(varlen ? 0 : 1);
    params.x_l_stride = x.stride(varlen ? 1 : -1);
    params.weight_c_stride = weight.stride(0);
    params.weight_width_stride = weight.stride(1);
-    params.out_batch_stride = out.stride(0);
+    params.out_batch_stride = out.stride(varlen ? 1 : 0);
-    params.out_c_stride = out.stride(1);
+    params.out_c_stride = out.stride(varlen ? 0 : 1);
-    params.out_l_stride = out.stride(-1);
+    params.out_l_stride = out.stride(varlen ? 1 : -1);
 }
 at::Tensor
 causal_conv1d_fwd(const at::Tensor &x, const at::Tensor &weight,
                  const c10::optional<at::Tensor> &bias_,
-                  const c10::optional<at::Tensor> &seq_idx_,
+                  const c10::optional<at::Tensor> &conv_states,
-                  const c10::optional<at::Tensor> &initial_states_,
+                  const c10::optional<at::Tensor> &query_start_loc,
-                  const c10::optional<at::Tensor> &final_states_out_,
+                  const c10::optional<at::Tensor> &cache_indices,
                  const c10::optional<at::Tensor> &has_initial_state,
                  bool silu_activation) {
    auto input_type = x.scalar_type();
    auto weight_type = weight.scalar_type();
@ -100,23 +106,21 @@ causal_conv1d_fwd(const at::Tensor &x, const at::Tensor &weight,
    TORCH_CHECK(x.is_cuda());
    TORCH_CHECK(weight.is_cuda());
    const bool varlen = query_start_loc.has_value() ? true : false;
    const auto sizes = x.sizes();
-    const int batch_size = sizes[0];
+    const int batch_size = varlen ? query_start_loc.value().sizes()[0] - 1 : sizes[0];
-    const int dim = sizes[1];
+    const int dim = varlen ? sizes[0] : sizes[1];
-    const int seqlen = sizes[2];
+    const int seqlen = varlen ? sizes[1] : sizes[2];
    const int width = weight.size(-1);
-
+    if (varlen){
        CHECK_SHAPE(x, dim, seqlen);
    }
    else {
        CHECK_SHAPE(x, batch_size, dim, seqlen);
    }
    CHECK_SHAPE(weight, dim, width);
    TORCH_CHECK(x.stride(2) == 1 || x.stride(1) == 1);
    const bool is_channel_last = x.stride(1) == 1 && x.stride(2) > 1;
    if (is_channel_last) {
        TORCH_CHECK(dim % 8 == 0, "causal_conv1d only supports channel dimension divisible by 8 for now");
        TORCH_CHECK(x.stride(2) % 8 == 0 and x.stride(0) % 8 == 0, "causal_conv1d with channel last layout requires strides (x.stride(0) and x.stride(2)) to be multiples of 8");
    }
    TORCH_CHECK(width >= 2 && width <= 4, "causal_conv1d only supports width between 2 and 4");
    if (bias_.has_value()) {
        auto bias = bias_.value();
@ -126,56 +130,50 @@ causal_conv1d_fwd(const at::Tensor &x, const at::Tensor &weight,
        CHECK_SHAPE(bias, dim);
    }
-    if (seq_idx_.has_value()) {
+
-        TORCH_CHECK(is_channel_last, "seq_idx is only supported for channel last layout");
+    if (has_initial_state.has_value()) {
-        auto seq_idx = seq_idx_.value();
+        auto has_initial_state_ = has_initial_state.value();
-        TORCH_CHECK(seq_idx.scalar_type() == torch::kInt32);
+        TORCH_CHECK(has_initial_state_.scalar_type() == at::ScalarType::Bool);
-        TORCH_CHECK(seq_idx.is_cuda());
+        TORCH_CHECK(has_initial_state_.is_cuda());
-        TORCH_CHECK(seq_idx.is_contiguous());
+        CHECK_SHAPE(has_initial_state_, batch_size);
-        CHECK_SHAPE(seq_idx, batch_size, seqlen);
+    }
    if (query_start_loc.has_value()) {
        auto query_start_loc_ = query_start_loc.value();
        TORCH_CHECK(query_start_loc_.scalar_type() == at::ScalarType::Int);
        TORCH_CHECK(query_start_loc_.is_cuda());
    }
    if (cache_indices.has_value()) {
        auto cache_indices_ = cache_indices.value();
        TORCH_CHECK(cache_indices_.scalar_type() == at::ScalarType::Int);
        TORCH_CHECK(cache_indices_.is_cuda());
        CHECK_SHAPE(cache_indices_, batch_size);
    }
    at::Tensor out = torch::empty_like(x);
    ConvParamsBase params;
    set_conv_params_fwd(params, batch_size, dim, seqlen, width, x, weight, out,
-                        bias_.has_value() ? bias_.value().data_ptr() : nullptr,
+                        bias_,
-                        silu_activation);
+                        silu_activation, 
                        query_start_loc,
                        cache_indices,
                        has_initial_state
                        );
-    if (seq_idx_.has_value()) {
+    if (conv_states.has_value()) {
-        params.seq_idx_ptr = seq_idx_.value().data_ptr();
+        auto conv_states_ = conv_states.value();
        TORCH_CHECK(conv_states_.scalar_type() == input_type);
        TORCH_CHECK(conv_states_.is_cuda());
        params.conv_states_ptr = conv_states_.data_ptr();
        params.conv_states_batch_stride = conv_states_.stride(0);
        params.conv_states_c_stride = conv_states_.stride(1);
        params.conv_states_l_stride = conv_states_.stride(2);
    } else {
-        params.seq_idx_ptr = nullptr;
+        params.conv_states_ptr = nullptr;
    }
    if (initial_states_.has_value()) {
        TORCH_CHECK(is_channel_last, "initial_states is only supported for channel last layout");
        auto initial_states = initial_states_.value();
        TORCH_CHECK(initial_states.scalar_type() == input_type);
        TORCH_CHECK(initial_states.is_cuda());
        CHECK_SHAPE(initial_states, batch_size, dim, width - 1);
        TORCH_CHECK(initial_states.stride(1) == 1);
        params.initial_states_ptr = initial_states.data_ptr();
        params.initial_states_batch_stride = initial_states.stride(0);
        params.initial_states_c_stride = initial_states.stride(1);
        params.initial_states_l_stride = initial_states.stride(2);
    } else {
        params.initial_states_ptr = nullptr;
    }
    if (final_states_out_.has_value()) {
        TORCH_CHECK(is_channel_last, "final_states is only supported for channel last layout");
        auto final_states = final_states_out_.value();
        TORCH_CHECK(final_states.scalar_type() == input_type);
        TORCH_CHECK(final_states.is_cuda());
        CHECK_SHAPE(final_states, batch_size, dim, width - 1);
        TORCH_CHECK(final_states.stride(1) == 1);
        params.final_states_ptr = final_states.data_ptr();
        params.final_states_batch_stride = final_states.stride(0);
        params.final_states_c_stride = final_states.stride(1);
        params.final_states_l_stride = final_states.stride(2);
    } else {
        params.final_states_ptr = nullptr;
    }
    // Otherwise the kernel will be launched from cuda:0 device
@ -183,11 +181,7 @@ causal_conv1d_fwd(const at::Tensor &x, const at::Tensor &weight,
    at::cuda::CUDAGuard device_guard{(char)x.get_device()};
    auto stream = at::cuda::getCurrentCUDAStream().stream();
    DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(x.scalar_type(), "causal_conv1d_fwd", [&] {
            if (!is_channel_last) {
            causal_conv1d_fwd_cuda<input_t, weight_t>(params, stream);
            } else {
                causal_conv1d_channellast_fwd_cuda<input_t, weight_t>(params, stream);
            }
    });
    return out;
 }
@ -199,6 +193,7 @@ causal_conv1d_update(const at::Tensor &x,
                     const at::Tensor &weight,
                     const c10::optional<at::Tensor> &bias_,
                     bool silu_activation,
                     const c10::optional<at::Tensor> &cache_seqlens_,
                     const c10::optional<at::Tensor> &conv_state_indices_) {
    auto input_type = x.scalar_type();
    auto weight_type = weight.scalar_type();
@ -214,9 +209,12 @@ causal_conv1d_update(const at::Tensor &x,
    const auto sizes = x.sizes();
    const int batch_size = sizes[0];
    const int dim = sizes[1];
    const int seqlen = sizes[2];
    const int width = weight.size(-1);
    const int conv_state_len = conv_state.size(2);
    TORCH_CHECK(conv_state_len >= width - 1);
-    CHECK_SHAPE(x, batch_size, dim);
+    CHECK_SHAPE(x, batch_size, dim, seqlen);
    CHECK_SHAPE(weight, dim, width);
    TORCH_CHECK(width >= 2 && width <= 4, "causal_conv1d only supports width between 2 and 4");
@ -232,15 +230,27 @@ causal_conv1d_update(const at::Tensor &x,
    at::Tensor out = torch::empty_like(x);
    ConvParamsBase params;
-    set_conv_params_fwd(params, batch_size, dim, /*seqlen=*/1, width, x, weight, out,
+    set_conv_params_fwd(params, batch_size, dim, seqlen, width, x, weight, out,
-                        bias_.has_value() ? bias_.value().data_ptr() : nullptr,
+                        bias_,
                        silu_activation);
    params.conv_state_ptr = conv_state.data_ptr();
    params.conv_state_len = conv_state_len;
    // All stride are in elements, not bytes.
    params.conv_state_batch_stride = conv_state.stride(0);
    params.conv_state_c_stride = conv_state.stride(1);
    params.conv_state_l_stride = conv_state.stride(2);
    if (cache_seqlens_.has_value()) {
        auto cache_seqlens = cache_seqlens_.value();
        TORCH_CHECK(cache_seqlens.scalar_type() == torch::kInt32);
        TORCH_CHECK(cache_seqlens.is_cuda());
        TORCH_CHECK(cache_seqlens.stride(-1) == 1);
        CHECK_SHAPE(cache_seqlens, batch_size);
        params.cache_seqlens = cache_seqlens.data_ptr<int32_t>();
    } else {
        params.cache_seqlens = nullptr;
    }
    if (conv_state_indices_.has_value()) {
        auto conv_state_indices = conv_state_indices_.value();
        TORCH_CHECK(conv_state_indices.scalar_type() == torch::kInt32)
@ -249,11 +259,11 @@ causal_conv1d_update(const at::Tensor &x,
        CHECK_SHAPE(conv_state_indices, batch_size);
        int conv_state_entries = conv_state.size(0);
-        CHECK_SHAPE(conv_state, conv_state_entries, dim, width);
+        CHECK_SHAPE(conv_state, conv_state_entries, dim, conv_state_len);
        params.conv_state_indices_ptr = conv_state_indices.data_ptr<int32_t>();
    } else {
-        CHECK_SHAPE(conv_state, batch_size, dim, width);
+        CHECK_SHAPE(conv_state, batch_size, dim, conv_state_len);
        params.conv_state_indices_ptr = nullptr;
    }
@ -296,7 +306,7 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) {
    constexpr int kWidth = Ktraits::kWidth;
    constexpr int kNThreads = Ktraits::kNThreads;
    constexpr int kNElts = Ktraits::kNElts;
-    static constexpr bool kIsVecLoad = Ktraits::kIsVecLoad;
+    constexpr bool kIsVecLoad = Ktraits::kIsVecLoad;
    using input_t = typename Ktraits::input_t;
    using vec_t = typename Ktraits::vec_t;
    using weight_t = typename Ktraits::weight_t;
@ -309,20 +319,39 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) {
    auto& smem_store_vec = reinterpret_cast<typename Ktraits::BlockStoreVecT::TempStorage&>(smem_);
    vec_t *smem_exchange = reinterpret_cast<vec_t *>(smem_ + Ktraits::kSmemIOSize);
    const bool kVarlen = params.query_start_loc_ptr != nullptr;
    const int tidx = threadIdx.x;
    const int batch_id = blockIdx.x;
    const int channel_id = blockIdx.y;
-    input_t *x = reinterpret_cast<input_t *>(params.x_ptr) + batch_id * params.x_batch_stride
+    const int *query_start_loc = kVarlen ? reinterpret_cast<int *>(params.query_start_loc_ptr) : nullptr;
    const int sequence_start_index = kVarlen ? query_start_loc[batch_id] : batch_id;
    const int seqlen = kVarlen ? query_start_loc[batch_id + 1] - sequence_start_index : params.seqlen;
    input_t *x = reinterpret_cast<input_t *>(params.x_ptr) + sequence_start_index * params.x_batch_stride
        + channel_id * params.x_c_stride;
    weight_t *weight = reinterpret_cast<weight_t *>(params.weight_ptr) + channel_id * params.weight_c_stride;
-    input_t *out = reinterpret_cast<input_t *>(params.out_ptr) + batch_id * params.out_batch_stride
+    input_t *out = reinterpret_cast<input_t *>(params.out_ptr) + sequence_start_index * params.out_batch_stride
        + channel_id * params.out_c_stride;
    float bias_val = params.bias_ptr == nullptr ? 0.f : float(reinterpret_cast<weight_t *>(params.bias_ptr)[channel_id]);
    bool has_initial_state = params.has_initial_state_ptr == nullptr ? false
        : reinterpret_cast<bool *>(params.has_initial_state_ptr)[batch_id];
    int* cache_indices = params.cache_indices_ptr == nullptr ? nullptr
        : reinterpret_cast<int *>(params.cache_indices_ptr);
    int cache_index = cache_indices == nullptr ? batch_id : cache_indices[batch_id];
    input_t *conv_states = params.conv_states_ptr == nullptr ? nullptr
        : reinterpret_cast<input_t *>(params.conv_states_ptr) + cache_index * params.conv_states_batch_stride + channel_id * params.conv_states_c_stride;
    // Thread 0 will load the last elements of the previous chunk, so we initialize those to 0.
    if (tidx == 0) {
-        input_t zeros[kNElts] = {0};
+        input_t initial_state[kNElts] = {0};
-        smem_exchange[kNThreads - 1] = reinterpret_cast<vec_t *>(zeros)[0];
+        if (has_initial_state) {
            #pragma unroll
            for (int w = 0; w < kWidth - 1; ++w){ initial_state[kNElts - 1 - (kWidth - 2) + w ] = conv_states[w]; }
        }
        smem_exchange[kNThreads - 1] = reinterpret_cast<vec_t *>(initial_state)[0];
    }
    float weight_vals[kWidth];
@ -330,14 +359,14 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) {
    for (int i = 0; i < kWidth; ++i) { weight_vals[i] = float(weight[i * params.weight_width_stride]); }
    constexpr int kChunkSize = kNThreads * kNElts;
-    const int n_chunks = (params.seqlen + kChunkSize - 1) / kChunkSize;
+    const int n_chunks = (seqlen + kChunkSize - 1) / kChunkSize;
    for (int chunk = 0; chunk < n_chunks; ++chunk) {
        input_t x_vals_load[2 * kNElts] = {0};
        if constexpr(kIsVecLoad) {
-            typename Ktraits::BlockLoadVecT(smem_load_vec).Load(reinterpret_cast<vec_t*>(x), *reinterpret_cast<vec_t (*)[1]>(&x_vals_load[kNElts]), (params.seqlen - chunk * kChunkSize) / kNElts);
+            typename Ktraits::BlockLoadVecT(smem_load_vec).Load(reinterpret_cast<vec_t*>(x), *reinterpret_cast<vec_t (*)[1]>(&x_vals_load[kNElts]), (seqlen - chunk * kChunkSize) / kNElts);
        } else {
            __syncthreads();
-            typename Ktraits::BlockLoadT(smem_load).Load(x, *reinterpret_cast<input_t (*)[kNElts]>(&x_vals_load[kNElts]), params.seqlen - chunk * kChunkSize);
+            typename Ktraits::BlockLoadT(smem_load).Load(x, *reinterpret_cast<input_t (*)[kNElts]>(&x_vals_load[kNElts]), seqlen - chunk * kChunkSize);
        }
        x += kChunkSize;
        __syncthreads();
@ -375,19 +404,57 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) {
        #pragma unroll
        for (int i = 0; i < kNElts; ++i) { out_vals_store[i] = out_vals[i]; }
        if constexpr(kIsVecLoad) {
-            typename Ktraits::BlockStoreVecT(smem_store_vec).Store(reinterpret_cast<vec_t*>(out), reinterpret_cast<vec_t (&)[1]>(out_vals_store), (params.seqlen - chunk * kChunkSize) / kNElts);
+            typename Ktraits::BlockStoreVecT(smem_store_vec).Store(reinterpret_cast<vec_t*>(out), reinterpret_cast<vec_t (&)[1]>(out_vals_store), (seqlen - chunk * kChunkSize) / kNElts);
        } else {
-            typename Ktraits::BlockStoreT(smem_store).Store(out, out_vals_store, params.seqlen - chunk * kChunkSize);
+            typename Ktraits::BlockStoreT(smem_store).Store(out, out_vals_store, seqlen - chunk * kChunkSize);
        }
        out += kChunkSize;
    }
    // Final state is stored in the smem_exchange last token slot,
    // in case seqlen < kWidth, we would need to take the final state from the 
    // initial state which is stored in conv_states
    // in case seqlen > kWidth, we would need to load the last kWidth - 1 data
    // and load it into conv_state accordingly
    int last_thread =  ((seqlen - (kWidth - 1)) - (n_chunks - 1) * kChunkSize) / kNElts;
    if (conv_states != nullptr && tidx == last_thread) { 
        input_t x_vals_load[kNElts * 2] = {0};
        // in case we are on the first kWidth tokens
        if (last_thread == 0 && seqlen < kWidth){
            // Need to take the initial state
            reinterpret_cast<vec_t *>(x_vals_load)[0] = smem_exchange[0];
            const int offset = seqlen - (kWidth - 1);
            #pragma unroll
            for (int w = 0; w < kWidth - 1; ++w){
                // pad the existing state
                if ((w - seqlen) >= 0 && has_initial_state) { conv_states[w - seqlen] = conv_states[w]; }
                else if ((w - seqlen) >= 0 && !has_initial_state) { conv_states[w - seqlen] = input_t(0.0f); }
            }
            #pragma unroll
            for (int w = 0; w < kWidth - 1; ++w){
                if (offset + w >= 0) 
                    conv_states[w] = x_vals_load[offset + w ];
            }
        }
        else {
            // in case the final state is in between the threads data
            reinterpret_cast<vec_t *>(x_vals_load)[1] = smem_exchange[last_thread + 1];
            reinterpret_cast<vec_t *>(x_vals_load)[0] = smem_exchange[last_thread];
            const int offset = ((seqlen - (kWidth - 1)) % (kNElts));
            #pragma unroll
            for (int w = 0; w < kWidth - 1; ++w){
                conv_states[w] = x_vals_load[offset + w ];
            }
        }
    }
 }
 template<int kNThreads, int kWidth, typename input_t, typename weight_t>
 void causal_conv1d_fwd_launch(ConvParamsBase &params, cudaStream_t stream) {
    static constexpr int kNElts = sizeof(input_t) == 4 ? 4 : 8;
-    BOOL_SWITCH(params.seqlen % kNElts == 0, kIsVecLoad, [&] {
+    const bool kVarlen = params.query_start_loc_ptr != nullptr;
    BOOL_SWITCH(params.seqlen % kNElts == 0 && !kVarlen, kIsVecLoad, [&] {
        using Ktraits = Causal_conv1d_fwd_kernel_traits<kNThreads, kWidth, kIsVecLoad, input_t, weight_t>;
        constexpr int kSmemSize = Ktraits::kSmemSize;
        dim3 grid(params.batch, params.dim);
@ -422,220 +489,11 @@ void causal_conv1d_fwd_cuda(ConvParamsBase &params, cudaStream_t stream) {
    }
 }
 template<int kNThreads_, int kWidth_, int kChunkSizeL_, bool kIsVecLoad_, typename input_t_, typename weight_t_>
 struct Causal_conv1d_channellast_fwd_kernel_traits {
    // The cache line is 128 bytes, and we try to read 16 bytes per thread.
    // So we have 8 threads per "row", so 32 or 64 elements in the channel dimension.
    // That leaves 4 columns per warp, and so 16 columns per block (assuming each block has 128
    // threads). Each each load is 16 x 32|64 elements in the L x C dimensions.
    using input_t = input_t_;
    using weight_t = weight_t_;
    static constexpr int kNThreads = kNThreads_;
    static_assert(kNThreads % 32 == 0);
    static constexpr int kNWarps = kNThreads / 32;
    static constexpr int kWidth = kWidth_;
    static constexpr int kChunkSizeL = kChunkSizeL_;
    static constexpr int kNBytes = sizeof(input_t);
    static_assert(kNBytes == 2 || kNBytes == 4);
    static constexpr int kNElts = kNBytes == 4 ? 4 : 8;
    static constexpr int kNEltsPerRow = 128 / kNBytes;
    static constexpr int kNThreadsPerRow = kNEltsPerRow / kNElts;  // Always 8 for now
    static_assert(kNThreadsPerRow * kNBytes * kNElts == 128);
    static constexpr int kNColsPerWarp = 32 / kNThreadsPerRow;  // Always 4 for now
    static_assert(kNColsPerWarp * kNThreadsPerRow == 32);
    static constexpr int kNColsPerLoad = kNColsPerWarp * kNWarps;
    static constexpr int kNLoads = kChunkSizeL / kNColsPerLoad;
    static_assert(kNLoads * kNColsPerLoad == kChunkSizeL);
    static constexpr bool kIsVecLoad = kIsVecLoad_;
    using vec_t = typename BytesToType<kNBytes * kNElts>::Type;
    // using BlockLoadT = cub::BlockLoad<input_t, kNThreads, kNItems, cub::BLOCK_LOAD_WARP_TRANSPOSE>;
    // using BlockStoreT = cub::BlockStore<input_t, kNThreads, kNItems, cub::BLOCK_STORE_WARP_TRANSPOSE>;
    // static constexpr int kSmemSize = std::max({sizeof(typename BlockLoadT::TempStorage),
    //                                            sizeof(typename BlockStoreT::TempStorage)});
    // static constexpr int kSmemSize = kChunkSizeL * kNEltsPerRow * kNBytes;
 };
 template<typename Ktraits, bool kHasSeqIdx>
 __global__ __launch_bounds__(Ktraits::kNThreads)
 void causal_conv1d_channellast_fwd_kernel(ConvParamsBase params) {
    constexpr int kWidth = Ktraits::kWidth;
    constexpr int kNThreads = Ktraits::kNThreads;
    constexpr int kNElts = Ktraits::kNElts;
    constexpr int kNThreadsPerC = Ktraits::kNThreadsPerRow;
    constexpr int kLPerLoad = Ktraits::kNColsPerLoad;
    constexpr int kChunkSizeL = Ktraits::kChunkSizeL;
    constexpr int kChunkSizeC = Ktraits::kNEltsPerRow;
    using input_t = typename Ktraits::input_t;
    using vec_t = typename Ktraits::vec_t;
    using weight_t = typename Ktraits::weight_t;
    // Shared memory.
    __shared__ input_t x_smem[kWidth - 1 + kChunkSizeL][kChunkSizeC + kNElts];
    const int batch_id = blockIdx.x;
    const int chunk_l_id = blockIdx.y;
    const int chunk_c_id = blockIdx.z;
    const int tid = threadIdx.x;
    const int l_idx = tid / kNThreadsPerC;
    const int c_idx = tid % kNThreadsPerC;
    input_t *x = reinterpret_cast<input_t *>(params.x_ptr) + batch_id * params.x_batch_stride
        + (chunk_l_id * kChunkSizeL + l_idx) * params.x_l_stride + chunk_c_id * kChunkSizeC + c_idx * kNElts;
    weight_t *weight = reinterpret_cast<weight_t *>(params.weight_ptr)
        + chunk_c_id * kChunkSizeC * params.weight_c_stride;
    input_t *out = reinterpret_cast<input_t *>(params.out_ptr) + batch_id * params.out_batch_stride
        + (chunk_l_id * kChunkSizeL + l_idx) * params.out_l_stride + chunk_c_id * kChunkSizeC + c_idx * kNElts;
    int *seq_idx = !kHasSeqIdx ? nullptr : reinterpret_cast<int *>(params.seq_idx_ptr)
        + batch_id * params.seqlen + chunk_l_id * kChunkSizeL;
    input_t *initial_states = params.initial_states_ptr == nullptr || chunk_l_id > 0 ? nullptr
        : reinterpret_cast<input_t *>(params.initial_states_ptr) + batch_id * params.initial_states_batch_stride + l_idx * params.initial_states_l_stride + chunk_c_id * kChunkSizeC + c_idx * kNElts;
    // The last L-chunk will also have enough info to write to final states, since it also contain a few x values
    // from the previous L-chunk.
    input_t *final_states = params.final_states_ptr == nullptr || chunk_l_id < gridDim.y - 1 ? nullptr
        : reinterpret_cast<input_t *>(params.final_states_ptr) + batch_id * params.final_states_batch_stride + l_idx * params.final_states_l_stride + chunk_c_id * kChunkSizeC + c_idx * kNElts;
    #pragma unroll
    for (int l = 0; l < Ktraits::kNLoads; ++l) {
        input_t x_vals_load[kNElts] = {0};
        if (chunk_l_id * kChunkSizeL + l * kLPerLoad + l_idx < params.seqlen
            && chunk_c_id * kChunkSizeC + c_idx * kNElts < params.dim) {
            reinterpret_cast<vec_t *>(x_vals_load)[0] = *reinterpret_cast<vec_t *>(x + l * kLPerLoad * params.x_l_stride);
        }
        reinterpret_cast<vec_t *>(x_smem[kWidth - 1 + l * kLPerLoad + l_idx])[c_idx] = reinterpret_cast<vec_t *>(x_vals_load)[0];
    }
    // Load the elements from the previous chunk that are needed for convolution.
    if (l_idx < kWidth - 1) {
        input_t x_vals_load[kNElts] = {0};
        if (chunk_l_id * kChunkSizeL + l_idx - (kWidth - 1) >= 0
            && chunk_l_id * kChunkSizeL + l_idx - (kWidth - 1) < params.seqlen
            && chunk_c_id * kChunkSizeC + c_idx * kNElts < params.dim) {
            reinterpret_cast<vec_t *>(x_vals_load)[0] = *reinterpret_cast<vec_t *>(x - (kWidth - 1) * params.x_l_stride);
        } else if (initial_states != nullptr
                   && chunk_l_id * kChunkSizeL + l_idx - (kWidth - 1) < 0
                   && chunk_c_id * kChunkSizeC + c_idx * kNElts < params.dim) {
            reinterpret_cast<vec_t *>(x_vals_load)[0] = *reinterpret_cast<vec_t *>(initial_states);
        }
        reinterpret_cast<vec_t *>(x_smem[l_idx])[c_idx] = reinterpret_cast<vec_t *>(x_vals_load)[0];
    }
    __syncthreads();
    if (final_states != nullptr
        && l_idx < kWidth - 1
        && chunk_c_id * kChunkSizeC + c_idx * kNElts < params.dim) {
        // x_smem[0] contains element at index chunk_l_id * kChunkSizeL - (kWidth - 1)
        // So last few elements (index params.seqlen - kWidth + 1 + l_idx) are stored in x_smem[params.seqlen - kWidth + 1 + l_idx - (chunk_l_id * kChunkSizeL - kWidth + 1)][c_idx]
        *reinterpret_cast<vec_t *>(final_states) = reinterpret_cast<vec_t *>(x_smem[params.seqlen + l_idx - chunk_l_id * kChunkSizeL])[c_idx];
    }
    constexpr int kLPerThread = constexpr_min(kChunkSizeL * kChunkSizeC / kNThreads, kChunkSizeL);
    static_assert(kLPerThread * kNThreads == kChunkSizeL * kChunkSizeC);
    constexpr int kNThreadsPerRow = kChunkSizeL / kLPerThread;
    static_assert(kNThreadsPerRow * kLPerThread == kChunkSizeL);
    // kChunkSizeL, kLPerThread, kNThreadsPerRow should be powers of 2 for simplicity
    static_assert((kChunkSizeL & (kChunkSizeL - 1)) == 0);
    static_assert((kLPerThread & (kLPerThread - 1)) == 0);
    static_assert((kNThreadsPerRow & (kNThreadsPerRow - 1)) == 0);
    static_assert(kNThreadsPerRow <= 32);
    const int row_idx = tid / kNThreadsPerRow;
    const int col_idx = tid % kNThreadsPerRow;
    float bias_val = params.bias_ptr == nullptr || chunk_c_id * kChunkSizeC + row_idx >= params.dim ? 0.f : float(reinterpret_cast<weight_t *>(params.bias_ptr)[chunk_c_id * kChunkSizeC + row_idx]);
    float weight_vals[kWidth] = {0};
    if (chunk_c_id * kChunkSizeC + row_idx < params.dim) {
        #pragma unroll
        for (int w = 0; w < kWidth; ++w) {
            weight_vals[w] = weight[row_idx * params.weight_c_stride + w * params.weight_width_stride];
        }
    }
    float x_vals[kWidth - 1 + kLPerThread];
    #pragma unroll
    for (int i = 0; i < kWidth - 1 + kLPerThread; ++i) {
        x_vals[i] = float(x_smem[col_idx * kLPerThread + i][row_idx]);
    }
    int seq_idx_thread[kWidth - 1 + kLPerThread];
    if constexpr (kHasSeqIdx) {
        #pragma unroll
        for (int i = 0; i < kWidth - 1 + kLPerThread; ++i) {
            seq_idx_thread[i] = chunk_l_id * kChunkSizeL + col_idx * kLPerThread + i - (kWidth - 1) >= 0 ? seq_idx[col_idx * kLPerThread + i - (kWidth - 1)] : -1;
        }
    }
    float out_vals[kLPerThread];
    #pragma unroll
    for (int i = 0; i < kLPerThread; ++i) {
        out_vals[i] = bias_val;
        const int seq_idx_cur = !kHasSeqIdx ? 0 : seq_idx_thread[i + kWidth - 1];
        #pragma unroll
        for (int w = 0; w < kWidth; ++w) {
            if constexpr (!kHasSeqIdx) {
                out_vals[i] += weight_vals[w] * x_vals[i + w];
            } else {
                out_vals[i] += seq_idx_thread[i + w] == seq_idx_cur ? weight_vals[w] * x_vals[i + w] : 0.f;
            }
        }
        if (params.silu_activation) {out_vals[i] = out_vals[i] / (1 + expf(-out_vals[i])); }
    }
    __syncthreads();
    #pragma unroll
    for (int i = 0; i < kLPerThread; ++i) { x_smem[col_idx * kLPerThread + i][row_idx] = out_vals[i]; }
    __syncthreads();
    #pragma unroll
    for (int l = 0; l < Ktraits::kNLoads; ++l) {
        input_t out_vals_store[kNElts];
        reinterpret_cast<vec_t *>(out_vals_store)[0] = reinterpret_cast<vec_t *>(x_smem[l * kLPerLoad + l_idx])[c_idx];
        if (chunk_l_id * kChunkSizeL + l * kLPerLoad + l_idx < params.seqlen
            && chunk_c_id * kChunkSizeC + c_idx * kNElts < params.dim) {
            *reinterpret_cast<vec_t *>(out + l * kLPerLoad * params.out_l_stride) = reinterpret_cast<vec_t *>(out_vals_store)[0];
        }
    }
 }
 template<int kNThreads, int kWidth, typename input_t, typename weight_t>
 void causal_conv1d_channellast_fwd_launch(ConvParamsBase &params, cudaStream_t stream) {
    BOOL_SWITCH(params.seq_idx_ptr != nullptr, kHasSeqIdx, [&] {
        using Ktraits = Causal_conv1d_channellast_fwd_kernel_traits<kNThreads, kWidth, 64, true, input_t, weight_t>;
        // constexpr int kSmemSize = Ktraits::kSmemSize;
        constexpr int kChunkSizeL = Ktraits::kChunkSizeL;
        constexpr int kChunkSizeC = Ktraits::kNEltsPerRow;
        const int n_chunks_L = (params.seqlen + kChunkSizeL - 1) / kChunkSizeL;
        const int n_chunks_C = (params.dim + kChunkSizeC - 1) / kChunkSizeC;
        dim3 grid(params.batch, n_chunks_L, n_chunks_C);
        dim3 block(Ktraits::kNThreads);
        auto kernel = &causal_conv1d_channellast_fwd_kernel<Ktraits, kHasSeqIdx>;
        // if (kSmemSize >= 48 * 1024) {
        //     C10_CUDA_CHECK(cudaFuncSetAttribute(
        //         kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize));
        //     }
        // kernel<<<grid, Ktraits::kNThreads, kSmemSize, stream>>>(params);
        kernel<<<grid, Ktraits::kNThreads, 0, stream>>>(params);
        C10_CUDA_KERNEL_LAUNCH_CHECK();
    });
 }
 template<typename input_t, typename weight_t>
 void causal_conv1d_channellast_fwd_cuda(ConvParamsBase &params, cudaStream_t stream) {
    if (params.width == 2) {
        causal_conv1d_channellast_fwd_launch<128, 2, input_t, weight_t>(params, stream);
    } else if (params.width == 3) {
        causal_conv1d_channellast_fwd_launch<128, 3, input_t, weight_t>(params, stream);
    } else if (params.width == 4) {
        causal_conv1d_channellast_fwd_launch<128, 4, input_t, weight_t>(params, stream);
    }
 }
 template void causal_conv1d_fwd_cuda<float, float>(ConvParamsBase &params, cudaStream_t stream);
 template void causal_conv1d_fwd_cuda<at::Half, at::Half>(ConvParamsBase &params, cudaStream_t stream);
 template void causal_conv1d_fwd_cuda<at::BFloat16, at::BFloat16>(ConvParamsBase &params, cudaStream_t stream);
 template void causal_conv1d_channellast_fwd_cuda<float, float>(ConvParamsBase &params, cudaStream_t stream);
 template void causal_conv1d_channellast_fwd_cuda<at::Half, at::Half>(ConvParamsBase &params, cudaStream_t stream);
 template void causal_conv1d_channellast_fwd_cuda<at::BFloat16, at::BFloat16>(ConvParamsBase &params, cudaStream_t stream);
 ///////
@ -649,7 +507,7 @@ struct Causal_conv1d_update_kernel_traits {
    static_assert(kNBytes == 2 || kNBytes == 4);
 };
-template<typename Ktraits>
+template<typename Ktraits, bool kIsCircularBuffer>
 __global__ __launch_bounds__(Ktraits::kNThreads)
 void causal_conv1d_update_kernel(ConvParamsBase params) {
    constexpr int kWidth = Ktraits::kWidth;
@ -660,6 +518,8 @@ void causal_conv1d_update_kernel(ConvParamsBase params) {
    const int tidx = threadIdx.x;
    const int batch_id = blockIdx.x;
    const int channel_id = blockIdx.y * kNThreads + tidx;
    if (channel_id >= params.dim) return;
    input_t *x = reinterpret_cast<input_t *>(params.x_ptr) + batch_id * params.x_batch_stride
        + channel_id * params.x_c_stride;
@ -675,35 +535,70 @@ void causal_conv1d_update_kernel(ConvParamsBase params) {
    weight_t *weight = reinterpret_cast<weight_t *>(params.weight_ptr) + channel_id * params.weight_c_stride;
    input_t *out = reinterpret_cast<input_t *>(params.out_ptr) + batch_id * params.out_batch_stride
        + channel_id * params.out_c_stride;
-    float bias_val = params.bias_ptr == nullptr || channel_id >= params.dim ? 0.f : float(reinterpret_cast<weight_t *>(params.bias_ptr)[channel_id]);
+    float bias_val = params.bias_ptr == nullptr ? 0.f : float(reinterpret_cast<weight_t *>(params.bias_ptr)[channel_id]);
    int state_len = params.conv_state_len;
    int advance_len = params.seqlen;
    int cache_seqlen = kIsCircularBuffer ? params.cache_seqlens[batch_id] % state_len : 0;
    int update_idx = cache_seqlen - (kWidth - 1);
    update_idx = update_idx < 0 ? update_idx + state_len : update_idx;
    float weight_vals[kWidth] = {0};
    if (channel_id < params.dim) {
    #pragma unroll
    for (int i = 0; i < kWidth; ++i) { weight_vals[i] = float(weight[i * params.weight_width_stride]); }
    }
    float x_vals[kWidth] = {0};
-    if (channel_id < params.dim) {
+    if constexpr (!kIsCircularBuffer) {
-        #pragma unroll
+        #pragma unroll 2
-        for (int i = 0; i < kWidth - 1; ++i) { x_vals[i] = float(conv_state[(i + 1) * params.conv_state_l_stride]); }
+        for (int i = 0; i < state_len - advance_len - (kWidth - 1); ++i) {
-        x_vals[kWidth - 1] = float(x[0]);
+            conv_state[i * params.conv_state_l_stride] = conv_state[(i + advance_len) * params.conv_state_l_stride];
        #pragma unroll
        for (int i = 0; i < kWidth; ++i) { conv_state[i * params.conv_state_l_stride] = input_t(x_vals[i]); }
        }
-
+        #pragma unroll
        for (int i = 0; i < kWidth - 1; ++i) {
            input_t state_val = conv_state[(state_len - (kWidth - 1) + i) * params.conv_state_l_stride];
            if (i < advance_len + (kWidth - 1) && state_len - advance_len - (kWidth - 1) + i >= 0) {
                conv_state[(state_len - advance_len - (kWidth - 1) + i) * params.conv_state_l_stride] = state_val;
            }
            x_vals[i] = float(state_val);
        }
    } else {
        #pragma unroll
        for (int i = 0; i < kWidth - 1; ++i, update_idx = update_idx + 1 >= state_len ? update_idx + 1 - state_len : update_idx + 1) {
            input_t state_val = conv_state[update_idx * params.conv_state_l_stride];
            x_vals[i] = float(state_val);
        }
    }
    #pragma unroll 2
    for (int i = 0; i < params.seqlen; ++i) {
        input_t x_val = x[i * params.x_l_stride];
        if constexpr (!kIsCircularBuffer) {
            if (i < advance_len && state_len - advance_len + i >= 0) {
                conv_state[(state_len - advance_len + i) * params.conv_state_l_stride] = x_val;
            }
        } else {
            conv_state[update_idx * params.conv_state_l_stride] = x_val;
            ++update_idx;
            update_idx = update_idx >= state_len ? update_idx - state_len : update_idx;
        }
        x_vals[kWidth - 1] = float(x_val);
        float out_val = bias_val;
        #pragma unroll
-    for (int i = 0; i < kWidth; ++i) { out_val += weight_vals[i] * x_vals[i]; }
+        for (int j = 0; j < kWidth; ++j) { out_val += weight_vals[j] * x_vals[j]; }
        if (params.silu_activation) { out_val = out_val / (1 + expf(-out_val)); }
-    if (channel_id < params.dim) { out[0] = input_t(out_val); }
+        out[i * params.out_l_stride] = input_t(out_val);
        // Shift the input buffer by 1
        #pragma unroll
        for (int i = 0; i < kWidth - 1; ++i) { x_vals[i] = x_vals[i + 1]; }
    }
 }
 template<int kNThreads, int kWidth, typename input_t, typename weight_t>
 void causal_conv1d_update_launch(ConvParamsBase &params, cudaStream_t stream) {
    using Ktraits = Causal_conv1d_update_kernel_traits<kNThreads, kWidth, input_t, weight_t>;
    dim3 grid(params.batch, (params.dim + kNThreads - 1) / kNThreads);
-    auto kernel = &causal_conv1d_update_kernel<Ktraits>;
+    auto kernel = params.cache_seqlens == nullptr
        ? &causal_conv1d_update_kernel<Ktraits, false>
        : &causal_conv1d_update_kernel<Ktraits, true>;
    kernel<<<grid, Ktraits::kNThreads, 0, stream>>>(params);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
 }
--- a/csrc/mamba/causal_conv1d/causal_conv1d.h
+++ b/csrc/mamba/causal_conv1d/causal_conv1d.h
@ -24,6 +24,7 @@ struct ConvParamsBase {
    index_t out_c_stride;
    index_t out_l_stride;
    int conv_state_len;
    index_t conv_state_batch_stride;
    index_t conv_state_c_stride;
    index_t conv_state_l_stride;
@ -35,6 +36,10 @@ struct ConvParamsBase {
    void *__restrict__ out_ptr;
    void *__restrict__ conv_state_ptr;
    void *__restrict__ query_start_loc_ptr;
    void *__restrict__ has_initial_state_ptr;
    void *__restrict__ cache_indices_ptr;
    int32_t *__restrict__ cache_seqlens;
    // For the continuous batching case. Makes it so that the mamba state for 
    // the current batch doesn't need to be a contiguous tensor.
@ -52,6 +57,11 @@ struct ConvParamsBase {
    index_t final_states_batch_stride;
    index_t final_states_l_stride;
    index_t final_states_c_stride;
    void *  conv_states_ptr;
    index_t conv_states_batch_stride;
    index_t conv_states_l_stride;
    index_t conv_states_c_stride;
 };
--- a/csrc/mamba/mamba_ssm/selective_scan.h
+++ b/csrc/mamba/mamba_ssm/selective_scan.h
@ -54,10 +54,14 @@ struct SSMParamsBase {
    void *__restrict__ delta_ptr;
    void *__restrict__ delta_bias_ptr;
    void *__restrict__ out_ptr;
-    void *__restrict__ x_ptr;
+    void *__restrict__ ssm_states_ptr;
    void *__restrict__ z_ptr;
    void *__restrict__ out_z_ptr;
-    void *__restrict__ index_ptr;
+
    void *__restrict__ query_start_loc_ptr;
    void *__restrict__ cache_indices_ptr;
    void *__restrict__ has_initial_state_ptr;
 };
@ -201,7 +205,7 @@ inline __device__ void load_input(typename Ktraits::input_t *u,
                                  typename Ktraits::input_t (&u_vals)[Ktraits::kNItems],
                                  typename Ktraits::BlockLoadT::TempStorage &smem_load,
                                  int seqlen) {
-    if constexpr (Ktraits::kIsEvenLen) {
+    if constexpr (Ktraits::kIsEvenLen && !Ktraits::kVarlen) {
        auto& smem_load_vec = reinterpret_cast<typename Ktraits::BlockLoadVecT::TempStorage&>(smem_load);
        using vec_t = typename Ktraits::vec_t;
        typename Ktraits::BlockLoadVecT(smem_load_vec).Load(
@ -217,21 +221,6 @@ inline __device__ void load_input(typename Ktraits::input_t *u,
    }
 }
 template<typename Ktraits>
 inline __device__ void load_index(int *u,
                                  int (&u_vals)[Ktraits::kNItems],
                                  typename Ktraits::BlockLoadIndexT::TempStorage &smem_load_index,
                                  int seqlen) {
    if constexpr (Ktraits::kIsEvenLen) {
        auto& smem_load_index_vec = reinterpret_cast<typename Ktraits::BlockLoadIndexVecT::TempStorage&>(smem_load_index);
        Ktraits::BlockLoadIndexVecT(smem_load_index_vec).Load(
            reinterpret_cast<uint4*>(u),
            reinterpret_cast<uint4(&)[Ktraits::kNLoadsIndex]>(u_vals)
       );
    } else {
        Ktraits::BlockLoadIndexT(smem_load_index).Load(u, u_vals, seqlen, 0);
    }
 }
 template<typename Ktraits>
 inline __device__ void load_weight(typename Ktraits::input_t *Bvar,
@ -240,7 +229,7 @@ inline __device__ void load_weight(typename Ktraits::input_t *Bvar,
                                   int seqlen) {
    constexpr int kNItems = Ktraits::kNItems;
    typename Ktraits::input_t B_vals_load[kNItems];
-    if constexpr (Ktraits::kIsEvenLen) {
+    if constexpr (Ktraits::kIsEvenLen && !Ktraits::kVarlen) {
        auto& smem_load_weight_vec = reinterpret_cast<typename Ktraits::BlockLoadWeightVecT::TempStorage&>(smem_load_weight);
        using vec_t = typename Ktraits::vec_t;
        typename Ktraits::BlockLoadWeightVecT(smem_load_weight_vec).Load(
@ -263,7 +252,7 @@ inline __device__ void store_output(typename Ktraits::input_t *out,
    typename Ktraits::input_t write_vals[Ktraits::kNItems];
    #pragma unroll
    for (int i = 0; i < Ktraits::kNItems; ++i) { write_vals[i] = out_vals[i]; }
-    if constexpr (Ktraits::kIsEvenLen) {
+    if constexpr (Ktraits::kIsEvenLen && !Ktraits::kVarlen) {
        auto& smem_store_vec = reinterpret_cast<typename Ktraits::BlockStoreVecT::TempStorage&>(smem_store);
        using vec_t = typename Ktraits::vec_t;
        typename Ktraits::BlockStoreVecT(smem_store_vec).Store(
--- a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu
+++ b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu
@ -23,7 +23,7 @@
 template<int kNThreads_, int kNItems_, int kNRows_, bool kIsEvenLen_,
         bool kIsVariableB_, bool kIsVariableC_,
-         bool kHasZ_, bool kUseIndex_, typename input_t_, typename weight_t_>
+         bool kHasZ_, bool kVarlen_, typename input_t_, typename weight_t_>
 struct Selective_Scan_fwd_kernel_traits {
    static_assert(kNItems_ % 4 == 0);
    using input_t = input_t_;
@ -38,22 +38,19 @@ struct Selective_Scan_fwd_kernel_traits {
    static constexpr int kNElts = kNBytes == 4 ? 4 : constexpr_min(8, kNItems);
    static_assert(kNItems % kNElts == 0);
    static constexpr int kNLoads = kNItems / kNElts;
-    static constexpr bool kIsEvenLen = kIsEvenLen_;
+    static constexpr bool kIsEvenLen = kVarlen_ ? false : kIsEvenLen_;
    static constexpr bool kIsVariableB = kIsVariableB_;
    static constexpr bool kIsVariableC = kIsVariableC_;
    static constexpr bool kHasZ = kHasZ_;
-    static constexpr bool kUseIndex = kUseIndex_;
+    static constexpr bool kVarlen = kVarlen_;
-    static constexpr bool kDirectIO = kIsEvenLen && kNLoads == 1;
+    static constexpr bool kDirectIO = kVarlen_ ? false : kIsEvenLen && kNLoads == 1;
    static constexpr int kNLoadsIndex = kNItems / 4;
    using vec_t = typename BytesToType<kNBytes * kNElts>::Type;
    using scan_t = float2;
    using BlockLoadT = cub::BlockLoad<input_t, kNThreads, kNItems, cub::BLOCK_LOAD_WARP_TRANSPOSE>;
    using BlockLoadVecT = cub::BlockLoad<vec_t, kNThreads, kNLoads,
        !kDirectIO ? cub::BLOCK_LOAD_WARP_TRANSPOSE : cub::BLOCK_LOAD_DIRECT>;
    using BlockLoadIndexT = cub::BlockLoad<int, kNThreads, kNItems, cub::BLOCK_LOAD_WARP_TRANSPOSE>;
    using BlockLoadIndexVecT = cub::BlockLoad<uint4, kNThreads, kNLoadsIndex,
        !(kIsEvenLen && kNLoadsIndex == 1) ? cub::BLOCK_LOAD_WARP_TRANSPOSE : cub::BLOCK_LOAD_DIRECT>;
    using BlockLoadWeightT = cub::BlockLoad<input_t, kNThreads, kNItems , cub::BLOCK_LOAD_WARP_TRANSPOSE>;
    using BlockLoadWeightVecT = cub::BlockLoad<vec_t, kNThreads, kNLoads ,
        !kDirectIO ? cub::BLOCK_LOAD_WARP_TRANSPOSE  : cub::BLOCK_LOAD_DIRECT>;
@ -65,8 +62,6 @@ struct Selective_Scan_fwd_kernel_traits {
    using BlockScanT = cub::BlockScan<scan_t, kNThreads, cub::BLOCK_SCAN_WARP_SCANS>;
    static constexpr int kSmemIOSize = custom_max({sizeof(typename BlockLoadT::TempStorage),
                                                 sizeof(typename BlockLoadVecT::TempStorage),
                                                 sizeof(typename BlockLoadIndexT::TempStorage),
                                                 sizeof(typename BlockLoadIndexVecT::TempStorage),
                                                 (int(kIsVariableB) + int(kIsVariableC)) * sizeof(typename BlockLoadWeightT::TempStorage),
                                                 (int(kIsVariableB) + int(kIsVariableC)) * sizeof(typename BlockLoadWeightVecT::TempStorage),
                                                 sizeof(typename BlockStoreT::TempStorage),
@ -80,7 +75,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
    constexpr bool kIsVariableB = Ktraits::kIsVariableB;
    constexpr bool kIsVariableC = Ktraits::kIsVariableC;
    constexpr bool kHasZ = Ktraits::kHasZ;
-    constexpr bool kUseIndex = Ktraits::kUseIndex;
+    constexpr bool kVarlen = Ktraits::kVarlen;
    constexpr int kNThreads = Ktraits::kNThreads;
    constexpr int kNItems = Ktraits::kNItems;
    constexpr int kNRows = Ktraits::kNRows;
@ -97,7 +92,6 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
    // auto& smem_load = reinterpret_cast<typename BlockLoadT::TempStorage&>(smem_loadstorescan);
    auto& smem_load = reinterpret_cast<typename Ktraits::BlockLoadT::TempStorage&>(smem_);
    auto& smem_load_weight = reinterpret_cast<typename Ktraits::BlockLoadWeightT::TempStorage&>(smem_);
    auto& smem_load_index = reinterpret_cast<typename Ktraits::BlockLoadIndexT::TempStorage&>(smem_);
    auto& smem_load_weight1 = *reinterpret_cast<typename Ktraits::BlockLoadWeightT::TempStorage*>(smem_ + sizeof(typename Ktraits::BlockLoadWeightT::TempStorage));
    auto& smem_store = reinterpret_cast<typename Ktraits::BlockStoreT::TempStorage&>(smem_);
    auto& smem_scan = *reinterpret_cast<typename Ktraits::BlockScanT::TempStorage*>(smem_ + Ktraits::kSmemIOSize);
@ -108,17 +102,29 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
    const int batch_id = blockIdx.x;
    const int dim_id = blockIdx.y;
    const int group_id = dim_id / (params.dim_ngroups_ratio);
-    input_t *u = reinterpret_cast<input_t *>(params.u_ptr) + batch_id * params.u_batch_stride
+    int seqlen = params.seqlen;
    int sequence_start_index = batch_id;
    if constexpr (kVarlen){
        int *query_start_loc = reinterpret_cast<int *>(params.query_start_loc_ptr);
        sequence_start_index = query_start_loc[batch_id];
        seqlen = query_start_loc[batch_id + 1] - sequence_start_index;
    }
    const bool has_initial_state = params.has_initial_state_ptr == nullptr ? false
        : reinterpret_cast<bool *>(params.has_initial_state_ptr)[batch_id];
    const int* cache_indices = params.cache_indices_ptr == nullptr ? nullptr
        : reinterpret_cast<int *>(params.cache_indices_ptr);
    const int cache_index = cache_indices == nullptr ? batch_id : cache_indices[batch_id];
    input_t *u = reinterpret_cast<input_t *>(params.u_ptr) + sequence_start_index * params.u_batch_stride
        + dim_id * kNRows * params.u_d_stride;
-    input_t *delta = reinterpret_cast<input_t *>(params.delta_ptr) + batch_id * params.delta_batch_stride
+    input_t *delta = reinterpret_cast<input_t *>(params.delta_ptr) + sequence_start_index * params.delta_batch_stride
        + dim_id * kNRows * params.delta_d_stride;
    weight_t *A = reinterpret_cast<weight_t *>(params.A_ptr) + dim_id * kNRows * params.A_d_stride;
    weight_t *B = reinterpret_cast<weight_t *>(params.B_ptr) + dim_id * kNRows * params.B_d_stride;
-    input_t *Bvar = reinterpret_cast<input_t *>(params.B_ptr) + batch_id * params.B_batch_stride + group_id * params.B_group_stride;
+    input_t *Bvar = reinterpret_cast<input_t *>(params.B_ptr) + sequence_start_index * params.B_batch_stride + group_id * params.B_group_stride;
    weight_t *C = reinterpret_cast<weight_t *>(params.C_ptr) + dim_id * kNRows * params.C_d_stride;
-    input_t *Cvar = reinterpret_cast<input_t *>(params.C_ptr) + batch_id * params.C_batch_stride + group_id * params.C_group_stride;
+    input_t *Cvar = reinterpret_cast<input_t *>(params.C_ptr) + sequence_start_index * params.C_batch_stride + group_id * params.C_group_stride;
-    scan_t *x = reinterpret_cast<scan_t *>(params.x_ptr) + (batch_id * params.dim + dim_id * kNRows) * params.n_chunks * params.dstate;
+    input_t *ssm_states = reinterpret_cast<input_t *>(params.ssm_states_ptr) + (cache_index * params.dim + dim_id * kNRows) * params.dstate;
    int *index = !kUseIndex ? nullptr :reinterpret_cast<int *>(params.index_ptr) + batch_id * params.seqlen;
    float D_val[kNRows] = {0};
    if (params.D_ptr != nullptr) {
@ -142,9 +148,9 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
    // }
    constexpr int kChunkSize = kNThreads * kNItems;
-    for (int chunk = 0; chunk < params.n_chunks; ++chunk) {
+    const int n_chunks = (seqlen + 2048 - 1) / 2048;
    for (int chunk = 0; chunk < n_chunks; ++chunk) {
        input_t u_vals[kNRows][kNItems], delta_vals_load[kNRows][kNItems];
        int index_vals_load[kNRows][kNItems];
        __syncthreads();
        #pragma unroll
@ -152,15 +158,9 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
            if constexpr (!kDirectIO) {
                if (r > 0) { __syncthreads(); }
            }
-            load_input<Ktraits>(u + r * params.u_d_stride, u_vals[r], smem_load, params.seqlen - chunk * kChunkSize);
+            load_input<Ktraits>(u + r * params.u_d_stride, u_vals[r], smem_load, seqlen - chunk * kChunkSize);
            if constexpr (!kDirectIO) { __syncthreads(); }
-            load_input<Ktraits>(delta + r * params.delta_d_stride, delta_vals_load[r], smem_load, params.seqlen - chunk * kChunkSize);
+            load_input<Ktraits>(delta + r * params.delta_d_stride, delta_vals_load[r], smem_load, seqlen - chunk * kChunkSize);
            if constexpr (kUseIndex) {
                load_index<Ktraits>(index + r * params.delta_d_stride, index_vals_load[r], smem_load_index, params.seqlen - chunk * kChunkSize);
            }
        }
        if constexpr (kUseIndex) {
            index += kChunkSize;
        }
        u += kChunkSize;
        delta += kChunkSize;
@ -197,7 +197,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
            weight_t B_vals[kNItems], C_vals[kNItems];
            if constexpr (kIsVariableB) {
                load_weight<Ktraits>(Bvar + state_idx * params.B_dstate_stride, B_vals,
-                    smem_load_weight, (params.seqlen - chunk * kChunkSize) * (1));
+                    smem_load_weight, (seqlen - chunk * kChunkSize) * (1));
                if constexpr (!kIsVariableC) {
                    #pragma unroll
                    for (int r = 0; r < kNRows; ++r) {
@ -208,7 +208,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
            if constexpr (kIsVariableC) {
                auto &smem_load_weight_C = !kIsVariableB ? smem_load_weight : smem_load_weight1;
                load_weight<Ktraits>(Cvar + state_idx * params.C_dstate_stride, C_vals,
-                    smem_load_weight_C, (params.seqlen - chunk * kChunkSize) * (1 ));
+                    smem_load_weight_C, (seqlen - chunk * kChunkSize) * (1 ));
                if constexpr (!kIsVariableB) {
                    #pragma unroll
                    for (int r = 0; r < kNRows; ++r) {
@ -232,24 +232,16 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
                    thread_data[i] = make_float2(exp2f(delta_vals[r][i] * A_val[r]),
                                                 !kIsVariableB ? delta_u_vals[r][i] : B_vals[i] * delta_u_vals[r][i]);
-                    // Reset A bar for cumulative sequences (Real)
+                    if (seqlen % (kNItems * kNThreads) != 0) {  // So that the last state is correct
-                    if constexpr (kUseIndex) {
+                        if (threadIdx.x * kNItems + i >= seqlen - chunk * kChunkSize) {
                        if (index_vals_load[r][i] == 0) {
                            thread_data[i].x = 0.f;
                        }
                    }
                    if constexpr (!Ktraits::kIsEvenLen) {  // So that the last state is correct
                        if (threadIdx.x * kNItems + i >= params.seqlen - chunk * kChunkSize) {
                            thread_data[i] = make_float2(1.f, 0.f);
                        }
                    }
                }
                // Initialize running total
-                scan_t running_prefix;
+
-                    // If we use WARP_SCAN then all lane 0 of all warps (not just thread 0) needs to read
+                scan_t running_prefix = chunk > 0 ? smem_running_prefix[state_idx + r * MAX_DSTATE] : make_float2(1.0, has_initial_state ? float(ssm_states[state_idx]): 0.0);
-                running_prefix = chunk == 0 ? x[(r * params.n_chunks) * params.dstate + state_idx] : ( threadIdx.x % 32 == 0 ? smem_running_prefix[state_idx + r * MAX_DSTATE] : make_float2(1.f, 0.f));
+
                    // running_prefix = chunk > 0 && threadIdx.x == 0 ? smem_running_prefix[state_idx] : make_float2(1.f, 0.f);
                SSMScanPrefixCallbackOp<weight_t> prefix_op(running_prefix);
                typename Ktraits::BlockScanT(smem_scan).InclusiveScan(
                    thread_data, thread_data, SSMScanOp<weight_t>(), prefix_op
@ -258,7 +250,9 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
                // Unless there's only 1 warp, but then it's the same thread (0) reading and writing.
                if (threadIdx.x == 0) {
                    smem_running_prefix[state_idx] = prefix_op.running_prefix;
-                    x[(r * params.n_chunks + chunk) * params.dstate + state_idx] = prefix_op.running_prefix;
+                    if (chunk == n_chunks - 1) {
                        ssm_states[state_idx] = input_t(prefix_op.running_prefix.y);
                    }
                }
                #pragma unroll
                for (int i = 0; i < kNItems; ++i) {
@ -270,7 +264,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
            }
        }
-        input_t *out = reinterpret_cast<input_t *>(params.out_ptr) + batch_id * params.out_batch_stride
+        input_t *out = reinterpret_cast<input_t *>(params.out_ptr) + sequence_start_index * params.out_batch_stride
            + dim_id * kNRows * params.out_d_stride + chunk * kChunkSize;
        __syncthreads();
        #pragma unroll
@ -278,26 +272,26 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
            if constexpr (!kDirectIO) {
                if (r > 0) { __syncthreads(); }
            }
-            store_output<Ktraits>(out + r * params.out_d_stride, out_vals[r], smem_store, params.seqlen - chunk * kChunkSize);
+            store_output<Ktraits>(out + r * params.out_d_stride, out_vals[r], smem_store, seqlen - chunk * kChunkSize);
        }
        if constexpr (kHasZ) {
-            input_t *z = reinterpret_cast<input_t *>(params.z_ptr) + batch_id * params.z_batch_stride
+            input_t *z = reinterpret_cast<input_t *>(params.z_ptr) + sequence_start_index * params.z_batch_stride
                + dim_id * kNRows * params.z_d_stride + chunk * kChunkSize;
-            input_t *out_z = reinterpret_cast<input_t *>(params.out_z_ptr) + batch_id * params.out_z_batch_stride
+            input_t *out_z = reinterpret_cast<input_t *>(params.out_z_ptr) + sequence_start_index * params.out_z_batch_stride
                + dim_id * kNRows * params.out_z_d_stride + chunk * kChunkSize;
            #pragma unroll
            for (int r = 0; r < kNRows; ++r) {
                input_t z_vals[kNItems];
                __syncthreads();
-                load_input<Ktraits>(z + r * params.z_d_stride, z_vals, smem_load, params.seqlen - chunk * kChunkSize);
+                load_input<Ktraits>(z + r * params.z_d_stride, z_vals, smem_load, seqlen - chunk * kChunkSize);
                #pragma unroll
                for (int i = 0; i < kNItems; ++i) {
                    float z_val = z_vals[i];
                    out_vals[r][i] *= z_val / (1 + expf(-z_val));
                }
                __syncthreads();
-                store_output<Ktraits>(out_z + r * params.out_z_d_stride, out_vals[r], smem_store, params.seqlen - chunk * kChunkSize);
+                store_output<Ktraits>(out_z + r * params.out_z_d_stride, out_vals[r], smem_store, seqlen - chunk * kChunkSize);
            }
        }
@ -316,8 +310,8 @@ void selective_scan_fwd_launch(SSMParamsBase &params, cudaStream_t stream) {
    constexpr bool kIsVariableC = true;
    constexpr bool kHasZ = true;
    BOOL_SWITCH(params.seqlen % (kNThreads * kNItems) == 0, kIsEvenLen, [&] {
-        BOOL_SWITCH(params.index_ptr != nullptr , kUseIndex, [&] {
+        BOOL_SWITCH(params.query_start_loc_ptr != nullptr , kVarlen, [&] {
-            using Ktraits = Selective_Scan_fwd_kernel_traits<kNThreads, kNItems, kNRows, kIsEvenLen, kIsVariableB, kIsVariableC, kHasZ,  kUseIndex, input_t, weight_t>;
+            using Ktraits = Selective_Scan_fwd_kernel_traits<kNThreads, kNItems, kNRows, kIsEvenLen, kIsVariableB, kIsVariableC, kHasZ,  kVarlen, input_t, weight_t>;
            constexpr int kSmemSize = Ktraits::kSmemSize + kNRows * MAX_DSTATE * sizeof(typename Ktraits::scan_t);
            dim3 grid(params.batch, params.dim / kNRows);
            auto kernel = &selective_scan_fwd_kernel<Ktraits>;
@ -405,12 +399,15 @@ void set_ssm_params_fwd(SSMParamsBase &params,
                        const torch::Tensor out,
                        const torch::Tensor z,
                        const torch::Tensor out_z,
-                        void* D_ptr,
+                        const c10::optional<at::Tensor>& D,
-                        void* delta_bias_ptr,
+                        const c10::optional<at::Tensor>& delta_bias,
-                        void* x_ptr,
+                        const torch::Tensor ssm_states,
                        bool has_z, 
                        bool delta_softplus,
-                        void* index_ptr) {
+                        const c10::optional<at::Tensor>& query_start_loc,
                        const c10::optional<at::Tensor>& cache_indices,
                        const c10::optional<at::Tensor>& has_initial_state,
                        bool varlen) {
    // Reset the parameters
    memset(&params, 0, sizeof(params));
@ -434,18 +431,44 @@ void set_ssm_params_fwd(SSMParamsBase &params,
    params.A_ptr = A.data_ptr();
    params.B_ptr = B.data_ptr();
    params.C_ptr = C.data_ptr();
-    params.D_ptr = D_ptr;
+    params.D_ptr = D.has_value() ? D.value().data_ptr() : nullptr;
-    params.delta_bias_ptr = delta_bias_ptr;
+    params.delta_bias_ptr = delta_bias.has_value() ? delta_bias.value().data_ptr() : nullptr;
    params.out_ptr = out.data_ptr();
-    params.x_ptr = x_ptr;
+    params.ssm_states_ptr = ssm_states.data_ptr();
    params.z_ptr = has_z ? z.data_ptr() : nullptr;
    params.out_z_ptr = has_z ? out_z.data_ptr() : nullptr;
    params.query_start_loc_ptr = query_start_loc.has_value() ? query_start_loc.value().data_ptr() : nullptr;
    params.cache_indices_ptr = cache_indices.has_value() ? cache_indices.value().data_ptr() : nullptr;
    params.has_initial_state_ptr = has_initial_state.has_value() ? has_initial_state.value().data_ptr() : nullptr;
    params.index_ptr = index_ptr;
    // All stride are in elements, not bytes.
    params.A_d_stride = A.stride(0);
    params.A_dstate_stride = A.stride(1);
    if (varlen){
        params.B_batch_stride = B.stride(2);
        params.B_group_stride = B.stride(0);
        params.B_dstate_stride = B.stride(1);
        params.C_batch_stride = C.stride(2);
        params.C_group_stride = C.stride(0);
        params.C_dstate_stride = C.stride(1);
        params.u_batch_stride = u.stride(1);
        params.u_d_stride = u.stride(0);
        params.delta_batch_stride = delta.stride(1);
        params.delta_d_stride = delta.stride(0);
        if (has_z) {
            params.z_batch_stride = z.stride(1);
            params.z_d_stride = z.stride(0);
            params.out_z_batch_stride = out_z.stride(1);
            params.out_z_d_stride = out_z.stride(0);
        }
        params.out_batch_stride = out.stride(1);
        params.out_d_stride = out.stride(0);
    }
    else{
        if (!is_variable_B) {
            params.B_d_stride = B.stride(0);
        } else {
@ -473,16 +496,18 @@ void set_ssm_params_fwd(SSMParamsBase &params,
        params.out_batch_stride = out.stride(0);
        params.out_d_stride = out.stride(1);
    }
 }
-std::vector<torch::Tensor>
+void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
 selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
                  const torch::Tensor &A, const torch::Tensor &B, const torch::Tensor &C,
                  const c10::optional<torch::Tensor> &D_,
                  const c10::optional<torch::Tensor> &z_,
                  const c10::optional<torch::Tensor> &delta_bias_,
                  bool delta_softplus,
-                  const c10::optional<torch::Tensor> &index_,
+                  const c10::optional<torch::Tensor> &query_start_loc,
-                  const c10::optional<torch::Tensor> &x) {
+                  const c10::optional<torch::Tensor> &cache_indices,
                  const c10::optional<torch::Tensor> &has_initial_state,
                  const torch::Tensor &ssm_states) {
    auto input_type = u.scalar_type();
    auto weight_type = A.scalar_type();
    TORCH_CHECK(input_type == at::ScalarType::Float || input_type == at::ScalarType::Half || input_type == at::ScalarType::BFloat16);
@ -505,23 +530,37 @@ selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
    TORCH_CHECK(delta.stride(-1) == 1 || delta.size(-1) == 1);
    const auto sizes = u.sizes();
-    const int batch_size = sizes[0];
+    const bool varlen = query_start_loc.has_value();
-    const int dim = sizes[1];
+    const int batch_size = varlen ? query_start_loc.value().sizes()[0] - 1 : sizes[0];
-    const int seqlen = sizes[2];
+    const int dim = varlen ? sizes[0] : sizes[1];
    const int seqlen = varlen ? sizes[1] : sizes[2];
    const int dstate = A.size(1);
-    const int n_groups = is_variable_B ? B.size(1) : 1;
+    const int n_groups = varlen ? B.size(0) : B.size(1);
    TORCH_CHECK(dstate <= 256, "selective_scan only supports state dimension <= 256");
    if (varlen) {
        CHECK_SHAPE(u, dim, seqlen);
        CHECK_SHAPE(delta, dim, seqlen);
    } else {
        CHECK_SHAPE(u, batch_size, dim, seqlen);
        CHECK_SHAPE(delta, batch_size, dim, seqlen);
    }
    CHECK_SHAPE(A, dim, dstate);
    TORCH_CHECK(is_variable_B, "is_variable_B = False is disabled in favor of reduced binary size")
    if (varlen) {
        CHECK_SHAPE(B, n_groups, dstate, seqlen);
    } else {
        CHECK_SHAPE(B, batch_size, n_groups, dstate, seqlen); 
    }
    TORCH_CHECK(B.stride(-1) == 1 || B.size(-1) == 1);
    TORCH_CHECK(is_variable_C, "is_variable_C = False is disabled in favor of reduced binary size")
    if (varlen) {
        CHECK_SHAPE(C, n_groups, dstate, seqlen);
    } else {
        CHECK_SHAPE(C, batch_size, n_groups, dstate, seqlen); 
    }
    TORCH_CHECK(C.stride(-1) == 1 || C.size(-1) == 1);
    if (D_.has_value()) {
@ -539,13 +578,31 @@ selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
        TORCH_CHECK(delta_bias.stride(-1) == 1 || delta_bias.size(-1) == 1);
        CHECK_SHAPE(delta_bias, dim);
    }
-    if (index_.has_value()) {
+
-        auto index = index_.value();
+
-        TORCH_CHECK(index.scalar_type() == at::ScalarType::Int);
+    if (has_initial_state.has_value()) {
-        TORCH_CHECK(index.is_cuda());
+        auto has_initial_state_ = has_initial_state.value();
-        CHECK_SHAPE(index, batch_size, seqlen);
+        TORCH_CHECK(has_initial_state_.scalar_type() == at::ScalarType::Bool);
        TORCH_CHECK(has_initial_state_.is_cuda());
        CHECK_SHAPE(has_initial_state_, batch_size);
    }
    if (query_start_loc.has_value()) {
        auto query_start_loc_ = query_start_loc.value();
        TORCH_CHECK(query_start_loc_.scalar_type() == at::ScalarType::Int);
        TORCH_CHECK(query_start_loc_.is_cuda());
    }
    if (cache_indices.has_value()) {
        auto cache_indices_ = cache_indices.value();
        TORCH_CHECK(cache_indices_.scalar_type() == at::ScalarType::Int);
        TORCH_CHECK(cache_indices_.is_cuda());
        CHECK_SHAPE(cache_indices_, batch_size);
    }
    at::Tensor z, out_z;
    const bool has_z = z_.has_value();
    TORCH_CHECK(has_z, "has_z = False is disabled in favor of reduced binary size")
@ -553,31 +610,38 @@ selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
    TORCH_CHECK(z.scalar_type() == input_type);
    TORCH_CHECK(z.is_cuda());
    TORCH_CHECK(z.stride(-1) == 1 || z.size(-1) == 1);
    if (varlen){
        CHECK_SHAPE(z, dim, seqlen);
    } else {
        CHECK_SHAPE(z, batch_size, dim, seqlen);
-    out_z = torch::empty_like(z);
+    }
    out_z = z;
    const int n_chunks = (seqlen + 2048 - 1) / 2048;
    // const int n_chunks = (seqlen + 1024 - 1) / 1024;
    // at::Tensor out = torch::empty_like(u);
    // Right now u has BHL layout and delta has HBL layout, and we want out to have HBL layout
-    at::Tensor out = torch::empty_like(delta);
+    at::Tensor out = delta;
-    if (x.has_value()){
+    TORCH_CHECK(ssm_states.scalar_type() == input_type);
-        auto _x = x.value();
+    TORCH_CHECK(ssm_states.is_cuda());
-        TORCH_CHECK(_x.scalar_type() == weight_type);
+    TORCH_CHECK(ssm_states.stride(-1) == 1);
-        TORCH_CHECK(_x.is_cuda());
+    CHECK_SHAPE(ssm_states, batch_size, dim, dstate);
        TORCH_CHECK(_x.stride(-1) == 1);
        CHECK_SHAPE(_x, batch_size, dim, n_chunks, dstate * 2);
    }
    SSMParamsBase params;
    set_ssm_params_fwd(params, batch_size, dim, seqlen, dstate, n_groups, n_chunks, is_variable_B, is_variable_C,
                       u, delta, A, B, C, out, z, out_z,
-                       D_.has_value() ? D_.value().data_ptr() : nullptr,
+                       D_,
-                       delta_bias_.has_value() ? delta_bias_.value().data_ptr() : nullptr,
+                       delta_bias_,
-                       x.value().data_ptr(),
+                       ssm_states,
                       has_z,
                       delta_softplus,
-                       index_.has_value() ? index_.value().data_ptr() : nullptr);
+                       query_start_loc,
                       cache_indices,
                       has_initial_state,
                       varlen
                       );
    // Otherwise the kernel will be launched from cuda:0 device
    // Cast to char to avoid compiler warning about narrowing
@ -586,8 +650,5 @@ selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
    DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(u.scalar_type(), "selective_scan_fwd", [&] {
        selective_scan_fwd_cuda<input_t, weight_t>(params, stream);
    });
    std::vector<at::Tensor> result = {out};
    if (has_z) { result.push_back(out_z); }
    return result;
 }
--- a/csrc/ops.h
+++ b/csrc/ops.h
@ -215,25 +215,30 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
                          torch::Tensor experts_ids,
                          torch::Tensor num_tokens_post_pad);
-std::vector<torch::Tensor> selective_scan_fwd(
+void selective_scan_fwd(const torch::Tensor& u, const torch::Tensor& delta,
-    const torch::Tensor& u, const torch::Tensor& delta, const torch::Tensor& A,
+                        const torch::Tensor& A, const torch::Tensor& B,
-    const torch::Tensor& B, const torch::Tensor& C,
+                        const torch::Tensor& C,
                        const c10::optional<torch::Tensor>& D_,
                        const c10::optional<torch::Tensor>& z_,
-    const c10::optional<torch::Tensor>& delta_bias_, bool delta_softplus,
+                        const c10::optional<torch::Tensor>& delta_bias_,
-    const c10::optional<torch::Tensor>& index_,
+                        bool delta_softplus,
-    const c10::optional<torch::Tensor>& x);
+                        const c10::optional<torch::Tensor>& query_start_loc,
                        const c10::optional<torch::Tensor>& cache_indices,
                        const c10::optional<torch::Tensor>& has_initial_state,
                        const torch::Tensor& ssm_states);
 at::Tensor causal_conv1d_update(
    const at::Tensor& x, const at::Tensor& conv_state, const at::Tensor& weight,
-    const c10::optional<at::Tensor>& bias, bool silu_activation,
+    const c10::optional<at::Tensor>& bias_, bool silu_activation,
-    const c10::optional<at::Tensor>& conv_state_indices);
+    const c10::optional<at::Tensor>& cache_seqlens_,
    const c10::optional<at::Tensor>& conv_state_indices_);
 at::Tensor causal_conv1d_fwd(const at::Tensor& x, const at::Tensor& weight,
                             const c10::optional<at::Tensor>& bias_,
-                             const c10::optional<at::Tensor>& seq_idx_,
+                             const c10::optional<at::Tensor>& conv_states,
-                             const c10::optional<at::Tensor>& initial_states_,
+                             const c10::optional<at::Tensor>& query_start_loc,
-                             const c10::optional<at::Tensor>& final_states_out_,
+                             const c10::optional<at::Tensor>& cache_indices,
                             const c10::optional<at::Tensor>& has_initial_state,
                             bool silu_activation);
 #ifndef USE_ROCM
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@ -273,26 +273,31 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  ops.def(
      "selective_scan_fwd(Tensor! u, Tensor! delta,"
      "Tensor! A, Tensor! B, Tensor! C,"
-      "Tensor? D_, Tensor? z_, Tensor? delta_bias_,"
+      "Tensor? D_, Tensor!? z_, Tensor? delta_bias_,"
      "bool delta_softplus,"
-      "Tensor? index_, Tensor!? x) -> Tensor[]");
+      "Tensor? query_start_loc,"
      "Tensor? cache_indices,"
      "Tensor? has_initial_state,"
      "Tensor! ssm_states) -> ()");
  ops.impl("selective_scan_fwd", torch::kCUDA, &selective_scan_fwd);
  ops.def(
      "causal_conv1d_update(Tensor! x,"
      "Tensor! conv_state,"
      "Tensor! weight,"
-      "Tensor? bias,"
+      "Tensor? bias_,"
      "bool silu_activation,"
      "Tensor? cache_seqlens_,"
      "Tensor? conv_state_indices) -> Tensor");
  ops.impl("causal_conv1d_update", torch::kCUDA, &causal_conv1d_update);
  ops.def(
      "causal_conv1d_fwd(Tensor! x, Tensor! weight,"
      "Tensor? bias_,"
-      "Tensor? seq_idx_,"
+      "Tensor!? conv_states,"
-      "Tensor? initial_states_,"
+      "Tensor? query_start_loc,"
-      "Tensor!? final_states_out_,"
+      "Tensor? cache_indices,"
      "Tensor? has_initial_state,"
      "bool silu_activation) -> Tensor");
  ops.impl("causal_conv1d_fwd", torch::kCUDA, &causal_conv1d_fwd);
 #endif
--- a/tests/kernels/test_causal_conv1d.py
+++ b/tests/kernels/test_causal_conv1d.py
@ -3,7 +3,6 @@ from typing import Optional
 import pytest
 import torch
 import torch.nn.functional as F
 from einops import rearrange
 from tests.kernels.utils import opcheck
 from vllm import _custom_ops as ops  # noqa: F401
@ -57,43 +56,72 @@ def causal_conv1d_ref(
    return (out, None) if not return_final_states else (out, final_states_out)
-def causal_conv1d_update_ref(x: torch.Tensor,
+def causal_conv1d_update_ref(x,
-                             conv_state: torch.Tensor,
+                             conv_state,
-                             weight: torch.Tensor,
+                             weight,
-                             bias: Optional[torch.Tensor] = None,
+                             bias=None,
-                             activation: Optional[str] = None):
+                             activation=None,
                             cache_seqlens=None):
    """
-    x: (batch, dim)
+    x: (batch, dim) or (batch, dim, seqlen)
-    conv_state: (batch, dim, width)
+    conv_state: (batch, dim, state_len), where state_len >= width - 1
    weight: (dim, width)
    bias: (dim,)
    cache_seqlens: (batch,), dtype int32.
        If not None, the conv_state is treated as a circular buffer.
        The conv_state will be updated by copying x to the 
        conv_state starting at the index
        @cache_seqlens % state_len before performing the convolution.
-    out: (batch, dim)
+    out: (batch, dim) or (batch, dim, seqlen)
    """
    if activation not in [None, "silu", "swish"]:
        raise NotImplementedError("activation must be None, silu, or swish")
    dtype_in = x.dtype
-    batch, dim = x.shape
+    unsqueeze = x.dim() == 2
    if unsqueeze:
        x = x.unsqueeze(-1)
    batch, dim, seqlen = x.shape
    width = weight.shape[1]
-    assert conv_state.shape == (batch, dim, width)
+    state_len = conv_state.shape[-1]
    assert conv_state.shape == (batch, dim, state_len)
    assert weight.shape == (dim, width)
-    conv_state.copy_(torch.roll(conv_state, shifts=-1,
+    if cache_seqlens is None:
-                                dims=-1))  # Update state (B D W)
+        x_new = torch.cat([conv_state, x], dim=-1).to(
-    conv_state[:, :, -1] = x
+            weight.dtype)  # (batch, dim, state_len + seqlen)
-    out = torch.sum(conv_state * weight, dim=-1)  # (B D)
+        conv_state.copy_(x_new[:, :, -state_len:])
-    if bias is not None:
+    else:
-        out += bias
+        width_idx = torch.arange(
            -(width - 1), 0, dtype=torch.long,
            device=x.device).unsqueeze(0) + cache_seqlens.unsqueeze(1)
        width_idx = torch.remainder(width_idx, state_len).unsqueeze(1).expand(
            -1, dim, -1)
        x_new = torch.cat([conv_state.gather(2, width_idx), x],
                          dim=-1).to(weight.dtype)
        copy_idx = torch.arange(
            seqlen, dtype=torch.long,
            device=x.device).unsqueeze(0) + cache_seqlens.unsqueeze(1)
        copy_idx = torch.remainder(copy_idx,
                                   state_len).unsqueeze(1).expand(-1, dim, -1)
        conv_state.scatter_(2, copy_idx, x)
    out = F.conv1d(x_new, weight.unsqueeze(1), bias, padding=0,
                   groups=dim)[:, :, -seqlen:]
    if unsqueeze:
        out = out.squeeze(-1)
    return (out if activation is None else F.silu(out)).to(dtype=dtype_in)
@pytest.mark.parametrize("itype", [torch.bfloat16, torch.float])
@pytest.mark.parametrize("silu_activation", [True])
@pytest.mark.parametrize("has_bias", [True])
 def causal_conv1d_opcheck_fn(
    x: torch.Tensor,
    weight: torch.Tensor,
    bias: Optional[torch.Tensor] = None,
-    seq_idx: Optional[torch.Tensor] = None,
+    cu_seq_len: Optional[torch.Tensor] = None,
-    initial_states: Optional[torch.Tensor] = None,
+    cache_indices: Optional[torch.Tensor] = None,
-    return_final_states: bool = False,
+    has_initial_state: Optional[torch.Tensor] = None,
-    final_states_out=None,
+    conv_states: Optional[torch.Tensor] = None,
    activation: Optional[str] = "silu",
 ):
    """
@ -109,135 +137,93 @@ def causal_conv1d_opcheck_fn(
    """
    if activation not in [None, "silu", "swish"]:
        raise NotImplementedError("activation must be None, silu, or swish")
-    if x.stride(2) != 1 and x.stride(1) != 1:
+    if x.stride(-1) != 1:
        x = x.contiguous()
    bias = bias.contiguous() if bias is not None else None
    if seq_idx is not None:
        assert (initial_states is
                None), "initial_states must be None if seq_idx is not None"
        assert (not return_final_states
                ), "If seq_idx is not None, we don't return final_states_out"
    seq_idx = seq_idx.contiguous() if seq_idx is not None else None
    if initial_states is not None and (initial_states.stride(2) != 1
                                       and initial_states.stride(1) != 1):
        initial_states = initial_states.contiguous()
    if return_final_states:
        assert (
            x.stride(1) == 1
        ), "Only channel-last layout support returning final_states_out"
        if final_states_out is not None:
            assert (final_states_out.stride(2) == 1
                    or final_states_out.stride(1) == 1)
        else:
            batch, dim, seqlen = x.shape
            width = weight.shape[1]
            final_states_out = torch.empty(batch,
                                           width - 1,
                                           dim,
                                           device=x.device,
                                           dtype=x.dtype).transpose(1, 2)
    else:
        final_states_out = None
-    opcheck(torch.ops._C.causal_conv1d_fwd,
+    opcheck(torch.ops._C.causal_conv1d_fwd, (
-            (x, weight, bias, seq_idx, initial_states, final_states_out,
+        x,
-             activation in ["silu", "swish"]))
+        weight,
        bias,
        conv_states,
        cu_seq_len,
        cache_indices,
        has_initial_state,
        activation in ["silu", "swish"],
    ))
-@pytest.mark.parametrize("return_final_states", [False, True])
+@pytest.mark.parametrize("itype", [torch.bfloat16, torch.float])
-@pytest.mark.parametrize("has_initial_states", [False, True])
+@pytest.mark.parametrize("silu_activation", [True])
-@pytest.mark.parametrize("channel_last", [False, True])
+@pytest.mark.parametrize("has_bias", [True])
@pytest.mark.parametrize("itype", [torch.bfloat16])
@pytest.mark.parametrize("silu_activation", [False, True])
@pytest.mark.parametrize("has_bias", [False, True])
@pytest.mark.parametrize("width", [4])
-@pytest.mark.parametrize("seqlen", [128, 512, 4096])
+@pytest.mark.parametrize(
-@pytest.mark.parametrize('dim', [64, 4096 + 32])
+    'seqlen', [1, 8, 16, 32, 64, 128, 256, 512, 784, 1024, 2048, 4096])
-@pytest.mark.parametrize('batch', [1, 2])
+@pytest.mark.parametrize('dim', [64])
@pytest.mark.parametrize('batch', [1])
 def test_causal_conv1d(batch, dim, seqlen, width, has_bias, silu_activation,
-                       itype, channel_last, has_initial_states,
+                       itype):
                       return_final_states):
    if not channel_last and (has_initial_states or return_final_states):
        pytest.skip(
            "Only channel_last support initial_states or return_final_states")
    device = "cuda"
    rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (3e-3, 5e-3)
    if itype == torch.bfloat16:
        rtol, atol = 1e-2, 5e-2
    # set seed
    seed_everything(0)
-    if not channel_last:
+    x = torch.randn(batch, dim, seqlen, device=device,
-        x = torch.randn(batch,
+                    dtype=itype).contiguous()
-                        4096 + dim + 64,
+
                        seqlen,
                        device=device,
                        dtype=itype)[:, 4096:4096 + dim, :]
    else:
        x = rearrange(
            torch.randn(batch,
                        seqlen,
                        4096 + dim + 64,
                        device=device,
                        dtype=itype)[:, :, 4096:4096 + dim], "b s d -> b d s")
    weight = torch.randn(dim, width, device=device, dtype=itype)
    bias = torch.randn(dim, device=device, dtype=itype) if has_bias else None
    if has_initial_states:
    initial_states = torch.randn(batch,
                                     width - 1,
                                 dim,
                                 width - 1,
                                 device=device,
-                                     dtype=itype).transpose(1, 2)
+                                 dtype=itype)
-    else:
+    x_ref = x.clone()
-        initial_states = None
+    weight_ref = weight.clone()
-    x_ref = x.detach().clone()
+    bias_ref = bias.clone() if bias is not None else None
-    weight_ref = weight.detach().clone()
+    initial_states_ref = initial_states.clone(
    bias_ref = bias.detach().clone() if bias is not None else None
    initial_states_ref = initial_states.detach().clone(
    ) if initial_states is not None else None
    activation = None if not silu_activation else "silu"
-    out, final_states = causal_conv1d_fn(
+    out = causal_conv1d_fn(x,
        x,
                           weight,
                           bias,
-        initial_states=initial_states,
+                           activation=activation,
-        return_final_states=return_final_states,
+                           conv_states=initial_states,
-        activation=activation)
+                           has_initial_state=torch.ones(batch,
                                                        dtype=torch.bool,
                                                        device=x.device))
    out_ref, final_states_ref = causal_conv1d_ref(
        x_ref,
        weight_ref,
        bias_ref,
        initial_states=initial_states_ref,
-        return_final_states=return_final_states,
+        return_final_states=True,
        activation=activation)
-
+    assert initial_states is not None and final_states_ref is not None
-    causal_conv1d_opcheck_fn(x_ref,
+    assert torch.allclose(initial_states,
                             weight_ref,
                             bias_ref,
                             initial_states=initial_states_ref,
                             return_final_states=return_final_states,
                             activation=activation)
    if return_final_states:
        assert final_states is not None and final_states_ref is not None
        assert torch.allclose(final_states,
                          final_states_ref,
                          rtol=rtol,
                          atol=atol)
    assert torch.allclose(out, out_ref, rtol=rtol, atol=atol)
-    if return_final_states:
+    causal_conv1d_opcheck_fn(x,
-        out += F.sigmoid(final_states).sum(dim=-1, keepdim=True)
+                             weight,
-        out_ref += F.sigmoid(final_states_ref).sum(dim=-1, keepdim=True)
+                             bias,
                             activation=activation,
                             conv_states=initial_states,
                             has_initial_state=torch.ones(batch,
                                                          dtype=torch.bool,
                                                          device=x.device))
@pytest.mark.parametrize("itype", [torch.bfloat16])
@pytest.mark.parametrize("silu_activation", [False, True])
@pytest.mark.parametrize("has_bias", [False, True])
-@pytest.mark.parametrize("width", [2, 3, 4])
+@pytest.mark.parametrize("seqlen", [1])
@pytest.mark.parametrize("width", [4])
@pytest.mark.parametrize("dim", [2048, 2048 + 16, 4096])
-@pytest.mark.parametrize("batch", [1, 2])
+def test_causal_conv1d_update(dim, width, seqlen, has_bias, silu_activation,
 def test_causal_conv1d_update(batch, dim, width, has_bias, silu_activation,
                              itype):
    device = "cuda"
    rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (3e-3, 5e-3)
@ -246,8 +232,9 @@ def test_causal_conv1d_update(batch, dim, width, has_bias, silu_activation,
    # set seed
    seed_everything(0)
    batch = 2
-    x = torch.randn(batch, dim, device=device, dtype=itype)
+    x = torch.randn(batch, dim, seqlen, device=device, dtype=itype)
-    conv_state = torch.randn(batch, dim, width, device=device, dtype=itype)
+    conv_state = torch.randn(batch, dim, width - 1, device=device, dtype=itype)
    weight = torch.randn(dim,
                         width,
                         device=device,
@ -273,9 +260,15 @@ def test_causal_conv1d_update(batch, dim, width, has_bias, silu_activation,
    assert torch.equal(conv_state, conv_state_ref)
    assert torch.allclose(out, out_ref, rtol=rtol, atol=atol)
-    opcheck(
+    opcheck(torch.ops._C.causal_conv1d_update, (
-        torch.ops._C.causal_conv1d_update,
+        x,
-        (x, conv_state, weight, bias, activation in ["silu", "swish"], None))
+        conv_state,
        weight,
        bias,
        activation in ["silu", "swish"],
        None,
        None,
    ))
@pytest.mark.parametrize("itype",
@ -292,16 +285,16 @@ def test_causal_conv1d_update_with_batch_gather(dim, width, seqlen, has_bias,
    if itype == torch.bfloat16:
        rtol, atol = 1e-2, 5e-2
-    # set seed
+    # set )seed
-    torch.random.manual_seed(0)
+    seed_everything(0)
    batch = 64
-    x = torch.randn(batch, dim, device=device, dtype=itype)
+    x = torch.randn(batch, dim, 1, device=device, dtype=itype)
    total_entries = 10 * batch
    conv_state = torch.randn(total_entries,
                             dim,
-                             width,
+                             width - 1,
                             device=device,
                             dtype=itype)
    conv_state_indices = torch.randperm(total_entries)[:batch].to(
@ -332,3 +325,100 @@ def test_causal_conv1d_update_with_batch_gather(dim, width, seqlen, has_bias,
    assert torch.equal(conv_state[conv_state_indices, :], conv_state_ref)
    assert torch.allclose(out, out_ref, rtol=rtol, atol=atol)
    opcheck(torch.ops._C.causal_conv1d_update, (
        x,
        conv_state,
        weight,
        bias,
        activation in ["silu", "swish"],
        None,
        conv_state_indices,
    ))
@pytest.mark.parametrize("itype", [torch.bfloat16])
@pytest.mark.parametrize("silu_activation", [True])
@pytest.mark.parametrize("has_bias", [True])
@pytest.mark.parametrize("width", [4])
@pytest.mark.parametrize('seqlen',
                         [8, 16, 32, 64, 128, 256, 512, 784, 1024, 2048, 4096])
@pytest.mark.parametrize('dim', [64, 4096])
 def test_causal_conv1d_varlen(dim, seqlen, width, has_bias, silu_activation,
                              itype):
    device = "cuda"
    rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (3e-3, 5e-3)
    if itype == torch.bfloat16:
        rtol, atol = 1e-2, 5e-2
    # set seed
    seed_everything(0)
    batch = 1
    seqlens = []
    nsplits = 3
    eos_pos = torch.randperm(seqlen - 1)[:nsplits].sort().values
    seqlens.append(
        torch.diff(
            torch.cat(
                [torch.tensor([-1]), eos_pos,
                 torch.tensor([seqlen - 1])])).tolist())
    assert sum(seqlens[-1]) == seqlen
    assert all(s > 0 for s in seqlens[-1])
    cumsum = torch.cumsum(torch.tensor(seqlens[0]), dim=0).to(torch.int32)
    cumsum = torch.concat([torch.tensor([0], dtype=torch.int32), cumsum],
                          dim=0)
    x = torch.randn(batch, 4096 + dim + 64, seqlen, device=device,
                    dtype=itype)[:, 4096:4096 + dim, :]
    weight = torch.randn(dim, width, device=device, dtype=itype)
    bias = torch.randn(dim, device=device, dtype=itype) if has_bias else None
    x_ref = x.clone()
    weight_ref = weight.clone()
    bias_ref = bias.clone() if bias is not None else None
    activation = None if not silu_activation else "silu"
    final_states = torch.randn(nsplits + 1,
                               dim,
                               width - 1,
                               device=x.device,
                               dtype=x.dtype)
    final_states_ref = final_states.clone()
    has_initial_states = torch.randint(0,
                                       2, (cumsum.shape[0] - 1, ),
                                       dtype=torch.bool,
                                       device=x.device)
    cache_indices = torch.randperm(cumsum.shape[0] - 1,
                                   dtype=torch.int32,
                                   device=x.device)
    out = causal_conv1d_fn(x.squeeze(0), weight, bias, cumsum.cuda(),
                           cache_indices, has_initial_states, final_states,
                           activation)
    out_ref = []
    out_ref_b = []
    splits = [torch.split(var, seqlens[0], dim=-1) for var in (x_ref)]
    for i in range(len(seqlens[0])):
        x_s = [v[i].unsqueeze(0) for v in splits][0]
        out_ref_b.append(
            causal_conv1d_ref(
                x_s,
                weight_ref,
                bias_ref,
                activation=activation,
                return_final_states=True,
                final_states_out=final_states_ref[cache_indices[i]].unsqueeze(
                    0),
                initial_states=final_states_ref[cache_indices[i]].unsqueeze(0)
                if has_initial_states[i] else None))
    out_ref.append(torch.cat([t[0] for t in out_ref_b], dim=2))
    out_ref = torch.cat(out_ref, dim=0)
    print(f"Output max diff: {(out - out_ref).abs().max().item()}")
    print(f"Output mean diff: {(out - out_ref).abs().mean().item()}")
    print("Output state max diff"
          f":{(final_states - final_states_ref).abs().max()}")
    print("Output state mean diff"
          f":{(final_states - final_states_ref).abs().mean()}")
    assert torch.allclose(out, out_ref, rtol=rtol, atol=atol)
    assert torch.allclose(final_states, final_states_ref, rtol=rtol, atol=atol)
    causal_conv1d_opcheck_fn(x.squeeze(0), weight, bias, cumsum.cuda(),
                             cache_indices, has_initial_states, final_states,
                             activation)
--- a/tests/kernels/test_mamba_ssm.py
+++ b/tests/kernels/test_mamba_ssm.py
@ -98,8 +98,8 @@ def selective_scan_ref(u,
                       delta_bias=None,
                       delta_softplus=False,
                       return_last_state=False,
-                       position_indices=None,
+                       prev_state=None,
-                       prev_state=None):
+                       final_state_out=None):
    """
    u: r(B D L)
    delta: r(B D L)
@ -139,11 +139,7 @@ def selective_scan_ref(u,
            deltaB_u = torch.einsum('bdl,bdnl,bdl->bdln', delta, B, u)
    if is_variable_C and C.dim() == 4:
        C = repeat(C, "B G N L -> B (G H) N L", H=dim // C.shape[1])
    last_state = None
    for i in range(u.shape[2]):
        if position_indices is not None and position_indices[0, i] == 0:
            x = deltaB_u[:, :, i]
        else:
        x = deltaA[:, :, i] * x + deltaB_u[:, :, i]
        if not is_variable_C:
            y = torch.einsum('bdn,dn->bd', x, C)
@ -153,14 +149,17 @@ def selective_scan_ref(u,
            else:
                y = torch.einsum('bdn,bdn->bd', x, C[:, :, :, i])
        if i == u.shape[2] - 1:
-            last_state = x
+            if final_state_out is None:
                final_state_out = x
            else:
                final_state_out.copy_(x)
        ys.append(y)
    y = torch.stack(ys, dim=2)  # (batch dim L)
    out = y if D is None else y + u * rearrange(D, "d -> d 1")
    if z is not None:
        out = out * F.silu(z)
    out = out.to(dtype=dtype_in)
-    return out if not return_last_state else (out, last_state)
+    return out if not return_last_state else (out, final_state_out)
 def selective_scan_opcheck_fn(u,
@ -172,9 +171,10 @@ def selective_scan_opcheck_fn(u,
                              z=None,
                              delta_bias=None,
                              delta_softplus=False,
-                              return_last_state=False,
+                              cu_seq_len=None,
-                              position_indices=None,
+                              cache_indices=None,
-                              prev_state=None):
+                              has_initial_state=None,
                              ssm_states=None):
    """if return_last_state is True, returns (out, last_state)
    last_state has shape (batch, dim, dstate).
    """
@ -190,36 +190,27 @@ def selective_scan_opcheck_fn(u,
        C = C.contiguous()
    if z is not None and z.stride(-1) != 1:
        z = z.contiguous()
-    if B.dim() == 3:
+    if B.dim() == 3 and cu_seq_len is None:
        B = B.unsqueeze(1)
-    if C.dim() == 3:
+    if B.dim() == 2 and cu_seq_len is not None:
        B = B.unsqueeze(0)
    if C.dim() == 3 and cu_seq_len is None:
        C = C.unsqueeze(1)
-    n_chunks = int((u.shape[-1] + 2048 - 1) / 2048)
+    if C.dim() == 2 and cu_seq_len is not None:
-    x = torch.zeros((
+        C = C.unsqueeze(0)
        u.shape[0],
        u.shape[1],
        n_chunks,
        int(A.shape[1] * 2),
    ),
                    device=u.device,
                    dtype=torch.float32,
                    requires_grad=False)
    x[:, :, 0, 0::2] = 1
    if prev_state is not None:
        x[:, :, 0, 1::2].copy_(prev_state)
    # Disable test_autograd_registration for now as it seems to trigger
    # a bogus error.
    opcheck(torch.ops._C.selective_scan_fwd,
-            (u, delta, A, B, C, D, z, delta_bias, delta_softplus,
+            (u, delta, A, B, C, D, z, delta_bias, delta_softplus, cu_seq_len,
-             position_indices, x),
+             cache_indices, has_initial_state, ssm_states),
            test_utils=["test_schema", "test_faketensor"])
@pytest.mark.parametrize('wtype', [torch.float32])
-@pytest.mark.parametrize('itype', [torch.float32])
+@pytest.mark.parametrize('itype',
                         [torch.float32, torch.float16, torch.bfloat16])
@pytest.mark.parametrize('seqlen', [128, 256, 512, 1024, 2048, 4096])
@pytest.mark.parametrize("return_last_state", [True])
@pytest.mark.parametrize('has_delta_bias', [True])
@pytest.mark.parametrize('delta_softplus', [True])
@pytest.mark.parametrize('has_z', [True])
@ -229,8 +220,8 @@ def selective_scan_opcheck_fn(u,
@pytest.mark.parametrize("is_variable_B", [True])
@pytest.mark.parametrize("scan_chunks", [1, 2, 3])
 def test_selective_scan(is_variable_B, is_variable_C, varBC_groups, has_D,
-                        has_z, has_delta_bias, delta_softplus,
+                        has_z, has_delta_bias, delta_softplus, seqlen, itype,
-                        return_last_state, seqlen, itype, wtype, scan_chunks):
+                        wtype, scan_chunks):
    if varBC_groups > 1 and (not is_variable_B or not is_variable_C):
        pytest.skip()  # This config is not applicable
    device = 'cuda'
@ -243,10 +234,11 @@ def test_selective_scan(is_variable_B, is_variable_C, varBC_groups, has_D,
        atolw = max(atolw, atol)
    # set seed
    seed_everything(0)
-    batch_size = 2
+    batch_size = 1
    dim = 4
    dstate = 8
    A = (-0.5 * torch.rand(dim, dstate, device=device, dtype=wtype))
    A_ref = A.clone()
    if not is_variable_B:
        B_shape = [dim, dstate]
    elif varBC_groups == 1:
@ -256,6 +248,7 @@ def test_selective_scan(is_variable_B, is_variable_C, varBC_groups, has_D,
    B = torch.randn(B_shape,
                    device=device,
                    dtype=wtype if not is_variable_B else itype)
    B_ref = B.clone()
    if not is_variable_C:
        C_shape = [dim, dstate]
    elif varBC_groups == 1:
@ -265,16 +258,25 @@ def test_selective_scan(is_variable_B, is_variable_C, varBC_groups, has_D,
    C = torch.randn(C_shape,
                    device=device,
                    dtype=wtype if not is_variable_C else itype)
    C_ref = C.clone()
    D = torch.randn(dim, device=device, dtype=torch.float32) if has_D else None
    D_ref = D.clone()
    z = torch.randn(batch_size, dim, seqlen, device=device,
                    dtype=itype) if has_z else None
    z_ref = z.clone() if has_z else None
    delta_bias = (0.5 * torch.rand(dim, device=device, dtype=torch.float32)
                  ) if has_delta_bias else None
    u = torch.randn(batch_size, dim, seqlen, device=device, dtype=itype)
    u_ref = u.clone()
    delta = (0.5 *
             torch.rand(batch_size, dim, seqlen, device=device, dtype=itype))
-    state = None
+    delta_ref = delta.clone()
-    state_ref = None
+    state_shape = (batch_size, u.shape[1], int(A.shape[1]))
    state = torch.randn(state_shape,
                        device=u.device,
                        dtype=itype,
                        requires_grad=False)
    state_ref = state.clone()
    out = None
    out_ref = None
    outs = []
@ -294,7 +296,9 @@ def test_selective_scan(is_variable_B, is_variable_C, varBC_groups, has_D,
        if has_z:
            assert z is not None
            _z = z[..., chunk_start:chunk_end]
-        out, *rest = selective_scan_fn(u[..., chunk_start:chunk_end],
+        out = selective_scan_fn(
            u[..., chunk_start:chunk_end],
            state,
            delta[..., chunk_start:chunk_end],
            A,
            _B,
@ -303,31 +307,29 @@ def test_selective_scan(is_variable_B, is_variable_C, varBC_groups, has_D,
            z=_z,
            delta_bias=delta_bias,
            delta_softplus=delta_softplus,
-                                       return_last_state=return_last_state,
+            has_initial_state=torch.ones(batch_size,
-                                       prev_state=state if c > 0 else None)
+                                         device=u.device,
                                         dtype=torch.bool) if c > 0 else None)
        outs.append(out)
        if return_last_state:
            state = rest[0]
    if len(outs) > 1:
        out = torch.cat(outs, dim=-1)
-    out_ref, *rest = selective_scan_ref(u,
+
-                                        delta,
+    out_ref, state_ref, *rest = selective_scan_ref(
-                                        A,
+        u_ref,
-                                        B,
+        delta_ref,
-                                        C,
+        A_ref,
-                                        D,
+        B_ref,
-                                        z=z,
+        C_ref,
        D_ref,
        z=z_ref,
        delta_bias=delta_bias,
        delta_softplus=delta_softplus,
-                                        return_last_state=return_last_state)
+        return_last_state=True)
    if return_last_state:
        state_ref = rest[0]
    assert out is not None and out_ref is not None
    assert torch.allclose(out, out_ref, rtol=rtol, atol=atol)
    if return_last_state:
    assert state is not None and state_ref is not None
-        assert torch.allclose(state, state_ref, rtol=rtol, atol=atol)
+    assert torch.allclose(state, state_ref.to(itype), rtol=rtol, atol=atol)
    selective_scan_opcheck_fn(u,
                              delta,
@ -335,10 +337,10 @@ def test_selective_scan(is_variable_B, is_variable_C, varBC_groups, has_D,
                              B,
                              C,
                              D,
-                              z=z,
+                              z,
                              delta_bias=delta_bias,
                              delta_softplus=delta_softplus,
-                              return_last_state=return_last_state)
+                              ssm_states=state)
@pytest.mark.parametrize("itype",
@ -391,9 +393,131 @@ def test_selective_state_update(dim, dstate, has_z, itype):
    assert torch.allclose(out, out_ref, rtol=rtol, atol=atol)
@pytest.mark.parametrize('wtype', [torch.float32])
@pytest.mark.parametrize('itype', [torch.float32])
@pytest.mark.parametrize('seqlen', [1, 128, 129, 256, 512, 1024, 2048, 4096])
@pytest.mark.parametrize("return_last_state", [True])
@pytest.mark.parametrize('has_delta_bias', [True])
@pytest.mark.parametrize('delta_softplus', [True])
@pytest.mark.parametrize('has_z', [True])
@pytest.mark.parametrize('has_D', [True])
@pytest.mark.parametrize("varBC_groups", [1, 2])
@pytest.mark.parametrize("is_variable_C", [True])
@pytest.mark.parametrize("is_variable_B", [True])
 def test_selective_scan_varlen(is_variable_B, is_variable_C, varBC_groups,
                               has_D, has_z, has_delta_bias, delta_softplus,
                               return_last_state, seqlen, itype, wtype):
    if varBC_groups > 1 and (not is_variable_B or not is_variable_C):
        pytest.skip()  # This config is not applicable
    device = 'cuda'
    rtol, atol = (6e-4, 2e-3) if itype == torch.float32 else (3e-3, 5e-3)
    if itype == torch.bfloat16:
        rtol, atol = 3e-2, 5e-2
    rtolw, atolw = (1e-3, 1e-3)
    if has_z:  # If we have z, the errors on the weights seem higher
        rtolw = max(rtolw, rtol)
        atolw = max(atolw, atol)
    # set seed
    torch.random.manual_seed(0)
    seqlens = []
    nsplits = 3
    if seqlen < 10:
        nsplits = 0
    eos_pos = torch.randperm(seqlen - 1)[:nsplits].sort().values
    seqlens.append(
        torch.diff(
            torch.cat(
                [torch.tensor([-1]), eos_pos,
                 torch.tensor([seqlen - 1])])).tolist())
    assert sum(seqlens[-1]) == seqlen
    assert all(s > 0 for s in seqlens[-1])
    cumsum = torch.cumsum(torch.tensor(seqlens[0]), dim=0).to(torch.int32)
    cumsum = torch.concat([torch.tensor([0], dtype=torch.int32), cumsum],
                          dim=0).cuda()
    dim = 4
    dstate = 8
    A = (-0.5 * torch.rand(dim, dstate, device=device, dtype=wtype))
    A_ref = A.clone()
    B_shape = [varBC_groups, dstate, seqlen]
    B = torch.randn(B_shape,
                    device=device,
                    dtype=wtype if not is_variable_B else itype)
    B_ref = B.clone()
    C_shape = [varBC_groups, dstate, seqlen]
    C = torch.randn(C_shape,
                    device=device,
                    dtype=wtype if not is_variable_C else itype)
    C_ref = C.clone()
    D = torch.randn(dim, device=device, dtype=torch.float32) if has_D else None
    D_ref = D.clone()
    z = torch.randn(dim, seqlen, device=device, dtype=itype)
    z_ref = z.clone()
    delta_bias = (0.5 * torch.rand(dim, device=device, dtype=torch.float32)
                  ) if has_delta_bias else None
    u = torch.randn(dim, seqlen, device=device, dtype=itype)
    u_ref = u.clone()
    delta = (0.5 * torch.rand(dim, seqlen, device=device, dtype=itype))
    delta_ref = delta.clone()
    out = None
    out_ref = None
    prev_state_shape = (cumsum.shape[0] - 1, u.shape[0], int(A.shape[1]))
    prev_state = torch.randn(prev_state_shape,
                             device=u.device,
                             dtype=itype,
                             requires_grad=False)
    prev_state_ref = prev_state.clone()
    cache_indices = torch.randperm(cumsum.shape[0] - 1,
                                   dtype=torch.int32,
                                   device=u.device)
    has_initial_state = torch.randint(0,
                                      2, (cumsum.shape[0] - 1, ),
                                      dtype=torch.bool,
                                      device=u.device)
    out = selective_scan_fn(u, prev_state, delta, A, B, C, D, z, delta_bias,
                            delta_softplus, cumsum, cache_indices,
                            has_initial_state)
    outs_ref = []
    splits = [
        torch.split(var, seqlens[0], dim=-1)
        for var in (u_ref, delta_ref, B_ref, C_ref, z_ref)
    ]
    for i in range(len(seqlens[0])):
        u_s, delta_s, B_s, C_s, z_s = [v[i].unsqueeze(0) for v in splits]
        out_ref_s, _ = selective_scan_ref(
            u_s,
            delta_s,
            A_ref,
            B_s,
            C_s,
            D_ref,
            z=z_s,
            delta_bias=delta_bias,
            delta_softplus=delta_softplus,
            return_last_state=return_last_state,
            prev_state=prev_state_ref[cache_indices[i]].unsqueeze(0)
            if has_initial_state[i] else None,
            final_state_out=prev_state_ref[cache_indices[i]].unsqueeze(0))
        outs_ref.append(out_ref_s)
    out_ref = torch.cat(outs_ref, dim=-1) if len(outs_ref) > 1 else outs_ref[0]
    print("Output diff max", (out - out_ref[0]).max())
    print("Output diff mean", (out - out_ref[0]).mean())
    print("Output state diff max", (prev_state - prev_state_ref).max())
    print("Output state diff mean", (prev_state - prev_state_ref).mean())
    assert torch.allclose(prev_state, prev_state_ref, rtol=rtol, atol=atol)
    assert torch.allclose(out, out_ref[0], rtol=rtol, atol=atol)
    selective_scan_opcheck_fn(u, delta, A, B, C, D, z, delta_bias,
                              delta_softplus, cumsum, cache_indices,
                              has_initial_state, prev_state)
@pytest.mark.parametrize("itype",
                         [torch.float32, torch.float16, torch.bfloat16])
-@pytest.mark.parametrize("has_z", [False, True])
+@pytest.mark.parametrize("has_z", [True])
@pytest.mark.parametrize("dstate", [16, 32, 64])
@pytest.mark.parametrize("dim", [2048, 2048 + 16, 4096])
 def test_selective_state_update_with_batch_indices(dim, dstate, has_z, itype):
@ -405,7 +529,7 @@ def test_selective_state_update_with_batch_indices(dim, dstate, has_z, itype):
            atol *= 2
    # set seed
    torch.random.manual_seed(0)
-    batch_size = 16
+    batch_size = 3
    total_entries = 10 * batch_size
    state = torch.randn(total_entries, dim, dstate, dtype=itype, device=device)
@ -443,6 +567,11 @@ def test_selective_state_update_with_batch_indices(dim, dstate, has_z, itype):
                                         dt_bias=dt_bias,
                                         dt_softplus=True)
    print("Output diff max", (out - out_ref[0]).max())
    print("Output diff mean", (out - out_ref[0]).mean())
    print("Output state diff max", (state[state_indices, :] - state_ref).max())
    print("Output state diff mean",
          (state[state_indices, :] - state_ref).mean())
    assert torch.allclose(state[state_indices, :],
                          state_ref,
                          rtol=rtol,
@ -465,7 +594,7 @@ def test_selective_state_update_with_heads_with_batch_indices(
        rtol, atol = 1e-1, 1e-1
    # set seed
    torch.random.manual_seed(0)
-    batch_size = 16
+    batch_size = 3
    headdim = 64
    nheads = dim // headdim
--- a/tests/models/decoder_only/language/test_jamba.py
+++ b/tests/models/decoder_only/language/test_jamba.py
@ -1,18 +1,16 @@
 import pytest
 from vllm.sampling_params import SamplingParams
 from vllm.worker.model_runner import _get_graph_batch_size
 from ...utils import check_outputs_equal
-MODELS = ["ai21labs/Jamba-tiny-random"]
+MODELS = ["ai21labs/Jamba-tiny-dev"]
 # Fails due to usage of MoE as MLP(E=1_, which is different than the HF impl
 # TODO: Fix this with trained model
@pytest.mark.skip()
@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("dtype", ["float"])
-@pytest.mark.parametrize("max_tokens", [10])
+@pytest.mark.parametrize("max_tokens", [96])
 def test_models(
    hf_runner,
    vllm_runner,
@ -22,7 +20,14 @@ def test_models(
    max_tokens: int,
 ) -> None:
-    with hf_runner(model, dtype=dtype) as hf_model:
+    with hf_runner(
            model,
            dtype=dtype,
            model_kwargs={
                "use_mamba_kernels":
                False,  # mamba kernels are not installed so HF 
                # don't use them
            }) as hf_model:
        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
    with vllm_runner(model, dtype=dtype) as vllm_model:
@ -38,8 +43,8 @@ def test_models(
@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("dtype", ["float"])
-@pytest.mark.parametrize("max_tokens", [5])
+@pytest.mark.parametrize("max_tokens", [96])
 def test_batching(
    vllm_runner,
    example_prompts,
@ -65,6 +70,107 @@ def test_batching(
    )
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float16"])
@pytest.mark.parametrize("max_tokens", [10])
 def test_mamba_prefill_chunking_with_parallel_sampling(
        hf_runner, vllm_runner, example_prompts, model: str, dtype: str,
        max_tokens: int) -> None:
    # Tests prefill chunking in conjunction with n>1, in this case,
    # prefill is populated with decoding tokens and we test that it
    # doesn't fail This test might fail if cache is not allocated
    # correctly for n > 1 decoding steps inside a
    # chunked prefill forward pass (where we have both prefills
    # and decoding together )
    sampling_params = SamplingParams(n=3,
                                     temperature=1,
                                     seed=0,
                                     max_tokens=max_tokens)
    with vllm_runner(
            model,
            dtype=dtype,
            enable_chunked_prefill=True,
            max_num_batched_tokens=30,
            max_num_seqs=10  # forces prefill chunks with decoding
    ) as vllm_model:
        vllm_model.generate(example_prompts, sampling_params)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [10])
 def test_mamba_prefill_chunking(hf_runner, vllm_runner, example_prompts,
                                model: str, dtype: str,
                                max_tokens: int) -> None:
    # numeric error during prefill chucking produces different generation
    # compared to w/o prefill chunking for those examples, removed them for now
    example_prompts.pop(7)
    example_prompts.pop(2)
    example_prompts.pop(1)
    with hf_runner(
            model,
            dtype=dtype,
            model_kwargs={
                "use_mamba_kernels":
                False,  # mamba kernels are not installed so HF 
                # don't use them
            }) as hf_model:
        non_chunked = hf_model.generate_greedy(example_prompts, max_tokens)
    with vllm_runner(model,
                     dtype=dtype,
                     enable_chunked_prefill=True,
                     max_num_batched_tokens=5,
                     max_num_seqs=2) as vllm_model:
        chunked = vllm_model.generate_greedy(example_prompts,
                                             max_tokens=max_tokens)
    check_outputs_equal(
        outputs_0_lst=chunked,
        outputs_1_lst=non_chunked,
        name_0="chunked",
        name_1="non_chunked",
    )
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [15])
 def test_parallel_sampling(
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
 ) -> None:
    with vllm_runner(model, dtype=dtype) as vllm_model:
        for_loop_outputs = []
        for _ in range(10):
            for_loop_outputs.append(
                # using example_prompts index 1 instead of 0 since with 0 the
                # logprobs get really close and the test doesn't pass
                vllm_model.generate_greedy([example_prompts[1]], max_tokens)
                [0])
        sampling_params = SamplingParams(n=10,
                                         temperature=0.001,
                                         seed=0,
                                         max_tokens=max_tokens)
        n_lt_1_outputs = vllm_model.generate([example_prompts[1]],
                                             sampling_params)
    token_ids, texts = n_lt_1_outputs[0]
    n_lt_1_outputs = [(token_id, text)
                      for token_id, text in zip(token_ids, texts)]
    check_outputs_equal(
        outputs_0_lst=n_lt_1_outputs,
        outputs_1_lst=for_loop_outputs,
        name_0="vllm_n_lt_1_outputs",
        name_1="vllm",
    )
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [20])
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@ -440,9 +440,10 @@ if hasattr(torch.ops._C, "gptq_marlin_24_gemm"):
    @torch.library.register_fake("_C::causal_conv1d_fwd")
    def causal_conv1d_fwd_fake(x: torch.Tensor, weight: torch.Tensor,
                               bias_: Optional[torch.Tensor],
-                               seq_idx_: Optional[torch.Tensor],
+                               conv_states: Optional[torch.Tensor],
-                               initial_states_: Optional[torch.Tensor],
+                               cu_seq_len: Optional[torch.Tensor],
-                               final_states_out_: Optional[torch.Tensor],
+                               cache_indices: Optional[torch.Tensor],
                               has_initial_state: Optional[torch.Tensor],
                               silu_activation: bool) -> torch.Tensor:
        return torch.empty_like(x)
@ -450,22 +451,22 @@ if hasattr(torch.ops._C, "gptq_marlin_24_gemm"):
    def causal_conv1d_update_fake(
            x: torch.Tensor, conv_state: torch.Tensor, weight: torch.Tensor,
            bias_: Optional[torch.Tensor], silu_activation: bool,
            cache_seqlens: Optional[torch.Tensor],
            conv_state_indices: Optional[torch.Tensor]) -> torch.Tensor:
        return torch.empty_like(x)
    @torch.library.register_fake("_C::selective_scan_fwd")
-    def selective_scan_fwd_fake(
+    def selective_scan_fwd_fake(u: torch.Tensor, delta: torch.Tensor,
-            u: torch.Tensor, delta: torch.Tensor, A: torch.Tensor,
+                                A: torch.Tensor, B: torch.Tensor,
-            B: torch.Tensor, C: torch.Tensor, D_: Optional[torch.Tensor],
+                                C: torch.Tensor, D_: Optional[torch.Tensor],
-            z_: Optional[torch.Tensor], delta_bias_: Optional[torch.Tensor],
+                                z_: Optional[torch.Tensor],
-            delta_softplus: bool, index_: Optional[torch.Tensor],
+                                delta_bias_: Optional[torch.Tensor],
-            x: Optional[torch.Tensor]) -> List[torch.Tensor]:
+                                delta_softplus: bool,
-        a = torch.empty_like(u)
+                                cu_seq_len: Optional[torch.Tensor],
-        if z_ is not None:
+                                cache_indices: Optional[torch.Tensor],
-            c = torch.empty_like(z_)
+                                has_initial_state: Optional[torch.Tensor],
-            return [a, c]
+                                ssm_states: Optional[torch.Tensor]) -> None:
-        else:
+        return None
            return [a]
 # cutlass
@ -761,37 +762,37 @@ def ggml_mul_mat_a8(
 # mamba
 def causal_conv1d_fwd(x: torch.Tensor, weight: torch.Tensor,
                      bias_: Optional[torch.Tensor],
-                      seq_idx_: Optional[torch.Tensor],
+                      conv_states: Optional[torch.Tensor],
-                      initial_states_: Optional[torch.Tensor],
+                      query_start_loc: Optional[torch.Tensor],
-                      final_states_out_: Optional[torch.Tensor],
+                      cache_indices: Optional[torch.Tensor],
                      has_initial_state: Optional[torch.Tensor],
                      silu_activation: bool) -> torch.Tensor:
-    return torch.ops._C.causal_conv1d_fwd(x, weight, bias_, seq_idx_,
+    return torch.ops._C.causal_conv1d_fwd(x, weight, bias_, conv_states,
-                                          initial_states_, final_states_out_,
+                                          query_start_loc, cache_indices,
-                                          silu_activation)
+                                          has_initial_state, silu_activation)
 def causal_conv1d_update(
-    x: torch.Tensor,
+        x: torch.Tensor, conv_state: torch.Tensor, weight: torch.Tensor,
-    conv_state: torch.Tensor,
+        bias_: Optional[torch.Tensor], silu_activation: bool,
-    weight: torch.Tensor,
+        cache_seqlens: Optional[torch.Tensor],
-    bias_: Optional[torch.Tensor],
+        conv_state_indices: Optional[torch.Tensor]) -> torch.Tensor:
    silu_activation: bool,
    conv_state_indices: Optional[torch.Tensor],
 ) -> torch.Tensor:
    return torch.ops._C.causal_conv1d_update(x, conv_state, weight, bias_,
-                                             silu_activation,
+                                             silu_activation, cache_seqlens,
                                             conv_state_indices)
-def selective_scan_fwd(u: torch.Tensor, delta: torch.Tensor, A: torch.Tensor,
+def selective_scan_fwd(
-                       B: torch.Tensor, C: torch.Tensor,
+        u: torch.Tensor, delta: torch.Tensor, A: torch.Tensor, B: torch.Tensor,
-                       D_: Optional[torch.Tensor], z_: Optional[torch.Tensor],
+        C: torch.Tensor, D_: Optional[torch.Tensor],
-                       delta_bias_: Optional[torch.Tensor],
+        z_: Optional[torch.Tensor], delta_bias_: Optional[torch.Tensor],
-                       delta_softplus: bool, index_: Optional[torch.Tensor],
+        delta_softplus: bool, query_start_loc: Optional[torch.Tensor],
-                       x: Optional[torch.Tensor]) -> List[torch.Tensor]:
+        cache_indices: Optional[torch.Tensor],
-    return torch.ops._C.selective_scan_fwd(u, delta, A, B, C, D_, z_,
+        has_initial_state: Optional[torch.Tensor], ssm_states: torch.Tensor):
-                                           delta_bias_, delta_softplus, index_,
+    torch.ops._C.selective_scan_fwd(u, delta, A, B, C, D_, z_, delta_bias_,
-                                           x)
+                                    delta_softplus, query_start_loc,
                                    cache_indices, has_initial_state,
                                    ssm_states)
 # moe
--- a/vllm/model_executor/layers/mamba/ops/causal_conv1d.py
+++ b/vllm/model_executor/layers/mamba/ops/causal_conv1d.py
@ -12,59 +12,44 @@ def causal_conv1d_fn(
    x: torch.Tensor,
    weight: torch.Tensor,
    bias: Optional[torch.Tensor] = None,
-    seq_idx: Optional[torch.Tensor] = None,
+    query_start_loc: Optional[torch.Tensor] = None,
-    initial_states: Optional[torch.Tensor] = None,
+    cache_indices: Optional[torch.Tensor] = None,
-    return_final_states: bool = False,
+    has_initial_state: Optional[torch.Tensor] = None,
-    final_states_out=None,
+    conv_states: Optional[torch.Tensor] = None,
-    activation: str = "silu",
+    activation: Optional[str] = "silu",
 ):
    """
-    x: (batch, dim, seqlen)
+    x: (batch, dim, seqlen) or (dim,cu_seq_len) for varlen
        sequences are concatenated from left to right for varlen
    weight: (dim, width)
    bias: (dim,)
-    seq_idx: (batch, seqlen)
+    query_start_loc: (batch + 1) int32
-    initial_states: (batch, dim, width - 1)
+        The cumulative sequence lengths of the sequences in
-    final_states_out: (batch, dim, width - 1), to be written to
+        the batch, used to index into sequence. prepended by 0.
        for example: query_start_loc = torch.Tensor([0,10,16,17]), 
        x.shape=(dim,17)
    cache_indices: (batch)  int32
        indicates the corresponding state index, 
        like so: conv_state = conv_states[cache_indices[batch_id]]
    has_initial_state: (batch) bool
        indicates whether should the kernel take the current state as initial 
        state for the calculations
    conv_states: (...,dim,width - 1) itype
        updated inplace if provided
    activation: either None or "silu" or "swish"
    out: (batch, dim, seqlen)
    """
    if activation not in [None, "silu", "swish"]:
        raise NotImplementedError("activation must be None, silu, or swish")
-    if x.stride(2) != 1 and x.stride(1) != 1:
+    if x.stride(-1) != 1:
        x = x.contiguous()
    bias = bias.contiguous() if bias is not None else None
    if seq_idx is not None:
        assert (initial_states is
                None), "initial_states must be None if seq_idx is not None"
        assert (not return_final_states
                ), "If seq_idx is not None, we don't return final_states_out"
    seq_idx = seq_idx.contiguous() if seq_idx is not None else None
    if initial_states is not None and (initial_states.stride(2) != 1
                                       and initial_states.stride(1) != 1):
        initial_states = initial_states.contiguous()
    if return_final_states:
        assert (
            x.stride(1) == 1
        ), "Only channel-last layout support returning final_states_out"
        if final_states_out is not None:
            assert (final_states_out.stride(2) == 1
                    or final_states_out.stride(1) == 1)
        else:
            batch, dim, seqlen = x.shape
            width = weight.shape[1]
            final_states_out = torch.empty(batch,
                                           width - 1,
                                           dim,
                                           device=x.device,
                                           dtype=x.dtype).transpose(1, 2)
    else:
        final_states_out = None
-    out = ops.causal_conv1d_fwd(x, weight, bias, seq_idx, initial_states,
+    out = ops.causal_conv1d_fwd(x, weight, bias, conv_states, query_start_loc,
-                                final_states_out, activation
+                                cache_indices, has_initial_state, activation
                                in ["silu", "swish"])
-    return (out, None) if not return_final_states else (out, final_states_out)
+    return out
 def causal_conv1d_update(x: torch.Tensor,
@ -72,21 +57,33 @@ def causal_conv1d_update(x: torch.Tensor,
                         weight: torch.Tensor,
                         bias: Optional[torch.Tensor] = None,
                         activation: Optional[str] = None,
                         cache_seqlens: Optional[torch.Tensor] = None,
                         conv_state_indices: Optional[torch.Tensor] = None):
    """
-    x: (batch, dim)
+    x: (batch, dim) or (batch, dim, seqlen)
-    conv_state: (batch, dim, width)
+    conv_state: (batch, dim, state_len), where state_len >= width - 1
    weight: (dim, width)
    bias: (dim,)
    cache_seqlens: (batch,), dtype int32.
        If not None, the conv_state is treated as a circular buffer.
        The conv_state will be updated by copying x to the conv_state 
        starting at the index
        @cache_seqlens % state_len.
    conv_state_indices: (batch,), dtype int32
        If not None, the conv_state is a larger tensor along the batch dim, 
        and we are selecting the batch coords specified by conv_state_indices.
        Useful for a continuous batching scenario.
-    out: (batch, dim)
+    out: (batch, dim) or (batch, dim, seqlen)
    """
    if activation not in [None, "silu", "swish"]:
        raise NotImplementedError("activation must be None, silu, or swish")
-    activation_bool = activation in ["silu", "swish"]
+    activation_val = activation in ["silu", "swish"]
-    return ops.causal_conv1d_update(x, conv_state, weight, bias,
+    unsqueeze = x.dim() == 2
-                                    activation_bool, conv_state_indices)
+    if unsqueeze:
        x = x.unsqueeze(-1)
    out = ops.causal_conv1d_update(x, conv_state, weight, bias, activation_val,
                                   cache_seqlens, conv_state_indices)
    if unsqueeze:
        out = out.squeeze(-1)
    return out
--- a/vllm/model_executor/layers/mamba/ops/mamba_ssm.py
+++ b/vllm/model_executor/layers/mamba/ops/mamba_ssm.py
@ -1,6 +1,8 @@
 # Copyright (c) 2024, Tri Dao, Albert Gu.
 # Adapted from https://github.com/state-spaces/mamba/blob/main/mamba_ssm/ops/triton/selective_state_update.py
 from typing import Tuple
 import torch
 import triton
 import triton.language as tl
@ -317,7 +319,9 @@ def selective_state_update(state,
    return out
-def selective_scan_fn(u,
+def selective_scan_fn(
        u,
        ssm_states,
        delta,
        A,
        B,
@ -326,11 +330,39 @@ def selective_scan_fn(u,
        z=None,
        delta_bias=None,
        delta_softplus=False,
-                      return_last_state=False,
+        query_start_loc=None,
-                      position_indices=None,
+        cache_indices=None,
-                      prev_state=None):
+        has_initial_state=None) -> Tuple[torch.Tensor, torch.Tensor]:
-    """if return_last_state is True, returns (out, last_state)
+    """
    u: (dim, total_length) for varlen or (batch, dim, seqlen) 
    delta: (dim, total_length) for varlen or (batch, dim, seqlen)
    A: (dim, dstate) 
    B: (ngroups, dstate, total_length) for varlen or 
                                        (batch,ngroups,dstate,seqlen)
    C: (ngroups, dstate, total_length) for varlen or 
                                        (batch,ngroups,dstate,seqlen)
    D: (dim,) 
    z: (dim, total_length) for varlen or (batch, dim, seqlen) 
    dt_bias: (dim,) or (dim)
    query_start_loc: (batch + 1) int32
        The cumulative sequence lengths of the sequences in
        the batch, used to index into sequence. prepended with 0.
        for example: query_start_loc = torch.Tensor([0,10,16,17]), 
        x.shape=(dim,17)
    cache_indices: (batch) int32
        A tensor with each cell is a correspondent 
        input and output ssm_state index
    has_initial_state: (batch) bool
        A tensor populated with ones and zeros, 
        indicate if the ssm_state at the corresponding index should be 
        used as initial state. Not providing argument assumes 
        there's no initial state
    returns
        output: (dim, total_length) for varlen or (batch, dim, seqlen) 
                supports inplace replacement
        last_state has shape (batch, dim, dstate). 
                supports inplace replacement if ssm_state was provided
    """
    if u.stride(-1) != 1:
        u = u.contiguous()
@ -344,28 +376,20 @@ def selective_scan_fn(u,
        C = C.contiguous()
    if z is not None and z.stride(-1) != 1:
        z = z.contiguous()
-    if B.dim() == 3:
+    if B.dim() == 3 and query_start_loc is None:
        B = B.unsqueeze(1)
-    if C.dim() == 3:
+    if B.dim() == 2 and query_start_loc is not None:
        B = B.unsqueeze(0)
    if C.dim() == 3 and query_start_loc is None:
        C = C.unsqueeze(1)
-    n_chunks = int((u.shape[-1] + 2048 - 1) / 2048)
+    if C.dim() == 2 and query_start_loc is not None:
-    x = torch.zeros((
+        C = C.unsqueeze(0)
-        u.shape[0],
+
-        u.shape[1],
+    ops.selective_scan_fwd(u, delta, A, B, C, D, z, delta_bias, delta_softplus,
-        n_chunks,
+                           query_start_loc, cache_indices, has_initial_state,
-        int(A.shape[1] * 2),
+                           ssm_states)
-    ),
+
                    device=u.device,
                    dtype=torch.float32,
                    requires_grad=False)
    x[:, :, 0, 0::2] = 1
    if prev_state is not None:
        x[:, :, 0, 1::2].copy_(prev_state)
    out, *rest = ops.selective_scan_fwd(u, delta, A, B, C, D, z, delta_bias,
                                        delta_softplus, position_indices, x)
    last_state = x[:, :, -1, 1::2]  # (batch, dim, dstate)
    if z is None:
-        return out if not return_last_state else (out, last_state)
+        return delta  # output written inplace to delta
    else:
-        out_z = rest[0]
+        return z  # output written inplace to z
        return out_z if not return_last_state else (out_z, last_state)
--- a/vllm/model_executor/models/jamba.py
+++ b/vllm/model_executor/models/jamba.py
@ -138,42 +138,47 @@ class JambaMambaMixer(nn.Module):
        self.c_layernorm = RMSNorm(self.ssm_state_size,
                                   eps=config.rms_norm_eps)
-    def mamba_forward(self,
+    def forward(self, hidden_states: torch.Tensor,
-                      hidden_states: torch.Tensor,
+                attn_metadata: AttentionMetadata, conv_state: torch.Tensor,
-                      cache_params: MambaCacheParams = None):
+                ssm_state: torch.Tensor):
        # 1. Gated MLP's linear projection
-        projected_states = self.in_proj(hidden_states)[0].transpose(1, 2)
+        projected_states = self.in_proj(hidden_states)[0].transpose(-2, -1)
-        hidden_states, gate = projected_states.chunk(2, dim=1)
+        hidden_states, gate = projected_states.chunk(2, dim=-2)
        # 2. Convolution sequence transformation
        conv_weights = self.conv1d.weight.view(self.conv1d.weight.size(0),
                                               self.conv1d.weight.size(2))
        if cache_params is not None and not cache_params.is_prompt:
            hidden_states = causal_conv1d_update(
                hidden_states.squeeze(-1),
                cache_params.conv_state,
                conv_weights,
                self.conv1d.bias,
                self.activation,
            )
            hidden_states = hidden_states.unsqueeze(-1)
        else:
            if cache_params is not None:
                conv_states = nn.functional.pad(
                    hidden_states,
                    (self.conv_kernel_size - hidden_states.shape[-1], 0))
                cache_params.conv_state.copy_(conv_states)
-            hidden_states, _ = causal_conv1d_fn(
+        if attn_metadata.query_start_loc is not None \
            and attn_metadata.context_lens_tensor is not None:
            # |---------- N-1 iteration --------|
            # |---------------- N iteration ---------------------|
            # |- tokenA -|......................|-- newTokens ---|
            # |---------- context_len ----------|
            # |-------------------- seq_len ---------------------|
            #                                   |-- query_len ---|
            hidden_states = causal_conv1d_fn(
                hidden_states,
                conv_weights,
                self.conv1d.bias,
                activation=self.activation,
                conv_states=conv_state,
                has_initial_state=attn_metadata.context_lens_tensor > 0,
                query_start_loc=attn_metadata.query_start_loc)
        else:
            hidden_states = causal_conv1d_update(
                hidden_states.transpose(0, 1),
                conv_state,
                conv_weights,
                self.conv1d.bias,
                self.activation,
            )
            hidden_states = hidden_states.transpose(0, 1)
        # 3. State Space Model sequence transformation
        # 3.a. input varying initialization of time_step, B and C
-        ssm_parameters = self.x_proj(hidden_states.transpose(1, 2))[0]
+        ssm_parameters = self.x_proj(hidden_states.transpose(-2, -1))[0]
        time_step, B, C = torch.split(
            ssm_parameters,
@ -184,72 +189,46 @@ class JambaMambaMixer(nn.Module):
        B = self.b_layernorm(B.contiguous())
        C = self.c_layernorm(C.contiguous())
-        discrete_time_step = self.dt_proj(time_step)[0].transpose(1, 2)
+        discrete_time_step = self.dt_proj(time_step)[0].transpose(-2, -1)
        # 3.c perform the recurrence y ← SSM(A, B, C)(x)
        time_proj_bias = (self.dt_proj.bias.float() if hasattr(
            self.dt_proj, "bias") else None)
-        if cache_params is not None and not cache_params.is_prompt:
+
-            scan_outputs = selective_state_update(
+        if attn_metadata.query_start_loc is not None \
-                cache_params.ssm_state,
+            and attn_metadata.context_lens_tensor is not None:
-                hidden_states[..., 0],
+            scan_outputs = selective_scan_fn(
                discrete_time_step[..., 0],
                self.A,
                B[:, 0],
                C[:, 0],
                self.D,
                gate[..., 0],
                time_proj_bias,
                dt_softplus=True,
            ).unsqueeze(-1)
        else:
            scan_outputs, ssm_state = selective_scan_fn(
                hidden_states,
                ssm_state,
                discrete_time_step,
                self.A,
-                B.transpose(1, 2),
+                B.transpose(-2, -1),
-                C.transpose(1, 2),
+                C.transpose(-2, -1),
                self.D.float(),
                gate,
                time_proj_bias,
                delta_softplus=True,
-                return_last_state=True,
+                has_initial_state=attn_metadata.context_lens_tensor > 0,
                query_start_loc=attn_metadata.query_start_loc)
        else:
            scan_outputs = selective_state_update(
                ssm_state,
                hidden_states.transpose(0, 1),
                discrete_time_step.transpose(0, 1),
                self.A,
                B,
                C,
                self.D,
                gate.transpose(0, 1),
                time_proj_bias,
                dt_softplus=True,
            )
-            if ssm_state is not None and cache_params is not None:
+            scan_outputs = scan_outputs.transpose(0, 1)
                cache_params.ssm_state.copy_(ssm_state)
        # 4. Final linear projection
-        contextualized_states = self.out_proj(scan_outputs.transpose(1, 2))[0]
+        contextualized_states = self.out_proj(scan_outputs.transpose(-2,
                                                                     -1))[0]
        return contextualized_states
    def forward(
        self,
        hidden_states: torch.Tensor,
        attn_metadata: AttentionMetadata,
        conv_state: torch.Tensor,
        ssm_state: torch.Tensor,
    ):
        if attn_metadata.prefill_metadata is not None:
            offset = 0
            for i, prompt_len in enumerate(
                    attn_metadata.prefill_metadata.seq_lens):
                cache = MambaCacheParams(True,
                                         conv_state=conv_state[i].unsqueeze(0),
                                         ssm_state=ssm_state[i].unsqueeze(0))
                hidden_states[offset:offset + prompt_len].copy_(
                    self.mamba_forward(hidden_states[offset:offset +
                                                     prompt_len].unsqueeze(0),
                                       cache_params=cache)[0])
                offset += prompt_len
        else:
            cache = MambaCacheParams(False,
                                     conv_state=conv_state,
                                     ssm_state=ssm_state)
            hidden_states = self.mamba_forward(hidden_states.unsqueeze(1),
                                               cache_params=cache)
            hidden_states = hidden_states.squeeze(1)
        return hidden_states
 class JambaMoE(nn.Module):
@ -571,8 +550,6 @@ class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA):
        lora_config: Optional[LoRAConfig] = None,
        scheduler_config: Optional[SchedulerConfig] = None,
    ) -> None:
        assert not scheduler_config.chunked_prefill_enabled, \
            "Jamba currently does not support chunked prefill"
        assert not cache_config.enable_prefix_caching, \
            "Jamba currently does not support prefix caching"
@ -616,18 +593,10 @@ class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA):
        if "seqlen_agnostic_capture_inputs" not in kwargs:
            # We get here only on Prefill/Eager mode runs
            assert all(
                key in kwargs
                for key in ["request_ids_to_seq_ids", "finished_requests_ids"])
            request_ids_to_seq_ids = kwargs["request_ids_to_seq_ids"]
            finished_requests_ids = kwargs["finished_requests_ids"]
-            self._release_mamba_cache(finished_requests_ids)
+            mamba_cache = self._release_finished_and_prepare_mamba_cache(
-            batch_size = input_ids.shape[0]
+                finished_requests_ids, request_ids_to_seq_ids)
            if attn_metadata.prefill_metadata:
                batch_size = len(request_ids_to_seq_ids)
            mamba_cache = self._prepare_current_run_mamba_cache(
                request_ids_to_seq_ids, batch_size, finished_requests_ids)
        else:
            # CUDA graph capturing runs
            mamba_cache = kwargs["seqlen_agnostic_capture_inputs"]
@ -699,13 +668,15 @@ class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA):
    def _prepare_current_run_mamba_cache(
            self, request_ids_to_seq_ids: Dict[str, list[int]],
-            batch_size: int, finished_requests_ids: List[str]):
+            finished_requests_ids: List[str]
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        running_indices = []
        request_ids_to_seq_ids_flatten = [
            (req_id, seq_id)
            for req_id, seq_ids in request_ids_to_seq_ids.items()
            for seq_id in seq_ids
        ]
        batch_size = len(request_ids_to_seq_ids_flatten)
        for dest_index, (request_id,
                         seq_id) in enumerate(request_ids_to_seq_ids_flatten):
            if request_id in finished_requests_ids:
@ -769,22 +740,21 @@ class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA):
                    seq_ids2index.update({seq_id: to_index})
                    return
    def _release_finished_and_prepare_mamba_cache(
            self, finished_requests_ids,
            request_ids_to_seq_ids) -> Tuple[torch.Tensor, torch.Tensor]:
        self._release_mamba_cache(finished_requests_ids)
        return self._prepare_current_run_mamba_cache(request_ids_to_seq_ids,
                                                     finished_requests_ids)
    def copy_inputs_before_cuda_graphs(self, input_buffers, **kwargs):
        """
        Copy the relevant Mamba cache into the CUDA graph input buffer 
        that was provided during the capture runs 
        (JambaForCausalLM.mamba_gc_cache_buffer). 
        """
-        assert all(
+        self._release_finished_and_prepare_mamba_cache(
-            key in kwargs
+            kwargs["finished_requests_ids"], kwargs["request_ids_to_seq_ids"])
            for key in ["request_ids_to_seq_ids", "finished_requests_ids"])
        finished_requests_ids = kwargs["finished_requests_ids"]
        self._release_mamba_cache(finished_requests_ids)
        request_ids_to_seq_ids = kwargs["request_ids_to_seq_ids"]
        cg_batch_size = input_buffers['input_ids'].shape[0]
        self._prepare_current_run_mamba_cache(request_ids_to_seq_ids,
                                              cg_batch_size,
                                              finished_requests_ids)
    def get_seqlen_agnostic_capture_inputs(self, batch_size: int):
        """
@ -819,7 +789,7 @@ class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA):
        hidden_size = self.config.hidden_size
        conv_state_shape = (
            self.config.mamba_expand * hidden_size // world_size,
-            self.config.mamba_d_conv,
+            self.config.mamba_d_conv - 1,
        )
        temporal_state_shape = (
            self.config.mamba_expand * self.config.hidden_size // world_size,