Change the name to vLLM (#150)

Woosuk Kwon 2023-06-17 03:07:40 -07:00 committed by GitHub
parent e5464ee484
commit 0b98ba15c7
90 changed files with 342 additions and 339 deletions

View File

@ -1,6 +1,6 @@
# Contributing to CacheFlow # Contributing to vLLM
Thank you for your interest in contributing to CacheFlow! Thank you for your interest in contributing to vLLM!
Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large. Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large.
There are several ways you can contribute to the project: There are several ways you can contribute to the project:
@ -11,9 +11,9 @@ There are several ways you can contribute to the project:
However, remember that contributions aren't just about code. However, remember that contributions aren't just about code.
We believe in the power of community support; thus, answering queries, assisting others, and enhancing the documentation are highly regarded and beneficial contributions. We believe in the power of community support; thus, answering queries, assisting others, and enhancing the documentation are highly regarded and beneficial contributions.
Finally, one of the most impactful ways to support us is by raising awareness about CacheFlow. Finally, one of the most impactful ways to support us is by raising awareness about vLLM.
Talk about it in your blog posts, highlighting how it's driving your incredible projects. Talk about it in your blog posts, highlighting how it's driving your incredible projects.
Express your support on Twitter if CacheFlow aids you, or simply offer your appreciation by starring our repository. Express your support on Twitter if vLLM aids you, or simply offer your appreciation by starring our repository.
## Setup for development ## Setup for development
@ -70,5 +70,5 @@ If a comment isn't clear or you disagree with a suggestion, feel free to ask for
### Thank You ### Thank You
Finally, thank you for taking the time to read these guidelines and for your interest in contributing to CacheFlow. Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM.
Your contributions make CacheFlow a great tool for everyone! Your contributions make vLLM a great tool for everyone!

View File

@ -1,4 +1,4 @@
# CacheFlow # vLLM
## Build from source ## Build from source
@ -28,7 +28,7 @@ python examples/simple_server.py --help
To start the server: To start the server:
```bash ```bash
ray start --head ray start --head
python -m cacheflow.entrypoints.fastapi_server # --model <your_model> python -m vllm.entrypoints.fastapi_server # --model <your_model>
``` ```
To test the server: To test the server:
@ -45,9 +45,9 @@ pip install gradio
Start the server: Start the server:
```bash ```bash
python -m cacheflow.http_frontend.fastapi_frontend python -m vllm.http_frontend.fastapi_frontend
# At another terminal # At another terminal
python -m cacheflow.http_frontend.gradio_webserver python -m vllm.http_frontend.gradio_webserver
``` ```
## Load LLaMA weights ## Load LLaMA weights
@ -62,5 +62,5 @@ Since LLaMA weight is not fully public, we cannot directly download the LLaMA we
2. For all the commands above, specify the model with `--model /output/path/llama-7b` to load the model. For example: 2. For all the commands above, specify the model with `--model /output/path/llama-7b` to load the model. For example:
```bash ```bash
python simple_server.py --model /output/path/llama-7b python simple_server.py --model /output/path/llama-7b
python -m cacheflow.http_frontend.fastapi_frontend --model /output/path/llama-7b python -m vllm.http_frontend.fastapi_frontend --model /output/path/llama-7b
``` ```

View File

@ -1,4 +1,4 @@
# Benchmarking CacheFlow # Benchmarking vLLM
## Downloading the ShareGPT dataset ## Downloading the ShareGPT dataset

View File

@ -11,7 +11,7 @@ def main(args: argparse.Namespace):
for i in range(args.n_threads)] for i in range(args.n_threads)]
api_url = f"http://{args.host}:{args.port}/generate" api_url = f"http://{args.host}:{args.port}/generate"
headers = {"User-Agent": "CacheFlow Benchmark Client"} headers = {"User-Agent": "vLLM Benchmark Client"}
ploads = [{ ploads = [{
"prompt": p, "prompt": p,
"max_tokens": args.max_tokens, "max_tokens": args.max_tokens,

View File

@ -6,7 +6,7 @@ import numpy as np
import torch import torch
from tqdm import tqdm from tqdm import tqdm
from cacheflow import LLM, SamplingParams from vllm import LLM, SamplingParams
def main(args: argparse.Namespace): def main(args: argparse.Namespace):

View File

@ -1,8 +1,8 @@
"""Benchmark online serving throughput. """Benchmark online serving throughput.
On the server side, run one of the following commands: On the server side, run one of the following commands:
(CacheFlow backend) (vLLM backend)
python -m cacheflow.entrypoints.api_server \ python -m vllm.entrypoints.api_server \
--disable-log-requests --model <your_model> --disable-log-requests --model <your_model>
(TGI backend) (TGI backend)
@ -114,7 +114,7 @@ async def send_request(
request_start_time = time.time() request_start_time = time.time()
headers = {"User-Agent": "Benchmark Client"} headers = {"User-Agent": "Benchmark Client"}
if backend == "cacheflow": if backend == "vllm":
pload = { pload = {
"prompt": prompt, "prompt": prompt,
"n": 1, "n": 1,
@ -213,8 +213,8 @@ def main(args: argparse.Namespace):
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description="Benchmark the online serving throughput.") description="Benchmark the online serving throughput.")
parser.add_argument("--backend", type=str, default="cacheflow", parser.add_argument("--backend", type=str, default="vllm",
choices=["cacheflow", "tgi"]) choices=["vllm", "tgi"])
parser.add_argument("--host", type=str, default="localhost") parser.add_argument("--host", type=str, default="localhost")
parser.add_argument("--port", type=int, default=8001) parser.add_argument("--port", type=int, default=8001)
parser.add_argument("--dataset", type=str, required=True, parser.add_argument("--dataset", type=str, required=True,

View File

@ -5,12 +5,13 @@ import random
import time import time
from typing import List, Tuple from typing import List, Tuple
from cacheflow import LLM, SamplingParams
import torch import torch
from transformers import (AutoConfig, AutoTokenizer, AutoModelForCausalLM, from transformers import (AutoConfig, AutoTokenizer, AutoModelForCausalLM,
PreTrainedTokenizerBase) PreTrainedTokenizerBase)
from tqdm import tqdm from tqdm import tqdm
from vllm import LLM, SamplingParams
def get_tokenizer(model_name: str) -> PreTrainedTokenizerBase: def get_tokenizer(model_name: str) -> PreTrainedTokenizerBase:
config = AutoConfig.from_pretrained(model_name) config = AutoConfig.from_pretrained(model_name)
@ -70,7 +71,7 @@ def sample_requests(
return sampled_requests return sampled_requests
def run_cacheflow( def run_vllm(
requests: List[Tuple[str, int, int]], requests: List[Tuple[str, int, int]],
model: str, model: str,
tensor_parallel_size: int, tensor_parallel_size: int,
@ -172,8 +173,8 @@ def main(args: argparse.Namespace):
tokenizer = get_tokenizer(args.model) tokenizer = get_tokenizer(args.model)
requests = sample_requests(args.dataset, args.num_prompts, tokenizer) requests = sample_requests(args.dataset, args.num_prompts, tokenizer)
if args.backend == "cacheflow": if args.backend == "vllm":
elapsed_time = run_cacheflow( elapsed_time = run_vllm(
requests, args.model, args.tensor_parallel_size, args.seed, args.n, requests, args.model, args.tensor_parallel_size, args.seed, args.n,
args.use_beam_search) args.use_beam_search)
elif args.backend == "hf": elif args.backend == "hf":
@ -192,8 +193,8 @@ def main(args: argparse.Namespace):
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Benchmark the throughput.") parser = argparse.ArgumentParser(description="Benchmark the throughput.")
parser.add_argument("--backend", type=str, choices=["cacheflow", "hf"], parser.add_argument("--backend", type=str, choices=["vllm", "hf"],
default="cacheflow") default="vllm")
parser.add_argument("--dataset", type=str, required=True, parser.add_argument("--dataset", type=str, required=True,
help="Path to the dataset.") help="Path to the dataset.")
parser.add_argument("--model", type=str, default="facebook/opt-125m") parser.add_argument("--model", type=str, default="facebook/opt-125m")
@ -207,7 +208,7 @@ if __name__ == "__main__":
parser.add_argument("--hf-max-batch-size", type=int, default=None, parser.add_argument("--hf-max-batch-size", type=int, default=None,
help="Maximum batch size for HF backend.") help="Maximum batch size for HF backend.")
args = parser.parse_args() args = parser.parse_args()
if args.backend == "cacheflow": if args.backend == "vllm":
if args.hf_max_batch_size is not None: if args.hf_max_batch_size is not None:
raise ValueError("HF max batch size is only for HF backend.") raise ValueError("HF max batch size is only for HF backend.")
elif args.backend == "hf": elif args.backend == "hf":

View File

@ -1,18 +0,0 @@
from cacheflow.engine.arg_utils import EngineArgs
from cacheflow.engine.llm_engine import LLMEngine
from cacheflow.engine.ray_utils import initialize_cluster
from cacheflow.entrypoints.llm import LLM
from cacheflow.outputs import CompletionOutput, RequestOutput
from cacheflow.sampling_params import SamplingParams
__version__ = "0.1.0"
__all__ = [
"LLM",
"SamplingParams",
"RequestOutput",
"CompletionOutput",
"LLMEngine",
"EngineArgs",
"initialize_cluster",
]

View File

@ -1,10 +0,0 @@
from cacheflow.model_executor.input_metadata import InputMetadata
from cacheflow.model_executor.model_loader import get_model
from cacheflow.model_executor.utils import set_random_seed
__all__ = [
"InputMetadata",
"get_model",
"set_random_seed",
]

View File

@ -1,12 +0,0 @@
from cacheflow.model_executor.models.gpt_neox import GPTNeoXForCausalLM
from cacheflow.model_executor.models.gpt2 import GPT2LMHeadModel
from cacheflow.model_executor.models.llama import LlamaForCausalLM
from cacheflow.model_executor.models.opt import OPTForCausalLM
__all__ = [
"GPT2LMHeadModel",
"GPTNeoXForCausalLM",
"LlamaForCausalLM",
"OPTForCausalLM",
]

View File

@ -1,7 +1,7 @@
#include <torch/extension.h> #include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h> #include <ATen/cuda/CUDAContext.h>
namespace cacheflow { namespace vllm {
template<typename T> template<typename T>
__device__ __forceinline__ T silu(const T& x) { __device__ __forceinline__ T silu(const T& x) {
@ -22,7 +22,7 @@ __global__ void silu_and_mul_kernel(
} }
} }
} // namespace cacheflow } // namespace vllm
void silu_and_mul( void silu_and_mul(
torch::Tensor& out, // [num_tokens, d] torch::Tensor& out, // [num_tokens, d]
@ -40,7 +40,7 @@ void silu_and_mul(
input.scalar_type(), input.scalar_type(),
"silu_and_mul_kernel", "silu_and_mul_kernel",
[&] { [&] {
cacheflow::silu_and_mul_kernel<scalar_t><<<grid, block, 0, stream>>>( vllm::silu_and_mul_kernel<scalar_t><<<grid, block, 0, stream>>>(
out.data_ptr<scalar_t>(), out.data_ptr<scalar_t>(),
input.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(),
d); d);

View File

@ -1,6 +1,6 @@
/* /*
* Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
* Copyright (c) 2023, The CacheFlow team. * Copyright (c) 2023, The vLLM team.
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
@ -19,7 +19,7 @@
#include <stdint.h> #include <stdint.h>
namespace cacheflow { namespace vllm {
// A vector type to store Q, K, V elements. // A vector type to store Q, K, V elements.
template<typename T, int VEC_SIZE> template<typename T, int VEC_SIZE>
@ -61,4 +61,4 @@ inline __device__ void zero(T& dst) {
dst = tmp.raw; dst = tmp.raw;
} }
} // namespace cacheflow } // namespace vllm

View File

@ -1,6 +1,6 @@
/* /*
* Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
* Copyright (c) 2023, The CacheFlow team. * Copyright (c) 2023, The vLLM team.
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
@ -27,7 +27,7 @@
#define MAX(a, b) ((a) > (b) ? (a) : (b)) #define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b)) #define MIN(a, b) ((a) < (b) ? (a) : (b))
namespace cacheflow { namespace vllm {
// Utility function for attention softmax. // Utility function for attention softmax.
template<int NUM_WARPS> template<int NUM_WARPS>
@ -315,10 +315,10 @@ __global__ void single_query_cached_kv_attention_kernel(
} }
} }
} // namespace cacheflow } // namespace vllm
#define LAUNCH_ATTENTION_KERNEL(T, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS) \ #define LAUNCH_ATTENTION_KERNEL(T, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS) \
cacheflow::single_query_cached_kv_attention_kernel<T, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS> \ vllm::single_query_cached_kv_attention_kernel<T, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS> \
<<<grid, block, shared_mem_size, stream>>>( \ <<<grid, block, shared_mem_size, stream>>>( \
out_ptr, \ out_ptr, \
query_ptr, \ query_ptr, \

View File

@ -1,6 +1,6 @@
/* /*
* Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
* Copyright (c) 2023, The CacheFlow team. * Copyright (c) 2023, The vLLM team.
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
@ -22,7 +22,7 @@
#include <float.h> #include <float.h>
#include <type_traits> #include <type_traits>
namespace cacheflow { namespace vllm {
// Q*K^T operation. // Q*K^T operation.
template<int THREAD_GROUP_SIZE, typename Vec, int N> template<int THREAD_GROUP_SIZE, typename Vec, int N>
@ -52,4 +52,4 @@ struct Qk_dot {
} }
}; };
} // namespace cacheflow } // namespace vllm

View File

@ -1,7 +1,7 @@
/* /*
* Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
* and https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h * and https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
* Copyright (c) 2023, The CacheFlow team. * Copyright (c) 2023, The vLLM team.
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
@ -25,7 +25,7 @@
#include <cuda_fp16.h> #include <cuda_fp16.h>
#include <stdint.h> #include <stdint.h>
namespace cacheflow { namespace vllm {
// Define custom BF16 vector data types. // Define custom BF16 vector data types.
struct bf16_4_t { struct bf16_4_t {
@ -420,4 +420,4 @@ inline __device__ void from_float(bf16_8_t& dst, Float8_ src) {
#endif #endif
} }
} // namespace cacheflow } // namespace vllm

View File

@ -1,7 +1,7 @@
/* /*
* Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
* and https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h * and https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
* Copyright (c) 2023, The CacheFlow team. * Copyright (c) 2023, The vLLM team.
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
@ -23,7 +23,7 @@
#include <stdint.h> #include <stdint.h>
namespace cacheflow { namespace vllm {
// FP16 vector types for Q, K, V. // FP16 vector types for Q, K, V.
template<> template<>
@ -441,4 +441,4 @@ inline __device__ Float8_ to_float(uint4 u) {
return tmp; return tmp;
} }
} // namespace cacheflow } // namespace vllm

View File

@ -1,7 +1,7 @@
/* /*
* Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
* and https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h * and https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
* Copyright (c) 2023, The CacheFlow team. * Copyright (c) 2023, The vLLM team.
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
@ -22,7 +22,7 @@
#include <stdint.h> #include <stdint.h>
namespace cacheflow { namespace vllm {
// Define custom FP32 vector data types. // Define custom FP32 vector data types.
struct Float4_ { struct Float4_ {
@ -265,4 +265,4 @@ inline __device__ Float8_ to_float(Float8_ u) {
return u; return u;
} }
} // namespace cacheflow } // namespace vllm

View File

@ -46,7 +46,7 @@ void swap_blocks(
} }
} }
namespace cacheflow { namespace vllm {
// Grid: (num_layers, num_pairs) // Grid: (num_layers, num_pairs)
template<typename scalar_t> template<typename scalar_t>
@ -77,7 +77,7 @@ __global__ void copy_blocks_kernel(
} }
} }
} // namespace cacheflow } // namespace vllm
void copy_blocks( void copy_blocks(
std::vector<torch::Tensor>& key_caches, std::vector<torch::Tensor>& key_caches,
@ -129,7 +129,7 @@ void copy_blocks(
at::ScalarType::Half, at::ScalarType::Half,
at::ScalarType::BFloat16, at::ScalarType::BFloat16,
key_caches[0].scalar_type(), "copy_blocks_kernel", ([&] { key_caches[0].scalar_type(), "copy_blocks_kernel", ([&] {
cacheflow::copy_blocks_kernel<scalar_t><<<grid, block, 0, stream>>>( vllm::copy_blocks_kernel<scalar_t><<<grid, block, 0, stream>>>(
key_cache_ptrs_tensor.data_ptr<int64_t>(), key_cache_ptrs_tensor.data_ptr<int64_t>(),
value_cache_ptrs_tensor.data_ptr<int64_t>(), value_cache_ptrs_tensor.data_ptr<int64_t>(),
block_mapping_tensor.data_ptr<int>(), block_mapping_tensor.data_ptr<int>(),
@ -137,7 +137,7 @@ void copy_blocks(
})); }));
} }
namespace cacheflow { namespace vllm {
template<typename scalar_t> template<typename scalar_t>
__global__ void reshape_and_cache_kernel( __global__ void reshape_and_cache_kernel(
@ -181,7 +181,7 @@ __global__ void reshape_and_cache_kernel(
} }
} }
} // namespace cacheflow } // namespace vllm
void reshape_and_cache( void reshape_and_cache(
torch::Tensor& key, // [num_tokens, num_heads, head_size] torch::Tensor& key, // [num_tokens, num_heads, head_size]
@ -208,7 +208,7 @@ void reshape_and_cache(
key.scalar_type(), key.scalar_type(),
"reshape_and_cache_kernel", "reshape_and_cache_kernel",
[&] { [&] {
cacheflow::reshape_and_cache_kernel<scalar_t><<<grid, block, 0, stream>>>( vllm::reshape_and_cache_kernel<scalar_t><<<grid, block, 0, stream>>>(
key.data_ptr<scalar_t>(), key.data_ptr<scalar_t>(),
value.data_ptr<scalar_t>(), value.data_ptr<scalar_t>(),
key_cache.data_ptr<scalar_t>(), key_cache.data_ptr<scalar_t>(),
@ -223,7 +223,7 @@ void reshape_and_cache(
}); });
} }
namespace cacheflow { namespace vllm {
// Grid: (num_blocks, block_size). // Grid: (num_blocks, block_size).
template<typename scalar_t> template<typename scalar_t>
@ -343,7 +343,7 @@ __global__ void gather_cached_kv_kernel_optimized(
} }
} }
} // namespace cacheflow } // namespace vllm
void gather_cached_kv( void gather_cached_kv(
torch::Tensor& key, // [out] [num_tokens, num_heads, head_size] torch::Tensor& key, // [out] [num_tokens, num_heads, head_size]
@ -370,7 +370,7 @@ void gather_cached_kv(
key.scalar_type(), key.scalar_type(),
"gather_cached_kv_kernel_optimized", "gather_cached_kv_kernel_optimized",
[&] { [&] {
cacheflow::gather_cached_kv_kernel_optimized<scalar_t><<<grid, block, 0, stream>>>( vllm::gather_cached_kv_kernel_optimized<scalar_t><<<grid, block, 0, stream>>>(
key.data_ptr<scalar_t>(), key.data_ptr<scalar_t>(),
value.data_ptr<scalar_t>(), value.data_ptr<scalar_t>(),
key_cache.data_ptr<scalar_t>(), key_cache.data_ptr<scalar_t>(),

View File

@ -3,7 +3,7 @@
#include "reduction_utils.cuh" #include "reduction_utils.cuh"
namespace cacheflow { namespace vllm {
// TODO(woosuk): Further optimize this kernel. // TODO(woosuk): Further optimize this kernel.
template<typename scalar_t> template<typename scalar_t>
@ -33,7 +33,7 @@ __global__ void rms_norm_kernel(
} }
} }
} // namespace cacheflow } // namespace vllm
void rms_norm( void rms_norm(
torch::Tensor& out, // [num_tokens, hidden_size] torch::Tensor& out, // [num_tokens, hidden_size]
@ -52,7 +52,7 @@ void rms_norm(
input.scalar_type(), input.scalar_type(),
"rms_norm_kernel", "rms_norm_kernel",
[&] { [&] {
cacheflow::rms_norm_kernel<scalar_t><<<grid, block, 0, stream>>>( vllm::rms_norm_kernel<scalar_t><<<grid, block, 0, stream>>>(
out.data_ptr<scalar_t>(), out.data_ptr<scalar_t>(),
input.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(),
weight.data_ptr<scalar_t>(), weight.data_ptr<scalar_t>(),

View File

@ -1,7 +1,7 @@
#include <torch/extension.h> #include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h> #include <ATen/cuda/CUDAContext.h>
namespace cacheflow { namespace vllm {
template<typename scalar_t> template<typename scalar_t>
__global__ void rotary_embedding_neox_kernel( __global__ void rotary_embedding_neox_kernel(
@ -46,7 +46,7 @@ __global__ void rotary_embedding_neox_kernel(
} }
} }
} // namespace cacheflow } // namespace vllm
void rotary_embedding_neox( void rotary_embedding_neox(
torch::Tensor& positions, // [num_tokens] torch::Tensor& positions, // [num_tokens]
@ -70,7 +70,7 @@ void rotary_embedding_neox(
query.scalar_type(), query.scalar_type(),
"rotary_embedding_neox", "rotary_embedding_neox",
[&] { [&] {
cacheflow::rotary_embedding_neox_kernel<scalar_t><<<grid, block, 0, stream>>>( vllm::rotary_embedding_neox_kernel<scalar_t><<<grid, block, 0, stream>>>(
positions.data_ptr<int64_t>(), positions.data_ptr<int64_t>(),
query.data_ptr<scalar_t>(), query.data_ptr<scalar_t>(),
key.data_ptr<scalar_t>(), key.data_ptr<scalar_t>(),

View File

@ -1,6 +1,6 @@
/* /*
* Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/reduce_kernel_utils.cuh * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/reduce_kernel_utils.cuh
* Copyright (c) 2023, The CacheFlow team. * Copyright (c) 2023, The vLLM team.
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
@ -17,7 +17,7 @@
*/ */
#pragma once #pragma once
namespace cacheflow { namespace vllm {
template<typename T> template<typename T>
__inline__ __device__ T warpReduceSum(T val) { __inline__ __device__ T warpReduceSum(T val) {
@ -48,4 +48,4 @@ __inline__ __device__ T blockReduceSum(T val) {
return val; return val;
} }
} // namespace cacheflow } // namespace vllm

View File

@ -1,4 +1,4 @@
# CacheFlow documents # vLLM documents
## Build the docs ## Build the docs

View File

@ -17,9 +17,9 @@
# -- Project information ----------------------------------------------------- # -- Project information -----------------------------------------------------
project = 'CacheFlow' project = 'vLLM'
copyright = '2023, CacheFlow Team' copyright = '2023, vLLM Team'
author = 'the CacheFlow Team' author = 'the vLLM Team'
# -- General configuration --------------------------------------------------- # -- General configuration ---------------------------------------------------
@ -55,7 +55,7 @@ html_title = project
html_theme = 'sphinx_book_theme' html_theme = 'sphinx_book_theme'
html_theme_options = { html_theme_options = {
'path_to_docs': 'docs/source', 'path_to_docs': 'docs/source',
'repository_url': 'https://github.com/WoosukKwon/cacheflow', 'repository_url': 'https://github.com/WoosukKwon/vllm',
'use_repository_button': True, 'use_repository_button': True,
} }

View File

@ -1,8 +1,8 @@
Installation Installation
============ ============
CacheFlow is a Python library that includes some C++ and CUDA code. vLLM is a Python library that includes some C++ and CUDA code.
CacheFlow can run on systems that meet the following requirements: vLLM can run on systems that meet the following requirements:
* OS: Linux * OS: Linux
* Python: 3.8 or higher * Python: 3.8 or higher
@ -10,23 +10,23 @@ CacheFlow can run on systems that meet the following requirements:
* GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, etc.) * GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, etc.)
.. note:: .. note::
As of now, CacheFlow does not support CUDA 12. As of now, vLLM does not support CUDA 12.
If you are using Hopper or Lovelace GPUs, please use CUDA 11.8. If you are using Hopper or Lovelace GPUs, please use CUDA 11.8.
.. tip:: .. tip::
If you have trouble installing CacheFlow, we recommend using the NVIDIA PyTorch Docker image. If you have trouble installing vLLM, we recommend using the NVIDIA PyTorch Docker image.
.. code-block:: console .. code-block:: console
$ # Pull the Docker image with CUDA 11.8. $ # Pull the Docker image with CUDA 11.8.
$ docker run --gpus all -it --rm --shm-size=8g nvcr.io/nvidia/pytorch:22.12-py3 $ docker run --gpus all -it --rm --shm-size=8g nvcr.io/nvidia/pytorch:22.12-py3
Inside the Docker container, please execute :code:`pip uninstall torch` before installing CacheFlow. Inside the Docker container, please execute :code:`pip uninstall torch` before installing vLLM.
Install with pip Install with pip
---------------- ----------------
You can install CacheFlow using pip: You can install vLLM using pip:
.. code-block:: console .. code-block:: console
@ -34,8 +34,8 @@ You can install CacheFlow using pip:
$ conda create -n myenv python=3.8 -y $ conda create -n myenv python=3.8 -y
$ conda activate myenv $ conda activate myenv
$ # Install CacheFlow. $ # Install vLLM.
$ pip install cacheflow # This may take 5-10 minutes. $ pip install vllm # This may take 5-10 minutes.
.. _build_from_source: .. _build_from_source:
@ -43,10 +43,10 @@ You can install CacheFlow using pip:
Build from source Build from source
----------------- -----------------
You can also build and install CacheFlow from source. You can also build and install vLLM from source.
.. code-block:: console .. code-block:: console
$ git clone https://github.com/WoosukKwon/cacheflow.git $ git clone https://github.com/WoosukKwon/vllm.git
$ cd cacheflow $ cd vllm
$ pip install -e . # This may take 5-10 minutes. $ pip install -e . # This may take 5-10 minutes.
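
Either install path can be sanity-checked with a short import; the version string below comes from the new `vllm/__init__.py` added later in this commit.

```python
# Quick post-install check: the package now resolves under its new name.
import vllm

print(vllm.__version__)  # "0.1.0" at this commit
```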

View File

@ -8,7 +8,7 @@ Placeholder.
.. code-block:: python .. code-block:: python
from cacheflow import LLM, SamplingParams from vllm import LLM, SamplingParams
# Sample prompts. # Sample prompts.
prompts = [ prompts = [

View File

@ -1,5 +1,5 @@
Welcome to CacheFlow! Welcome to vLLM!
===================== ================
Documentation Documentation
------------- -------------

View File

@ -3,30 +3,30 @@
Adding a New Model Adding a New Model
================== ==================
This document provides a high-level guide on integrating a `HuggingFace Transformers <https://github.com/huggingface/transformers>`_ model into CacheFlow. This document provides a high-level guide on integrating a `HuggingFace Transformers <https://github.com/huggingface/transformers>`_ model into vLLM.
.. note:: .. note::
The complexity of adding a new model depends heavily on the model's architecture. The complexity of adding a new model depends heavily on the model's architecture.
The process is considerably straightforward if the model shares a similar architecture with an existing model in CacheFlow. The process is considerably straightforward if the model shares a similar architecture with an existing model in vLLM.
However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex. However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex.
.. tip:: .. tip::
If you are encountering issues while integrating your model into CacheFlow, feel free to open an issue on our `GitHub <https://github.com/WoosukKwon/cacheflow/issues>`_ repository. If you are encountering issues while integrating your model into vLLM, feel free to open an issue on our `GitHub <https://github.com/WoosukKwon/vllm/issues>`_ repository.
We will be happy to help you out! We will be happy to help you out!
0. Fork the CacheFlow repository 0. Fork the vLLM repository
-------------------------------- --------------------------------
Start by forking our `GitHub <https://github.com/WoosukKwon/cacheflow/issues>`_ repository and then :ref:`build it from source <build_from_source>`. Start by forking our `GitHub <https://github.com/WoosukKwon/vllm/issues>`_ repository and then :ref:`build it from source <build_from_source>`.
This gives you the ability to modify the codebase and test your model. This gives you the ability to modify the codebase and test your model.
1. Bring your model code 1. Bring your model code
------------------------ ------------------------
Clone the PyTorch model code from the HuggingFace Transformers repository and put it into the `cacheflow/model_executor/models <https://github.com/WoosukKwon/cacheflow/tree/main/cacheflow/model_executor/models>`_ directory. Clone the PyTorch model code from the HuggingFace Transformers repository and put it into the `vllm/model_executor/models <https://github.com/WoosukKwon/vllm/tree/main/vllm/model_executor/models>`_ directory.
For instance, CacheFlow's `OPT model <https://github.com/WoosukKwon/cacheflow/blob/main/cacheflow/model_executor/models/opt.py>`_ was adapted from HuggingFace's `modeling_opt.py <https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py>`_ file. For instance, vLLM's `OPT model <https://github.com/WoosukKwon/vllm/blob/main/vllm/model_executor/models/opt.py>`_ was adapted from HuggingFace's `modeling_opt.py <https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py>`_ file.
.. warning:: .. warning::
When copying the model code, make sure to review and adhere to the code's copyright and licensing terms. When copying the model code, make sure to review and adhere to the code's copyright and licensing terms.
@ -62,11 +62,11 @@ Next, you need to rewrite the :code:`forward` methods of your model by following
+) -> Dict[int, SequenceOutputs]: +) -> Dict[int, SequenceOutputs]:
3. Update the code by considering that :code:`input_ids` and :code:`positions` are now flattened tensors. 3. Update the code by considering that :code:`input_ids` and :code:`positions` are now flattened tensors.
4. Replace the attention operation with either :code:`GPTCacheFlowAttention` or :code:`GPTNeoXCacheFlowAttention`, depending on the model's architecture. 4. Replace the attention operation with either :code:`GPTPagedAttention` or :code:`GPTNeoXPagedAttention`, depending on the model's architecture.
.. note:: .. note::
Currently, CacheFlow supports the basic multi-head attention mechanism and its variant with rotary positional embeddings. Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings.
If your model employs a different attention mechanism, you will need to implement a new attention layer in CacheFlow. If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM.
3. (Optional) Implement tensor parallelism support 3. (Optional) Implement tensor parallelism support
@ -91,4 +91,4 @@ While the process is straightforward for most layers, the tensor-parallel layers
5. Register your model 5. Register your model
---------------------- ----------------------
Finally, include your :code:`*ForCausalLM` class in `cacheflow/model_executor/models/__init__.py <https://github.com/WoosukKwon/cacheflow/blob/main/cacheflow/model_executor/models/__init__.py>`_ and register it to the :code:`_MODEL_REGISTRY` in `cacheflow/model_executor/model_loader.py <https://github.com/WoosukKwon/cacheflow/blob/main/cacheflow/model_executor/model_loader.py>`_. Finally, include your :code:`*ForCausalLM` class in `vllm/model_executor/models/__init__.py <https://github.com/WoosukKwon/vllm/blob/main/vllm/model_executor/models/__init__.py>`_ and register it to the :code:`_MODEL_REGISTRY` in `vllm/model_executor/model_loader.py <https://github.com/WoosukKwon/vllm/blob/main/vllm/model_executor/model_loader.py>`_.
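
As a rough illustration of this final step, the registry could look like the sketch below; the exact shape of `_MODEL_REGISTRY` is not shown in this diff, so the dictionary and module paths here are assumptions.

```python
# vllm/model_executor/model_loader.py -- hypothetical sketch, structure assumed.
from vllm.model_executor.models import (GPT2LMHeadModel, GPTNeoXForCausalLM,
                                        LlamaForCausalLM, OPTForCausalLM)

# Maps the architecture name reported by the HuggingFace config to the vLLM class.
_MODEL_REGISTRY = {
    "GPT2LMHeadModel": GPT2LMHeadModel,
    "GPTNeoXForCausalLM": GPTNeoXForCausalLM,
    "LlamaForCausalLM": LlamaForCausalLM,
    "OPTForCausalLM": OPTForCausalLM,
}
```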

View File

@ -3,8 +3,8 @@
Supported Models Supported Models
================ ================
CacheFlow supports a variety of generative Transformer models in `HuggingFace Transformers <https://github.com/huggingface/transformers>`_. vLLM supports a variety of generative Transformer models in `HuggingFace Transformers <https://github.com/huggingface/transformers>`_.
The following is the list of model architectures that are currently supported by CacheFlow. The following is the list of model architectures that are currently supported by vLLM.
Alongside each architecture, we include some popular models that use it. Alongside each architecture, we include some popular models that use it.
.. list-table:: .. list-table::
@ -22,19 +22,19 @@ Alongside each architecture, we include some popular models that use it.
* - :code:`OPTForCausalLM` * - :code:`OPTForCausalLM`
- OPT, OPT-IML - OPT, OPT-IML
If your model uses one of the above model architectures, you can seamlessly run your model with CacheFlow. If your model uses one of the above model architectures, you can seamlessly run your model with vLLM.
Otherwise, please refer to :ref:`Adding a New Model <adding_a_new_model>` for instructions on how to implement support for your model. Otherwise, please refer to :ref:`Adding a New Model <adding_a_new_model>` for instructions on how to implement support for your model.
Alternatively, you can raise an issue on our `GitHub <https://github.com/WoosukKwon/cacheflow/issues>`_ project. Alternatively, you can raise an issue on our `GitHub <https://github.com/WoosukKwon/vllm/issues>`_ project.
.. tip:: .. tip::
The easiest way to check if your model is supported is to run the program below: The easiest way to check if your model is supported is to run the program below:
.. code-block:: python .. code-block:: python
from cacheflow import LLM from vllm import LLM
llm = LLM(model=...) # Name or path of your model llm = LLM(model=...) # Name or path of your model
output = llm.generate("Hello, my name is") output = llm.generate("Hello, my name is")
print(output) print(output)
If CacheFlow successfully generates text, it indicates that your model is supported. If vLLM successfully generates text, it indicates that your model is supported.

View File

@ -1,4 +1,4 @@
"""Example Python client for cacheflow.entrypoints.api_server""" """Example Python client for vllm.entrypoints.api_server"""
import argparse import argparse
import json import json

View File

@ -6,7 +6,7 @@ import requests
def http_bot(prompt): def http_bot(prompt):
headers = {"User-Agent": "Cacheflow Client"} headers = {"User-Agent": "vLLM Client"}
pload = { pload = {
"prompt": prompt, "prompt": prompt,
"stream": True, "stream": True,
@ -24,7 +24,7 @@ def http_bot(prompt):
def build_demo(): def build_demo():
with gr.Blocks() as demo: with gr.Blocks() as demo:
gr.Markdown( gr.Markdown(
"# Cacheflow text completion demo\n" "# vLLM text completion demo\n"
) )
inputbox = gr.Textbox(label="Input", placeholder="Enter text and press ENTER") inputbox = gr.Textbox(label="Input", placeholder="Enter text and press ENTER")
outputbox = gr.Textbox(label="Output", placeholder="Generated result from the model") outputbox = gr.Textbox(label="Output", placeholder="Generated result from the model")
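
`http_bot` above posts with `"stream": True`; a minimal sketch of consuming such a streaming response is shown below. The target URL, the chunk delimiter, and the `"text"` field are assumptions rather than details taken from this diff.

```python
import json

import requests

# Hypothetical streaming client; URL, delimiter, and response fields are assumed.
response = requests.post("http://localhost:8001/generate",
                         headers={"User-Agent": "vLLM Client"},
                         json={"prompt": "San Francisco is a", "stream": True},
                         stream=True)
for chunk in response.iter_lines(delimiter=b"\0"):
    if chunk:
        data = json.loads(chunk.decode("utf-8"))
        print(data["text"])
```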

View File

@ -1,6 +1,6 @@
import argparse import argparse
from cacheflow import EngineArgs, LLMEngine, SamplingParams from vllm import EngineArgs, LLMEngine, SamplingParams
def main(args: argparse.Namespace): def main(args: argparse.Namespace):

View File

@ -1,4 +1,4 @@
from cacheflow import LLM, SamplingParams from vllm import LLM, SamplingParams
# Sample prompts. # Sample prompts.
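
The rest of this example is truncated in the diff; below is a minimal sketch of how it plausibly continues, using only the `LLM`, `SamplingParams`, and `RequestOutput` surface visible elsewhere in this commit. The sampling arguments and output attributes are assumptions.

```python
from vllm import LLM, SamplingParams

# Sample prompts.
prompts = [
    "Hello, my name is",
    "The capital of France is",
]
# Sampling settings; the temperature/max_tokens argument names are assumed.
sampling_params = SamplingParams(temperature=0.8, max_tokens=64)

# Create an LLM (model name reused from the docs above) and generate completions.
llm = LLM(model="facebook/opt-125m")
outputs = llm.generate(prompts, sampling_params)

for output in outputs:
    # `prompt` and `outputs[0].text` are assumed RequestOutput fields.
    print(f"Prompt: {output.prompt!r}, Generated: {output.outputs[0].text!r}")
```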

View File

@ -1,6 +1,6 @@
import openai import openai
# Modify OpenAI's API key and API base to use CacheFlow's API server. # Modify OpenAI's API key and API base to use vLLM's API server.
openai.api_key = "EMPTY" openai.api_key = "EMPTY"
openai.api_base = "http://localhost:8000/v1" openai.api_base = "http://localhost:8000/v1"
model = "facebook/opt-125m" model = "facebook/opt-125m"

View File

@ -3,6 +3,6 @@ python_version = 3.8
ignore_missing_imports = True ignore_missing_imports = True
files = cacheflow files = vllm
# TODO(woosuk): Include the code from Megatron and HuggingFace. # TODO(woosuk): Include the code from Megatron and HuggingFace.
exclude = cacheflow/model_executor/parallel_utils/|cacheflow/model_executor/models/ exclude = vllm/model_executor/parallel_utils/|vllm/model_executor/models/

View File

@ -75,7 +75,7 @@ ext_modules = []
# Cache operations. # Cache operations.
cache_extension = CUDAExtension( cache_extension = CUDAExtension(
name="cacheflow.cache_ops", name="vllm.cache_ops",
sources=["csrc/cache.cpp", "csrc/cache_kernels.cu"], sources=["csrc/cache.cpp", "csrc/cache_kernels.cu"],
extra_compile_args={"cxx": CXX_FLAGS, "nvcc": NVCC_FLAGS}, extra_compile_args={"cxx": CXX_FLAGS, "nvcc": NVCC_FLAGS},
) )
@ -83,7 +83,7 @@ ext_modules.append(cache_extension)
# Attention kernels. # Attention kernels.
attention_extension = CUDAExtension( attention_extension = CUDAExtension(
name="cacheflow.attention_ops", name="vllm.attention_ops",
sources=["csrc/attention.cpp", "csrc/attention/attention_kernels.cu"], sources=["csrc/attention.cpp", "csrc/attention/attention_kernels.cu"],
extra_compile_args={"cxx": CXX_FLAGS, "nvcc": NVCC_FLAGS}, extra_compile_args={"cxx": CXX_FLAGS, "nvcc": NVCC_FLAGS},
) )
@ -91,7 +91,7 @@ ext_modules.append(attention_extension)
# Positional encoding kernels. # Positional encoding kernels.
positional_encoding_extension = CUDAExtension( positional_encoding_extension = CUDAExtension(
name="cacheflow.pos_encoding_ops", name="vllm.pos_encoding_ops",
sources=["csrc/pos_encoding.cpp", "csrc/pos_encoding_kernels.cu"], sources=["csrc/pos_encoding.cpp", "csrc/pos_encoding_kernels.cu"],
extra_compile_args={"cxx": CXX_FLAGS, "nvcc": NVCC_FLAGS}, extra_compile_args={"cxx": CXX_FLAGS, "nvcc": NVCC_FLAGS},
) )
@ -99,7 +99,7 @@ ext_modules.append(positional_encoding_extension)
# Layer normalization kernels. # Layer normalization kernels.
layernorm_extension = CUDAExtension( layernorm_extension = CUDAExtension(
name="cacheflow.layernorm_ops", name="vllm.layernorm_ops",
sources=["csrc/layernorm.cpp", "csrc/layernorm_kernels.cu"], sources=["csrc/layernorm.cpp", "csrc/layernorm_kernels.cu"],
extra_compile_args={"cxx": CXX_FLAGS, "nvcc": NVCC_FLAGS}, extra_compile_args={"cxx": CXX_FLAGS, "nvcc": NVCC_FLAGS},
) )
@ -107,7 +107,7 @@ ext_modules.append(layernorm_extension)
# Activation kernels. # Activation kernels.
activation_extension = CUDAExtension( activation_extension = CUDAExtension(
name="cacheflow.activation_ops", name="vllm.activation_ops",
sources=["csrc/activation.cpp", "csrc/activation_kernels.cu"], sources=["csrc/activation.cpp", "csrc/activation_kernels.cu"],
extra_compile_args={"cxx": CXX_FLAGS, "nvcc": NVCC_FLAGS}, extra_compile_args={"cxx": CXX_FLAGS, "nvcc": NVCC_FLAGS},
) )
@ -144,18 +144,18 @@ def get_requirements() -> List[str]:
setuptools.setup( setuptools.setup(
name="cacheflow", name="vllm",
version=find_version(get_path("cacheflow", "__init__.py")), version=find_version(get_path("vllm", "__init__.py")),
author="CacheFlow Team", author="vLLM Team",
author_email="cacheflow@gmail.com", author_email="vllm@gmail.com", # FIXME
license="Apache 2.0", license="Apache 2.0",
description="CacheFlow: A high-performance LLM Serving System", description="vLLM: Easy, Fast, and Cheap LLM Serving with PagedAttention", # FIXME
long_description=read_readme(), long_description=read_readme(),
long_description_content_type="text/markdown", long_description_content_type="text/markdown",
url="https://github.com/WoosukKwon/cacheflow", url="https://github.com/WoosukKwon/vllm",
project_urls={ project_urls={
"Homepage": "https://github.com/WoosukKwon/cacheflow", "Homepage": "https://github.com/WoosukKwon/vllm",
"Documentation": "https://cacheflow.readthedocs.io/en/latest/", "Documentation": "https://vllm.readthedocs.io/en/latest/", # FIXME
}, },
classifiers=[ classifiers=[
"Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.8",

View File

@ -1,7 +1,7 @@
import torch import torch
import torch.nn.functional as F import torch.nn.functional as F
from cacheflow import activation_ops from vllm import activation_ops
def ref_silu_and_mul(x: torch.Tensor) -> torch.Tensor: def ref_silu_and_mul(x: torch.Tensor) -> torch.Tensor:
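
For context, the test above compares `vllm.activation_ops.silu_and_mul` against a PyTorch reference; a plausible version of both sides is sketched below. The gated split (`silu(x1) * x2`) and the input shape `[num_tokens, 2 * d]` are inferred, not spelled out in this diff.

```python
import torch
import torch.nn.functional as F

from vllm import activation_ops  # compiled extension declared in setup.py above


def ref_silu_and_mul(x: torch.Tensor) -> torch.Tensor:
    # Assumed semantics: split the last dimension in half and gate with SiLU.
    x1, x2 = x.chunk(2, dim=-1)
    return F.silu(x1) * x2


# Requires a CUDA GPU and the built extension. The output shape [num_tokens, d]
# matches the kernel's comment; the 2 * d input width is an assumption.
x = torch.randn(16, 2 * 512, dtype=torch.half, device="cuda")
out = torch.empty(16, 512, dtype=torch.half, device="cuda")
activation_ops.silu_and_mul(out, x)
assert torch.allclose(out, ref_silu_and_mul(x), atol=1e-3, rtol=1e-3)
```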

View File

@ -5,7 +5,7 @@ import torch
from xformers import ops as xops from xformers import ops as xops
from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask
from cacheflow import attention_ops from vllm import attention_ops
MAX_SEQ_LEN = 4096 MAX_SEQ_LEN = 4096
TEST_SEED = 0 TEST_SEED = 0

View File

@ -2,7 +2,7 @@ import random
import torch import torch
from cacheflow import cache_ops from vllm import cache_ops
@torch.inference_mode() @torch.inference_mode()

View File

@ -1,7 +1,7 @@
import torch import torch
import torch.nn as nn import torch.nn as nn
from cacheflow import layernorm_ops from vllm import layernorm_ops
class RefRMSNorm(nn.Module): class RefRMSNorm(nn.Module):
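
`RefRMSNorm` itself is not shown in this diff; a standard plain-PyTorch RMSNorm of the kind such a reference usually is appears below. The epsilon default and weight initialization are assumptions.

```python
import torch
import torch.nn as nn


class RefRMSNorm(nn.Module):
    """Reference RMSNorm to compare against vllm.layernorm_ops.rms_norm."""

    def __init__(self, hidden_size: int, eps: float = 1e-6) -> None:
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        input_dtype = x.dtype
        # x / sqrt(mean(x^2) + eps), scaled by a learned weight.
        variance = x.float().pow(2).mean(dim=-1, keepdim=True)
        x = x.float() * torch.rsqrt(variance + self.variance_epsilon)
        return (self.weight * x).to(input_dtype)
```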

View File

@ -4,7 +4,7 @@ import torch
import torch.nn as nn import torch.nn as nn
import torch.nn.functional as F import torch.nn.functional as F
from cacheflow import pos_encoding_ops from vllm import pos_encoding_ops
def rotate_half(x: torch.Tensor) -> torch.Tensor: def rotate_half(x: torch.Tensor) -> torch.Tensor:
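
`rotate_half` is the usual GPT-NeoX-style rotary helper; a likely definition, plus the way it is typically applied, is sketched below. The application step is an assumption about what the rest of this test does.

```python
import torch


def rotate_half(x: torch.Tensor) -> torch.Tensor:
    # Split the last dimension in half and rotate: (x1, x2) -> (-x2, x1).
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)


def ref_rotary_embedding(q: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
    # Reference rotary embedding: mix the query with its rotated half.
    return q * cos + rotate_half(q) * sin
```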

vllm/__init__.py (new file, 21 lines)
View File

@ -0,0 +1,21 @@
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.llm_engine import LLMEngine
from vllm.engine.ray_utils import initialize_cluster
from vllm.entrypoints.llm import LLM
from vllm.outputs import CompletionOutput, RequestOutput
from vllm.sampling_params import SamplingParams
__version__ = "0.1.0"
__all__ = [
"LLM",
"SamplingParams",
"RequestOutput",
"CompletionOutput",
"LLMEngine",
"EngineArgs",
"AsyncLLMEngine",
"AsyncEngineArgs",
"initialize_cluster",
]

View File

@ -1,7 +1,7 @@
"""Token blocks.""" """Token blocks."""
from typing import List from typing import List
from cacheflow.utils import Device from vllm.utils import Device
_BLANK_TOKEN_ID = -1 _BLANK_TOKEN_ID = -1

View File

@ -3,8 +3,8 @@ from typing import Optional
import torch import torch
from transformers import AutoConfig, PretrainedConfig from transformers import AutoConfig, PretrainedConfig
from cacheflow.logger import init_logger from vllm.logger import init_logger
from cacheflow.utils import get_cpu_memory from vllm.utils import get_cpu_memory
logger = init_logger(__name__) logger = init_logger(__name__)
@ -87,7 +87,7 @@ class CacheConfig:
Args: Args:
block_size: Size of a cache block in number of tokens. block_size: Size of a cache block in number of tokens.
gpu_memory_utilization: Fraction of GPU memory to use for the gpu_memory_utilization: Fraction of GPU memory to use for the
CacheFlow execution. vLLM execution.
swap_space: Size of the CPU swap space per GPU (in GiB). swap_space: Size of the CPU swap space per GPU (in GiB).
""" """
def __init__( def __init__(
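
The constructor is cut off in this hunk; as a sketch, instantiating the config described by the docstring might look like the following, with the argument list assumed from the documented fields.

```python
from vllm.config import CacheConfig

# Hypothetical instantiation: 16-token blocks, 90% of GPU memory for the KV
# cache, and 4 GiB of CPU swap space per GPU. Argument names mirror the docstring.
cache_config = CacheConfig(
    block_size=16,
    gpu_memory_utilization=0.90,
    swap_space=4,
)
```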

View File

@ -1,9 +1,9 @@
"""A block manager that manages token blocks.""" """A block manager that manages token blocks."""
from typing import Dict, List, Optional, Set, Tuple from typing import Dict, List, Optional, Set, Tuple
from cacheflow.block import PhysicalTokenBlock from vllm.block import PhysicalTokenBlock
from cacheflow.sequence import Sequence, SequenceGroup, SequenceStatus from vllm.sequence import Sequence, SequenceGroup, SequenceStatus
from cacheflow.utils import Device from vllm.utils import Device
class BlockAllocator: class BlockAllocator:

View File

@ -1,6 +1,6 @@
from typing import List from typing import List
from cacheflow.sequence import SequenceGroup from vllm.sequence import SequenceGroup
class Policy: class Policy:

View File

@ -2,11 +2,11 @@ import enum
import time import time
from typing import Dict, List, Optional, Tuple from typing import Dict, List, Optional, Tuple
from cacheflow.config import CacheConfig, SchedulerConfig from vllm.config import CacheConfig, SchedulerConfig
from cacheflow.core.block_manager import BlockSpaceManager from vllm.core.block_manager import BlockSpaceManager
from cacheflow.core.policy import PolicyFactory from vllm.core.policy import PolicyFactory
from cacheflow.logger import init_logger from vllm.logger import init_logger
from cacheflow.sequence import (Sequence, SequenceData, SequenceGroup, from vllm.sequence import (Sequence, SequenceData, SequenceGroup,
SequenceGroupMetadata, SequenceOutputs, SequenceGroupMetadata, SequenceOutputs,
SequenceStatus) SequenceStatus)

View File

@ -3,13 +3,13 @@ import dataclasses
from dataclasses import dataclass from dataclasses import dataclass
from typing import Optional, Tuple from typing import Optional, Tuple
from cacheflow.config import (CacheConfig, ModelConfig, ParallelConfig, from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
SchedulerConfig) SchedulerConfig)
@dataclass @dataclass
class EngineArgs: class EngineArgs:
"""Arguments for CacheFlow engine.""" """Arguments for vLLM engine."""
model: str model: str
download_dir: Optional[str] = None download_dir: Optional[str] = None
use_np_weights: bool = False use_np_weights: bool = False
@ -33,7 +33,7 @@ class EngineArgs:
def add_cli_args( def add_cli_args(
parser: argparse.ArgumentParser, parser: argparse.ArgumentParser,
) -> argparse.ArgumentParser: ) -> argparse.ArgumentParser:
"""Shared CLI arguments for CacheFlow engine.""" """Shared CLI arguments for vLLM engine."""
# Model arguments # Model arguments
parser.add_argument('--model', type=str, default='facebook/opt-125m', parser.add_argument('--model', type=str, default='facebook/opt-125m',
help='name or path of the huggingface model to use') help='name or path of the huggingface model to use')
@ -118,7 +118,7 @@ class EngineArgs:
@dataclass @dataclass
class AsyncEngineArgs(EngineArgs): class AsyncEngineArgs(EngineArgs):
"""Arguments for asynchronous CacheFlow engine.""" """Arguments for asynchronous vLLM engine."""
engine_use_ray: bool = False engine_use_ray: bool = False
disable_log_requests: bool = False disable_log_requests: bool = False

View File

@ -2,12 +2,12 @@ import asyncio
import time import time
from typing import Dict, List, Optional from typing import Dict, List, Optional
from cacheflow.engine.arg_utils import AsyncEngineArgs from vllm.engine.arg_utils import AsyncEngineArgs
from cacheflow.engine.llm_engine import LLMEngine from vllm.engine.llm_engine import LLMEngine
from cacheflow.engine.ray_utils import initialize_cluster, ray from vllm.engine.ray_utils import initialize_cluster, ray
from cacheflow.logger import init_logger from vllm.logger import init_logger
from cacheflow.outputs import RequestOutput from vllm.outputs import RequestOutput
from cacheflow.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
logger = init_logger(__name__) logger = init_logger(__name__)
@ -104,7 +104,7 @@ class AsyncLLMEngine:
arrival_time = time.time() arrival_time = time.time()
# Create an event to notify us that there is new output from the # Create an event to notify us that there is new output from the
# cacheflow engine. # vLLM engine.
request_event = asyncio.Event() request_event = asyncio.Event()
self.request_events[request_id] = request_event self.request_events[request_id] = request_event
@ -114,7 +114,7 @@ class AsyncLLMEngine:
f"sampling params: {sampling_params}, " f"sampling params: {sampling_params}, "
f"prompt token ids: {prompt_token_ids}.") f"prompt token ids: {prompt_token_ids}.")
# Add the request into the cacheflow engine's waiting queue. # Add the request into the vLLM engine's waiting queue.
if self.engine_use_ray: if self.engine_use_ray:
await self.engine.add_request.remote( await self.engine.add_request.remote(
request_id, prompt, sampling_params, request_id, prompt, sampling_params,
@ -126,7 +126,7 @@ class AsyncLLMEngine:
prompt_token_ids=prompt_token_ids, prompt_token_ids=prompt_token_ids,
arrival_time=arrival_time) arrival_time=arrival_time)
# The cacheflow engine does not have a background loop that keeps # The vLLM engine does not have a background loop that keeps
# processing incoming requests. Therefore, we need to keep kicking # processing incoming requests. Therefore, we need to keep kicking
# the engine to process the requests. # the engine to process the requests.
while True: while True:
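
The comment above describes repeatedly "kicking" the engine because it has no background loop; a heavily simplified sketch of that pattern for the synchronous `LLMEngine` is shown below. `step()` and the `finished`/`request_id` attributes are assumptions about the engine interface, not taken from this diff.

```python
# Hypothetical driver loop illustrating the "keep kicking the engine" pattern.
def drive_until_finished(engine, request_id, prompt, sampling_params):
    # add_request(request_id, prompt, sampling_params) mirrors the call shown above.
    engine.add_request(request_id, prompt, sampling_params)
    while True:
        request_outputs = engine.step()  # assumed: runs one scheduling/decoding iteration
        for request_output in request_outputs:
            if request_output.request_id == request_id and request_output.finished:
                return request_output    # assumed RequestOutput attributes
```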

View File

@ -1,19 +1,18 @@
import time import time
from typing import Any, List, Optional from typing import Any, List, Optional
-from cacheflow.config import (CacheConfig, ModelConfig, ParallelConfig,
-                              SchedulerConfig)
-from cacheflow.core.scheduler import Scheduler
-from cacheflow.engine.arg_utils import EngineArgs
-from cacheflow.engine.ray_utils import DeviceID, initialize_cluster, ray
-from cacheflow.engine.tokenizer_utils import (detokenize_incrementally,
-                                              get_tokenizer)
-from cacheflow.logger import init_logger
-from cacheflow.outputs import RequestOutput
-from cacheflow.sampling_params import SamplingParams
-from cacheflow.sequence import Sequence, SequenceGroup, SequenceStatus
-from cacheflow.utils import Counter
-from cacheflow.worker.worker import Worker
+from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
+                         SchedulerConfig)
+from vllm.core.scheduler import Scheduler
+from vllm.engine.arg_utils import EngineArgs
+from vllm.engine.ray_utils import DeviceID, initialize_cluster, ray
+from vllm.engine.tokenizer_utils import detokenize_incrementally, get_tokenizer
+from vllm.logger import init_logger
+from vllm.outputs import RequestOutput
+from vllm.sampling_params import SamplingParams
+from vllm.sequence import Sequence, SequenceGroup, SequenceStatus
+from vllm.utils import Counter
+from vllm.worker.worker import Worker
logger = init_logger(__name__) logger = init_logger(__name__)
@ -21,7 +20,7 @@ logger = init_logger(__name__)
class LLMEngine: class LLMEngine:
"""An LLM engine that receives requests and generates texts. """An LLM engine that receives requests and generates texts.
This is the main class for the CacheFlow LLM engine. It receives requests This is the main class for the vLLM engine. It receives requests
from clients and generates texts from the LLM. It includes a tokenizer, a from clients and generates texts from the LLM. It includes a tokenizer, a
language model (possibly distributed across multiple GPUs), and GPU memory language model (possibly distributed across multiple GPUs), and GPU memory
space allocated for intermediate states (aka KV cache). This class utilizes space allocated for intermediate states (aka KV cache). This class utilizes

View File

@ -6,7 +6,7 @@ try:
except ImportError: except ImportError:
ray = None ray = None
from cacheflow.config import ParallelConfig from vllm.config import ParallelConfig
DeviceID = Tuple[int, Optional[str], int] # rank, node resource (node IP), device id DeviceID = Tuple[int, Optional[str], int] # rank, node resource (node IP), device id

View File

@ -3,7 +3,7 @@ from typing import List, Tuple, Union
from transformers import (AutoConfig, AutoTokenizer, PreTrainedTokenizer, from transformers import (AutoConfig, AutoTokenizer, PreTrainedTokenizer,
PreTrainedTokenizerFast) PreTrainedTokenizerFast)
from cacheflow.logger import init_logger from vllm.logger import init_logger
logger = init_logger(__name__) logger = init_logger(__name__)

View File

@ -6,10 +6,10 @@ from fastapi import BackgroundTasks, FastAPI, Request
from fastapi.responses import Response, StreamingResponse from fastapi.responses import Response, StreamingResponse
import uvicorn import uvicorn
from cacheflow.engine.arg_utils import AsyncEngineArgs from vllm.engine.arg_utils import AsyncEngineArgs
from cacheflow.engine.async_llm_engine import AsyncLLMEngine from vllm.engine.async_llm_engine import AsyncLLMEngine
from cacheflow.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from cacheflow.utils import random_uuid from vllm.utils import random_uuid
TIMEOUT_KEEP_ALIVE = 5 # seconds. TIMEOUT_KEEP_ALIVE = 5 # seconds.
TIMEOUT_TO_PREVENT_DEADLOCK = 1 # seconds TIMEOUT_TO_PREVENT_DEADLOCK = 1 # seconds

View File

@ -3,11 +3,11 @@ from typing import List, Optional, Union
from tqdm import tqdm from tqdm import tqdm
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
from cacheflow.engine.arg_utils import EngineArgs from vllm.engine.arg_utils import EngineArgs
from cacheflow.engine.llm_engine import LLMEngine from vllm.engine.llm_engine import LLMEngine
from cacheflow.outputs import RequestOutput from vllm.outputs import RequestOutput
from cacheflow.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from cacheflow.utils import Counter from vllm.utils import Counter
class LLM: class LLM:

View File

@ -13,17 +13,17 @@ from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, StreamingResponse from fastapi.responses import JSONResponse, StreamingResponse
import uvicorn import uvicorn
from cacheflow.engine.arg_utils import AsyncEngineArgs from vllm.engine.arg_utils import AsyncEngineArgs
from cacheflow.engine.async_llm_engine import AsyncLLMEngine from vllm.engine.async_llm_engine import AsyncLLMEngine
from cacheflow.engine.tokenizer_utils import get_tokenizer from vllm.engine.tokenizer_utils import get_tokenizer
from cacheflow.entrypoints.openai.protocol import ( from vllm.entrypoints.openai.protocol import (
CompletionRequest, CompletionResponse, CompletionResponseChoice, CompletionRequest, CompletionResponse, CompletionResponseChoice,
CompletionResponseStreamChoice, CompletionStreamResponse, ErrorResponse, CompletionResponseStreamChoice, CompletionStreamResponse, ErrorResponse,
LogProbs, ModelCard, ModelList, ModelPermission, UsageInfo) LogProbs, ModelCard, ModelList, ModelPermission, UsageInfo)
from cacheflow.logger import init_logger from vllm.logger import init_logger
from cacheflow.outputs import RequestOutput from vllm.outputs import RequestOutput
from cacheflow.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from cacheflow.utils import random_uuid from vllm.utils import random_uuid
TIMEOUT_KEEP_ALIVE = 5 # seconds TIMEOUT_KEEP_ALIVE = 5 # seconds
@ -93,11 +93,11 @@ async def create_completion(raw_request: Request):
for the API specification. This API mimics the OpenAI Completion API. for the API specification. This API mimics the OpenAI Completion API.
NOTE: Currently we do not support the following features: NOTE: Currently we do not support the following features:
- echo (since the cacheflow engine does not currently support - echo (since the vLLM engine does not currently support
getting the logprobs of prompt tokens) getting the logprobs of prompt tokens)
- suffix (the language models we currently support do not support - suffix (the language models we currently support do not support
suffix) suffix)
- logit_bias (to be supported in cacheflow engine) - logit_bias (to be supported by vLLM engine)
""" """
request = CompletionRequest(**await raw_request.json()) request = CompletionRequest(**await raw_request.json())
logger.info(f"Received completion request: {request}") logger.info(f"Received completion request: {request}")
@ -107,7 +107,7 @@ async def create_completion(raw_request: Request):
return error_check_ret return error_check_ret
if request.echo: if request.echo:
# We do not support echo since the cacheflow engine does not # We do not support echo since the vLLM engine does not
# currently support getting the logprobs of prompt tokens. # currently support getting the logprobs of prompt tokens.
return create_error_response(HTTPStatus.BAD_REQUEST, return create_error_response(HTTPStatus.BAD_REQUEST,
"echo is not currently supported") "echo is not currently supported")
@ -118,7 +118,7 @@ async def create_completion(raw_request: Request):
"suffix is not currently supported") "suffix is not currently supported")
if request.logit_bias is not None: if request.logit_bias is not None:
# TODO: support logit_bias in cacheflow engine. # TODO: support logit_bias in vLLM engine.
return create_error_response(HTTPStatus.BAD_REQUEST, return create_error_response(HTTPStatus.BAD_REQUEST,
"logit_bias is not currently supported") "logit_bias is not currently supported")
@ -274,7 +274,7 @@ async def create_completion(raw_request: Request):
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description="CacheFlow OpenAI-Compatible RESTful API server." description="vLLM OpenAI-Compatible RESTful API server."
) )
parser.add_argument("--host", type=str, default="localhost", help="host name") parser.add_argument("--host", type=str, default="localhost", help="host name")
parser.add_argument("--port", type=int, default=8000, help="port number") parser.add_argument("--port", type=int, default=8000, help="port number")
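
For completeness, here is a sketch of a client call against this server. The `/v1/completions` path follows the OpenAI Completion API that the docstring says is mimicked, and the served model name is a placeholder; the vLLM-specific `top_k` field comes from the protocol changes later in this commit.

```python
# Querying the OpenAI-compatible completion endpoint with the standard
# library only. The /v1/completions path and the model name are
# assumptions; "top_k" is a vLLM extension added in this commit.
import json
import urllib.request

payload = {
    "model": "facebook/opt-125m",   # whatever model the server was started with
    "prompt": "San Francisco is a",
    "max_tokens": 32,
    "temperature": 0.8,
    "top_k": 40,                    # vLLM extension to the OpenAI schema
}
req = urllib.request.Request(
    "http://localhost:8000/v1/completions",
    data=json.dumps(payload).encode(),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(json.load(resp)["choices"][0]["text"])
```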

View File

@ -4,7 +4,7 @@ from typing import Dict, List, Literal, Optional, Union
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from cacheflow.utils import random_uuid from vllm.utils import random_uuid
class ErrorResponse(BaseModel): class ErrorResponse(BaseModel):
@ -34,7 +34,7 @@ class ModelCard(BaseModel):
id: str id: str
object: str = "model" object: str = "model"
created: int = Field(default_factory=lambda: int(time.time())) created: int = Field(default_factory=lambda: int(time.time()))
owned_by: str = "cacheflow" owned_by: str = "vllm"
root: Optional[str] = None root: Optional[str] = None
parent: Optional[str] = None parent: Optional[str] = None
permission: List[ModelPermission] = Field(default_factory=list) permission: List[ModelPermission] = Field(default_factory=list)
@ -82,7 +82,7 @@ class CompletionRequest(BaseModel):
best_of: Optional[int] = None best_of: Optional[int] = None
logit_bias: Optional[Dict[str, float]] = None logit_bias: Optional[Dict[str, float]] = None
user: Optional[str] = None user: Optional[str] = None
# Additional parameters supported by cacheflow # Additional parameters supported by vLLM
top_k: Optional[int] = -1 top_k: Optional[int] = -1
ignore_eos: Optional[bool] = False ignore_eos: Optional[bool] = False
use_beam_search: Optional[bool] = False use_beam_search: Optional[bool] = False
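
The hunk above extends the OpenAI request schema with vLLM-specific fields. A small construction sketch follows; the required `model` and `prompt` fields are assumed from the standard OpenAI Completion schema and are not shown in this hunk.

```python
# Constructing the extended request model. "model" and "prompt" are
# assumed from the OpenAI Completion schema; the extras below are the
# fields this hunk marks as vLLM additions.
from vllm.entrypoints.openai.protocol import CompletionRequest

request = CompletionRequest(
    model="facebook/opt-125m",
    prompt="Explain paged attention in one sentence.",
    top_k=40,               # vLLM extension
    ignore_eos=False,       # vLLM extension
    use_beam_search=False,  # vLLM extension
)
print(request.json())
```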

View File

@ -22,7 +22,7 @@ class NewLineFormatter(logging.Formatter):
return msg return msg
_root_logger = logging.getLogger("cacheflow") _root_logger = logging.getLogger("vllm")
_default_handler = None _default_handler = None

View File

@ -0,0 +1,10 @@
from vllm.model_executor.input_metadata import InputMetadata
from vllm.model_executor.model_loader import get_model
from vllm.model_executor.utils import set_random_seed
__all__ = [
"InputMetadata",
"get_model",
"set_random_seed",
]

View File

@ -3,8 +3,8 @@ from typing import Dict, List, Tuple
import torch import torch
from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask
from cacheflow.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from cacheflow.sequence import SequenceData from vllm.sequence import SequenceData
class InputMetadata: class InputMetadata:

View File

@ -2,7 +2,7 @@
import torch import torch
import torch.nn as nn import torch.nn as nn
from cacheflow import activation_ops from vllm import activation_ops
_ACTIVATION_REGISTRY = { _ACTIVATION_REGISTRY = {
"gelu": nn.GELU(), "gelu": nn.GELU(),

View File

@ -5,16 +5,16 @@ import torch
import torch.nn as nn import torch.nn as nn
from xformers import ops as xops from xformers import ops as xops
from cacheflow import attention_ops from vllm import attention_ops
from cacheflow import cache_ops from vllm import cache_ops
from cacheflow import pos_encoding_ops from vllm import pos_encoding_ops
from cacheflow.model_executor.input_metadata import InputMetadata from vllm.model_executor.input_metadata import InputMetadata
_SUPPORTED_HEAD_SIZES = [64, 80, 96, 128] _SUPPORTED_HEAD_SIZES = [64, 80, 96, 128]
class GPTCacheFlowAttention(nn.Module): class PagedAttention(nn.Module):
"""GPT-style multi-head attention. """GPT-style multi-head PagedAttention.
This class takes flattened 1D query, key, and value tensors as input. The This class takes flattened 1D query, key, and value tensors as input. The
input 1D tensors can be split into three parts: the prompt tokens, the input 1D tensors can be split into three parts: the prompt tokens, the
@ -164,8 +164,8 @@ class GPTCacheFlowAttention(nn.Module):
return output.view(-1, self.num_heads * self.head_size) return output.view(-1, self.num_heads * self.head_size)
class GPTNeoXCacheFlowAttention(GPTCacheFlowAttention): class PagedAttentionWithRoPE(PagedAttention):
"""Attention with GPT-NeoX style rotary embedding.""" """PagedAttention with GPT-NeoX style rotary embedding."""
def __init__( def __init__(
self, self,
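
The renamed `PagedAttention` docstring above describes flattened 1D query, key, and value tensors that pack several sequences back to back. The sketch below illustrates only that input layout with plain PyTorch; it does not call vLLM's kernels, and the sequence lengths are made up for illustration.

```python
# Illustration of the flattened token layout described in the docstring:
# tokens from several sequences concatenated along one axis, with
# per-sequence boundaries tracked as cumulative offsets. Plain PyTorch
# only; this does not invoke vLLM's attention kernels.
import torch

num_heads, head_size = 12, 64
prompt_lens = [5, 3, 7]            # made-up lengths of three prompts
total_tokens = sum(prompt_lens)

# One row per token, for all sequences back to back.
query = torch.randn(total_tokens, num_heads, head_size)

# Cumulative offsets let a kernel find where each sequence starts and ends.
cu_seqlens = [0]
for n in prompt_lens:
    cu_seqlens.append(cu_seqlens[-1] + n)

for i in range(len(prompt_lens)):
    q_i = query[cu_seqlens[i]:cu_seqlens[i + 1]]   # tokens of sequence i
    print(f"sequence {i}: {tuple(q_i.shape)}")
```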

View File

@ -2,7 +2,7 @@
import torch import torch
import torch.nn as nn import torch.nn as nn
from cacheflow import layernorm_ops from vllm import layernorm_ops
class RMSNorm(nn.Module): class RMSNorm(nn.Module):

View File

@ -5,11 +5,11 @@ import numpy as np
import torch import torch
import torch.nn as nn import torch.nn as nn
from cacheflow.model_executor.input_metadata import InputMetadata from vllm.model_executor.input_metadata import InputMetadata
from cacheflow.model_executor.parallel_utils.tensor_parallel import ( from vllm.model_executor.parallel_utils.tensor_parallel import (
gather_from_tensor_model_parallel_region) gather_from_tensor_model_parallel_region)
from cacheflow.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from cacheflow.sequence import SequenceOutputs from vllm.sequence import SequenceOutputs
class Sampler(nn.Module): class Sampler(nn.Module):

View File

@ -5,10 +5,10 @@ import torch
import torch.nn as nn import torch.nn as nn
from transformers import PretrainedConfig from transformers import PretrainedConfig
from cacheflow.config import ModelConfig from vllm.config import ModelConfig
from cacheflow.model_executor.models import ( from vllm.model_executor.models import (GPT2LMHeadModel, GPTNeoXForCausalLM,
GPT2LMHeadModel, GPTNeoXForCausalLM, LlamaForCausalLM, OPTForCausalLM) LlamaForCausalLM, OPTForCausalLM)
from cacheflow.model_executor.weight_utils import initialize_dummy_weights from vllm.model_executor.weight_utils import initialize_dummy_weights
# TODO(woosuk): Lazy-load the model classes. # TODO(woosuk): Lazy-load the model classes.
_MODEL_REGISTRY = { _MODEL_REGISTRY = {
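
The model loader above maps architecture names to the model classes it imports. Here is a sketch of that lookup pattern; keying the registry on `config.architectures` is an assumption about how `get_model` resolves a class, not something shown in this hunk.

```python
# Sketch of the architecture-to-class lookup a loader like get_model()
# performs. Keying on config.architectures is an assumption; the class
# names come from the imports in this commit.
from transformers import AutoConfig

from vllm.model_executor.models import (GPT2LMHeadModel, GPTNeoXForCausalLM,
                                        LlamaForCausalLM, OPTForCausalLM)

_MODEL_REGISTRY = {
    "GPT2LMHeadModel": GPT2LMHeadModel,
    "GPTNeoXForCausalLM": GPTNeoXForCausalLM,
    "LlamaForCausalLM": LlamaForCausalLM,
    "OPTForCausalLM": OPTForCausalLM,
}

hf_config = AutoConfig.from_pretrained("facebook/opt-125m")
arch = hf_config.architectures[0]      # e.g. "OPTForCausalLM"
model_cls = _MODEL_REGISTRY[arch]
print(arch, "->", model_cls.__name__)
```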

View File

@ -0,0 +1,12 @@
from vllm.model_executor.models.gpt_neox import GPTNeoXForCausalLM
from vllm.model_executor.models.gpt2 import GPT2LMHeadModel
from vllm.model_executor.models.llama import LlamaForCausalLM
from vllm.model_executor.models.opt import OPTForCausalLM
__all__ = [
"GPT2LMHeadModel",
"GPTNeoXForCausalLM",
"LlamaForCausalLM",
"OPTForCausalLM",
]

View File

@ -1,6 +1,6 @@
# coding=utf-8 # coding=utf-8
# Adapted from https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py # Adapted from https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py
# Copyright 2023 The CacheFlow team. # Copyright 2023 The vLLM team.
# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
# #
@ -26,17 +26,17 @@ import torch
from torch import nn from torch import nn
from transformers import GPT2Config from transformers import GPT2Config
from cacheflow.model_executor.input_metadata import InputMetadata from vllm.model_executor.input_metadata import InputMetadata
from cacheflow.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.activation import get_act_fn
from cacheflow.model_executor.layers.attention import GPTCacheFlowAttention from vllm.model_executor.layers.attention import PagedAttention
from cacheflow.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.sampler import Sampler
from cacheflow.model_executor.weight_utils import (hf_model_weights_iterator, from vllm.model_executor.weight_utils import (hf_model_weights_iterator,
load_tensor_parallel_weights) load_tensor_parallel_weights)
from cacheflow.model_executor.parallel_utils.parallel_state import ( from vllm.model_executor.parallel_utils.parallel_state import (
get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
from cacheflow.model_executor.parallel_utils.tensor_parallel import ( from vllm.model_executor.parallel_utils.tensor_parallel import (
VocabParallelEmbedding, ColumnParallelLinear, RowParallelLinear) VocabParallelEmbedding, ColumnParallelLinear, RowParallelLinear)
from cacheflow.sequence import SequenceOutputs from vllm.sequence import SequenceOutputs
KVCache = Tuple[torch.Tensor, torch.Tensor] KVCache = Tuple[torch.Tensor, torch.Tensor]
@ -53,13 +53,13 @@ class GPT2Attention(nn.Module):
self.head_dim = self.hidden_size // total_num_heads self.head_dim = self.hidden_size // total_num_heads
self.scale = self.head_dim ** -0.5 self.scale = self.head_dim ** -0.5
self.c_attn = ColumnParallelLinear(self.hidden_size, 3 * self.hidden_size, bias=True, self.c_attn = ColumnParallelLinear(self.hidden_size, 3 * self.hidden_size,
gather_output=False, bias=True, gather_output=False,
perform_initialization=False) perform_initialization=False)
self.c_proj = RowParallelLinear(self.hidden_size, self.hidden_size, bias=True, self.c_proj = RowParallelLinear(self.hidden_size, self.hidden_size,
input_is_parallel=True, bias=True, input_is_parallel=True,
perform_initialization=False) perform_initialization=False)
self.attn = GPTCacheFlowAttention(self.num_heads, self.head_dim, self.attn = PagedAttention(self.num_heads, self.head_dim,
scale=self.scale) scale=self.scale)
def forward( def forward(
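
The GPT-2 attention above fuses Q, K, and V into one column-parallel projection (3 * hidden_size outputs) followed by a row-parallel output projection. The single-GPU sketch below shows the same shape arithmetic with plain `nn.Linear` stand-ins, so there is no tensor parallelism involved.

```python
# Single-GPU illustration of the fused QKV pattern above: one c_attn
# producing 3 * hidden_size features split into Q, K, V, and an output
# projection back to hidden_size. nn.Linear stands in for the
# Column/RowParallelLinear layers.
import torch
import torch.nn as nn

hidden_size, num_heads = 768, 12
head_dim = hidden_size // num_heads

c_attn = nn.Linear(hidden_size, 3 * hidden_size, bias=True)
c_proj = nn.Linear(hidden_size, hidden_size, bias=True)

x = torch.randn(10, hidden_size)          # 10 flattened tokens
qkv = c_attn(x)                           # (10, 3 * hidden_size)
q, k, v = qkv.chunk(3, dim=-1)            # each (10, hidden_size)
q_heads = q.view(-1, num_heads, head_dim) # per-head view fed to attention
print(q_heads.shape, k.shape, v.shape)

attn_output = torch.randn_like(q)         # stand-in for the attention result
y = c_proj(attn_output)                   # back to (10, hidden_size)
print(y.shape)
```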

View File

@ -1,6 +1,6 @@
# coding=utf-8 # coding=utf-8
# Adapted from https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt_neox/modeling_gpt_neox.py # Adapted from https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt_neox/modeling_gpt_neox.py
# Copyright 2023 The CacheFlow team. # Copyright 2023 The vLLM team.
# Copyright 2022 EleutherAI The HuggingFace Inc. team. All rights reserved. # Copyright 2022 EleutherAI The HuggingFace Inc. team. All rights reserved.
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
@ -25,17 +25,17 @@ import torch
from torch import nn from torch import nn
from transformers import GPTNeoXConfig from transformers import GPTNeoXConfig
from cacheflow.model_executor.input_metadata import InputMetadata from vllm.model_executor.input_metadata import InputMetadata
from cacheflow.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.activation import get_act_fn
from cacheflow.model_executor.layers.attention import GPTNeoXCacheFlowAttention from vllm.model_executor.layers.attention import PagedAttentionWithRoPE
from cacheflow.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.sampler import Sampler
from cacheflow.model_executor.weight_utils import (hf_model_weights_iterator, from vllm.model_executor.weight_utils import (hf_model_weights_iterator,
load_tensor_parallel_weights) load_tensor_parallel_weights)
from cacheflow.model_executor.parallel_utils.parallel_state import ( from vllm.model_executor.parallel_utils.parallel_state import (
get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
from cacheflow.model_executor.parallel_utils.tensor_parallel import ( from vllm.model_executor.parallel_utils.tensor_parallel import (
VocabParallelEmbedding, ColumnParallelLinear, RowParallelLinear) VocabParallelEmbedding, ColumnParallelLinear, RowParallelLinear)
from cacheflow.sequence import SequenceOutputs from vllm.sequence import SequenceOutputs
KVCache = Tuple[torch.Tensor, torch.Tensor] KVCache = Tuple[torch.Tensor, torch.Tensor]
@ -63,7 +63,7 @@ class GPTNeoXAttention(nn.Module):
scaling = self.head_size ** -0.5 scaling = self.head_size ** -0.5
rotary_dim = int(self.head_size * config.rotary_pct) rotary_dim = int(self.head_size * config.rotary_pct)
assert rotary_dim % 2 == 0 assert rotary_dim % 2 == 0
self.attn = GPTNeoXCacheFlowAttention(self.num_heads, self.head_size, self.attn = PagedAttentionWithRoPE(self.num_heads, self.head_size,
scaling, rotary_dim) scaling, rotary_dim)
def forward( def forward(
@ -149,6 +149,7 @@ class GPTNeoXLayer(nn.Module):
class GPTNeoXModel(nn.Module): class GPTNeoXModel(nn.Module):
def __init__(self, config: GPTNeoXConfig): def __init__(self, config: GPTNeoXConfig):
super().__init__() super().__init__()
self.config = config self.config = config
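
The GPT-NeoX attention above applies rotary embeddings to only a fraction of each head, controlled by `rotary_pct`. A short worked example of that arithmetic follows; the concrete sizes are typical GPT-NeoX values chosen for illustration, not taken from this diff.

```python
# Worked example of the rotary_dim computation above, with illustrative
# GPT-NeoX-style numbers (not taken from this diff).
hidden_size, num_heads, rotary_pct = 6144, 64, 0.25

head_size = hidden_size // num_heads        # 96
rotary_dim = int(head_size * rotary_pct)    # 24: only the first 24 dims get RoPE
assert rotary_dim % 2 == 0                  # dims are rotated in pairs
print(head_size, rotary_dim)
```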

View File

@ -1,6 +1,6 @@
# coding=utf-8 # coding=utf-8
# Adapted from https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Adapted from https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
# Copyright 2023 The CacheFlow team. # Copyright 2023 The vLLM team.
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
# #
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
@ -30,19 +30,19 @@ import torch
from torch import nn from torch import nn
from transformers import LlamaConfig from transformers import LlamaConfig
from cacheflow.sequence import SequenceOutputs from vllm.sequence import SequenceOutputs
from cacheflow.model_executor.input_metadata import InputMetadata from vllm.model_executor.input_metadata import InputMetadata
from cacheflow.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.activation import SiluAndMul
from cacheflow.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
from cacheflow.model_executor.layers.attention import GPTNeoXCacheFlowAttention from vllm.model_executor.layers.attention import PagedAttentionWithRoPE
from cacheflow.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.sampler import Sampler
from cacheflow.model_executor.weight_utils import (hf_model_weights_iterator, from vllm.model_executor.weight_utils import (hf_model_weights_iterator,
load_tensor_parallel_weights) load_tensor_parallel_weights)
from cacheflow.model_executor.parallel_utils.parallel_state import ( from vllm.model_executor.parallel_utils.parallel_state import (
get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
from cacheflow.model_executor.parallel_utils.tensor_parallel import ( from vllm.model_executor.parallel_utils.tensor_parallel import (
VocabParallelEmbedding, ColumnParallelLinear, RowParallelLinear) VocabParallelEmbedding, ColumnParallelLinear, RowParallelLinear)
from cacheflow.sequence import SequenceOutputs from vllm.sequence import SequenceOutputs
KVCache = Tuple[torch.Tensor, torch.Tensor] KVCache = Tuple[torch.Tensor, torch.Tensor]
@ -104,7 +104,7 @@ class LlamaAttention(nn.Module):
input_is_parallel=True, input_is_parallel=True,
perform_initialization=False, perform_initialization=False,
) )
self.attn = GPTNeoXCacheFlowAttention(self.num_heads, self.head_dim, self.attn = PagedAttentionWithRoPE(self.num_heads, self.head_dim,
self.scaling, rotary_dim=self.head_dim) self.scaling, rotary_dim=self.head_dim)
def forward( def forward(

View File

@ -1,6 +1,6 @@
# coding=utf-8 # coding=utf-8
# Adapted from https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/opt/modeling_opt.py # Adapted from https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/opt/modeling_opt.py
# Copyright 2023 The CacheFlow team. # Copyright 2023 The vLLM team.
# Copyright 2022 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved. # Copyright 2022 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
@ -25,17 +25,17 @@ import torch
from torch import nn from torch import nn
from transformers import OPTConfig from transformers import OPTConfig
from cacheflow.model_executor.input_metadata import InputMetadata from vllm.model_executor.input_metadata import InputMetadata
from cacheflow.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.activation import get_act_fn
from cacheflow.model_executor.layers.attention import GPTCacheFlowAttention from vllm.model_executor.layers.attention import PagedAttention
from cacheflow.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.sampler import Sampler
from cacheflow.model_executor.weight_utils import (hf_model_weights_iterator, from vllm.model_executor.weight_utils import (hf_model_weights_iterator,
load_tensor_parallel_weights) load_tensor_parallel_weights)
from cacheflow.model_executor.parallel_utils.parallel_state import ( from vllm.model_executor.parallel_utils.parallel_state import (
get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
from cacheflow.model_executor.parallel_utils.tensor_parallel import ( from vllm.model_executor.parallel_utils.tensor_parallel import (
VocabParallelEmbedding, ColumnParallelLinear, RowParallelLinear) VocabParallelEmbedding, ColumnParallelLinear, RowParallelLinear)
from cacheflow.sequence import SequenceOutputs from vllm.sequence import SequenceOutputs
KVCache = Tuple[torch.Tensor, torch.Tensor] KVCache = Tuple[torch.Tensor, torch.Tensor]
@ -75,7 +75,7 @@ class OPTAttention(nn.Module):
self.out_proj = RowParallelLinear(embed_dim, embed_dim, bias=bias, self.out_proj = RowParallelLinear(embed_dim, embed_dim, bias=bias,
input_is_parallel=True, input_is_parallel=True,
perform_initialization=False) perform_initialization=False)
self.attn = GPTCacheFlowAttention(self.num_heads, self.head_dim, self.attn = PagedAttention(self.num_heads, self.head_dim,
scale=self.scaling) scale=self.scaling)
def forward( def forward(

View File

@ -1,5 +1,5 @@
import cacheflow.model_executor.parallel_utils.parallel_state import vllm.model_executor.parallel_utils.parallel_state
import cacheflow.model_executor.parallel_utils.tensor_parallel import vllm.model_executor.parallel_utils.tensor_parallel
# Alias parallel_state as mpu, its legacy name # Alias parallel_state as mpu, its legacy name
mpu = parallel_state mpu = parallel_state

View File

@ -1,4 +1,4 @@
# Copyright 2023 The CacheFlow team. # Copyright 2023 The vLLM team.
# Adapted from https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/parallel_state.py # Adapted from https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/parallel_state.py
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.

View File

@ -1,4 +1,4 @@
# Copyright 2023 The CacheFlow team. # Copyright 2023 The vLLM team.
# Adapted from https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/layers.py # Adapted from https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/layers.py
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
@ -11,7 +11,7 @@ import torch.nn.functional as F
import torch.nn.init as init import torch.nn.init as init
from torch.nn.parameter import Parameter from torch.nn.parameter import Parameter
from cacheflow.model_executor.parallel_utils.parallel_state import ( from vllm.model_executor.parallel_utils.parallel_state import (
get_tensor_model_parallel_rank, get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size, get_tensor_model_parallel_world_size,
get_all_reduce_launcher, get_all_reduce_launcher,

View File

@ -1,10 +1,10 @@
# Copyright 2023 The CacheFlow team. # Copyright 2023 The vLLM team.
# Adapted from https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/mappings.py # Adapted from https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/mappings.py
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
import torch import torch
from cacheflow.model_executor.parallel_utils.parallel_state import ( from vllm.model_executor.parallel_utils.parallel_state import (
get_tensor_model_parallel_rank, get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size, get_tensor_model_parallel_world_size,
get_tensor_model_parallel_group, get_tensor_model_parallel_group,

View File

@ -1,4 +1,4 @@
# Copyright 2023 The CacheFlow team. # Copyright 2023 The vLLM team.
# Adapted from https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/random.py # Adapted from https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/random.py
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
@ -11,7 +11,7 @@ import torch
from torch import _C from torch import _C
from torch.cuda import _lazy_call, device as device_ctx_manager from torch.cuda import _lazy_call, device as device_ctx_manager
from cacheflow.model_executor.parallel_utils.parallel_state import ( from vllm.model_executor.parallel_utils.parallel_state import (
get_tensor_model_parallel_rank, get_tensor_model_parallel_rank,
) )

View File

@ -1,4 +1,4 @@
# Copyright 2023 The CacheFlow team. # Copyright 2023 The vLLM team.
# Adapted from https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/utils.py # Adapted from https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/utils.py
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.

View File

@ -4,8 +4,8 @@ import random
import numpy as np import numpy as np
import torch import torch
from cacheflow.model_executor.parallel_utils.parallel_state import model_parallel_is_initialized from vllm.model_executor.parallel_utils.parallel_state import model_parallel_is_initialized
from cacheflow.model_executor.parallel_utils.tensor_parallel import model_parallel_cuda_manual_seed from vllm.model_executor.parallel_utils.tensor_parallel import model_parallel_cuda_manual_seed
def set_random_seed(seed: int) -> None: def set_random_seed(seed: int) -> None:
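
Only the imports and the function name appear in this hunk; the sketch below is an assumed body for such a seeding helper, covering the Python, NumPy, and PyTorch RNGs and, when model parallelism is initialized, the tensor-parallel CUDA RNG as well.

```python
# Sketch of a seeding helper consistent with the imports above. The body
# is an assumption; only the imports and the name come from this diff.
import random

import numpy as np
import torch

from vllm.model_executor.parallel_utils.parallel_state import model_parallel_is_initialized
from vllm.model_executor.parallel_utils.tensor_parallel import model_parallel_cuda_manual_seed


def set_random_seed(seed: int) -> None:
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    # Keep the tensor-parallel CUDA RNG streams in sync across ranks.
    if model_parallel_is_initialized():
        model_parallel_cuda_manual_seed(seed)


set_random_seed(0)
```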

View File

@ -1,6 +1,6 @@
from typing import Dict, List, Optional from typing import Dict, List, Optional
from cacheflow.sequence import SequenceGroup, SequenceStatus from vllm.sequence import SequenceGroup, SequenceStatus
class CompletionOutput: class CompletionOutput:

View File

@ -2,8 +2,8 @@ import copy
import enum import enum
from typing import Dict, List, Optional, Union from typing import Dict, List, Optional, Union
from cacheflow.block import LogicalTokenBlock from vllm.block import LogicalTokenBlock
from cacheflow.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
class SequenceStatus(enum.Enum): class SequenceStatus(enum.Enum):
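
Sequences are stored in fixed-size logical token blocks (the `LogicalTokenBlock` import above). The sketch below shows the block-count arithmetic for a given block size; the block size of 16 is an illustrative value, not something this diff specifies.

```python
# How many fixed-size logical blocks a sequence occupies. The block size
# of 16 is illustrative, not taken from this diff.
BLOCK_SIZE = 16


def num_logical_blocks(num_tokens: int, block_size: int = BLOCK_SIZE) -> int:
    # Ceiling division: a partially filled trailing block still needs a slot.
    return (num_tokens + block_size - 1) // block_size


for n in (1, 16, 17, 1000):
    print(n, "tokens ->", num_logical_blocks(n), "blocks")
```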

View File

@ -3,8 +3,8 @@ from typing import Dict, List, Tuple
import torch import torch
from cacheflow import cache_ops from vllm import cache_ops
from cacheflow.config import CacheConfig, ModelConfig, ParallelConfig from vllm.config import CacheConfig, ModelConfig, ParallelConfig
KVCache = Tuple[torch.Tensor, torch.Tensor] KVCache = Tuple[torch.Tensor, torch.Tensor]

View File

@ -3,16 +3,15 @@ from typing import Dict, List, Tuple
import torch import torch
from cacheflow.config import (CacheConfig, ModelConfig, ParallelConfig, from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
SchedulerConfig) SchedulerConfig)
from cacheflow.model_executor import get_model, InputMetadata, set_random_seed from vllm.model_executor import get_model, InputMetadata, set_random_seed
from cacheflow.model_executor.parallel_utils.parallel_state import ( from vllm.model_executor.parallel_utils.parallel_state import (
initialize_model_parallel, initialize_all_reduce_launcher) initialize_model_parallel, initialize_all_reduce_launcher)
from cacheflow.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from cacheflow.sequence import (SequenceData, SequenceGroupMetadata, from vllm.sequence import SequenceData, SequenceGroupMetadata, SequenceOutputs
SequenceOutputs) from vllm.worker.cache_engine import CacheEngine
from cacheflow.worker.cache_engine import CacheEngine from vllm.utils import get_gpu_memory
from cacheflow.utils import get_gpu_memory
class Worker: class Worker:
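
The worker imports `get_gpu_memory` and `CacheEngine`, which together determine how many KV-cache blocks fit on a GPU: memory left after the model's weights and activations, scaled by a utilization fraction, divided by the bytes one block occupies. The sketch below shows only that back-of-the-envelope arithmetic; every concrete number and the helper function are illustrative assumptions, not taken from this diff.

```python
# Back-of-the-envelope KV-cache sizing in the spirit of the worker above.
# All concrete numbers and the helper below are illustrative assumptions.
def cache_block_bytes(block_size: int, num_layers: int, num_heads: int,
                      head_size: int, dtype_bytes: int = 2) -> int:
    # Key and value caches each hold block_size tokens per layer.
    return 2 * num_layers * num_heads * head_size * block_size * dtype_bytes


total_gpu_mem = 24 * 1024**3        # e.g. a 24 GiB GPU
peak_model_usage = 15 * 1024**3     # e.g. measured by a profiling forward pass
gpu_memory_utilization = 0.90

free_for_cache = total_gpu_mem * gpu_memory_utilization - peak_model_usage
num_gpu_blocks = int(free_for_cache // cache_block_bytes(
    block_size=16, num_layers=32, num_heads=32, head_size=128))
print("num_gpu_blocks =", num_gpu_blocks)
```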