AWS EC2 VLLM

Bases: BaseServerObject

Serve an Expert Model using VLLM on an AWS EC2 machine. Expert models operate independently and can be hosted on separate or shared machines. They may include closed-source models like GPT-4. Refer to app/configs/demo_orch_ec2_mix.json for configuration details.

Source code in app/servers/vllm_ec2_server.py
class Ec2VllmExpert(BaseServerObject):
    """
        Serve an Expert Model using VLLM on an AWS EC2 machine.
        Expert models operate independently and can be hosted on separate or shared machines.
        They may include closed-source models like GPT-4. Refer to app/configs/demo_orch_ec2_mix.json for configuration details.
    """
    def __init__(self, **kwargs):
        """Store the configuration kwargs and initialize the AWS engine."""
        self.config = kwargs
        self.aws_engine = AwsEngine(self.config)
        self.instance_name = self.config.get('instance_name')
        self.model_id = self.config.get('model_id')
        self.base_url = None
        self.config.update(dict(logs=[]))

    def start_server(self):
        """
        Starts a Dedicated EC2 Instance
        The 'instance_name' serves as a unique identifier. If an instance tagged with 'instance_name' is already present in a given region, the operation is aborted. The unique identifier can be edited in app/server_manager.py: get_server.
        """
        instance_state = \
            create_or_revive_expert_instance(
                self.aws_engine,
                instance_name=self.instance_name,
                model_id=self.model_id,
                expert_instance_type=self.config['instance_type'],
                KeyName=self.config['KeyName'],
                ami_id=self.config['ami_id'],
                wait_for_expert_to_start=False,
                key_path=self.config['KeyPath']
            )
        self.config.update(instance_state)
        self.config['logs'].append(f'starting expert server {self.model_id}')

    def _set_base_url(self):
        self.ip_address = self.config.get('ip_address')
        self.port = self.config.get('port', 8000)
        self.transfer_protocol = self.config.get('transfer_protocol', 'http')
        self.base_url = \
            f"{self.transfer_protocol}://{self.ip_address}:{self.port}/v1"
        self.docs_url = \
            f"{self.transfer_protocol}://{self.ip_address}:{self.port}/docs"

    def start_inference_endpoint(self, max_wait_time=120):
        """Starts a new tumx session with name 'tmux_session_name' and activates the environment 'pytorch'.        
        NOTE: we provide an aws ami that has the required conda env and VLLM installed and ready to used. For changing the env refer app/utils/ssh_utils.py : start_vllm_server

        Args:
            max_wait_time (int, optional): Defaults to 120.
        """
        instance_meta = \
            run_vllm_server(
                    self.aws_engine,
                    self.instance_name,
                    self.model_id,
                    max_wait_time=max_wait_time,
                    wait_for_expert_to_start=False,
                    key_path=self.config['KeyPath']
                )
        self.config.update(instance_meta)
        self._set_base_url()
        self.config['logs'].append(f'starting expert inference endpoint {self.model_id}')

    def stop_server(self):
        """Stops the Ec2 server. 
        """
        response = stop_ec2_instance(
            self.aws_engine, 
            self.config['InstanceId']
        )
        return response

    def check_servers_state(self):
        ssh = get_ssh_session(self.config['ip_address'], key_path=self.config['KeyPath'])
        status = check_vllm_server_running_status(
                ssh, wait=False, verbose=False, docs_url=self.docs_url)
        return status

    def get_response(self, message, stream=False):
        """Generate Text using the expert LLM.

        Args:
            message (str): Input Query
            stream (bool, optional): Get a response stream. Defaults to False.

        Returns:
            response (str): Output Text Generation
        """
        messages = [{"role": "user", "content": message}]
        if stream:
            return self._generate_stream(messages)
        else:
            return self._generate(messages)

    def _generate(self, messages, max_new_tokens=1000):
        openai.api_base = self.base_url
        try:
            response = openai.ChatCompletion.create(
                model=self.model_id,
                messages=messages,
                request_timeout=60
            )
            answer = response['choices'][0]['message']['content']
            return answer
        except Exception as e:
            print(f"An error occurred: {e}")
            return "Sorry, I couldn't process your request. Too many requests for me to handle!"

    def _generate_stream(self, messages, max_new_tokens=1000):
        openai.api_base = self.base_url
        try:
            response = openai.ChatCompletion.create(
                model=self.model_id,
                messages=messages,
                request_timeout=60,
                stream=True
            )
            for chunk in response:
                content = chunk['choices'][0]['delta'].get("content", "")
                yield content
        except Exception as e:
            print(f"An error occurred: {e}")
            yield "Sorry, I couldn't process your request. Too many requests for me to handle!"

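A minimal end-to-end usage sketch. The config keys below mirror the ones this class reads (instance_name, model_id, instance_type, KeyName, KeyPath, ami_id); the concrete values are placeholders and must be adapted to your own AWS account, as in app/configs/demo_orch_ec2_mix.json.

# Hypothetical configuration; keys follow what Ec2VllmExpert reads from kwargs,
# values are placeholders.
config = {
    "instance_name": "demo-expert-1",
    "model_id": "mistralai/Mistral-7B-Instruct-v0.1",
    "instance_type": "g5.xlarge",
    "KeyName": "my-keypair",
    "KeyPath": "/home/user/.ssh/my-keypair.pem",
    "ami_id": "ami-0123456789abcdef0",
}

expert = Ec2VllmExpert(**config)
expert.start_server()                  # create or revive the tagged EC2 instance
expert.start_inference_endpoint()      # launch vLLM in a tmux session on the instance
print(expert.get_response("What does this expert model do best?"))
expert.stop_server()                   # stop the instance to avoid idle charges
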
__init__(**kwargs)

Source code in app/servers/vllm_ec2_server.py
def __init__(self, **kwargs):
    """Store the configuration kwargs and initialize the AWS engine."""
    self.config = kwargs
    self.aws_engine = AwsEngine(self.config)
    self.instance_name = self.config.get('instance_name')
    self.model_id = self.config.get('model_id')
    self.base_url = None
    self.config.update(dict(logs=[]))

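Note that __init__ only stores the kwargs and constructs the AwsEngine; the EC2 instance itself is created later by start_server(). A small sketch of the resulting state, assuming 'config' is a dict like the one shown above:

expert = Ec2VllmExpert(**config)
assert expert.base_url is None     # set only after start_inference_endpoint()
print(expert.config["logs"])       # [] -- lifecycle events are appended here
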
get_response(message, stream=False)

Generate Text using the expert LLM.

Parameters:

    message (str): Input Query. Required.
    stream (bool, optional): Get a response stream. Defaults to False.

Returns:

    response (str): Output Text Generation.

Source code in app/servers/vllm_ec2_server.py
def get_response(self, message, stream=False):
    """Generate Text using the expert LLM.

    Args:
        message (str): Input Query
        stream (bool, optional): Get a response stream. Defaults to False.

    Returns:
        response (str): Output Text Generation
    """
    messages = [{"role": "user", "content": message}]
    if stream:
        return self._generate_stream(messages)
    else:
        return self._generate(messages)

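A short sketch of both call styles, assuming the inference endpoint started by start_inference_endpoint() is already reachable and 'expert' is an Ec2VllmExpert instance:

# Blocking call: returns the full completion as a string.
answer = expert.get_response("Summarize the attention mechanism in two sentences.")
print(answer)

# Streaming call: returns a generator that yields text chunks as they arrive.
for chunk in expert.get_response("Explain KV caching.", stream=True):
    print(chunk, end="", flush=True)
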
start_inference_endpoint(max_wait_time=120)

Starts a new tmux session with name 'tmux_session_name' and activates the environment 'pytorch'.
NOTE: we provide an AWS AMI that has the required conda env and VLLM installed and ready to use. To change the env, refer to app/utils/ssh_utils.py: start_vllm_server.

Parameters:

    max_wait_time (int, optional): Defaults to 120.

Source code in app/servers/vllm_ec2_server.py
def start_inference_endpoint(self, max_wait_time=120):
    """Starts a new tumx session with name 'tmux_session_name' and activates the environment 'pytorch'.        
    NOTE: we provide an aws ami that has the required conda env and VLLM installed and ready to used. For changing the env refer app/utils/ssh_utils.py : start_vllm_server

    Args:
        max_wait_time (int, optional): Defaults to 120.
    """
    instance_meta = \
        run_vllm_server(
                self.aws_engine,
                self.instance_name,
                self.model_id,
                max_wait_time=max_wait_time,
                wait_for_expert_to_start=False,
                key_path=self.config['KeyPath']
            )
    self.config.update(instance_meta)
    self._set_base_url()
    self.config['logs'].append(f'starting expert inference endpoint {self.model_id}')

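Because the endpoint is launched with wait_for_expert_to_start=False, the call can return before vLLM is ready to serve. A rough polling sketch using check_servers_state(); the loop and sleep interval are illustrative, and it assumes the returned status is truthy once the server is up:

import time

expert.start_inference_endpoint(max_wait_time=180)
print("Docs URL:", expert.docs_url)    # http://<ip>:8000/docs once the server is up

for _ in range(30):                    # illustrative polling loop
    if expert.check_servers_state():
        break
    time.sleep(10)
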
start_server()

Starts a Dedicated EC2 Instance. The 'instance_name' serves as a unique identifier. If an instance tagged with 'instance_name' is already present in a given region, the operation is aborted. The unique identifier can be edited in app/server_manager.py: get_server.

Source code in app/servers/vllm_ec2_server.py
def start_server(self):
    """
    Starts a Dedicated EC2 Instance
    The 'instance_name' serves as a unique identifier. If an instance tagged with 'instance_name' is already present in a given region, the operation is aborted. The unique identifier can be edited in app/server_manager.py: get_server.
    """
    instance_state = \
        create_or_revive_expert_instance(
            self.aws_engine,
            instance_name=self.instance_name,
            model_id=self.model_id,
            expert_instance_type=self.config['instance_type'],
            KeyName=self.config['KeyName'],
            ami_id=self.config['ami_id'],
            wait_for_expert_to_start=False,
            key_path=self.config['KeyPath']
        )
    self.config.update(instance_state)
    self.config['logs'].append(f'starting expert server {self.model_id}')

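A small sketch of starting the instance and inspecting the state merged into config. Exactly which keys appear depends on what create_or_revive_expert_instance returns; InstanceId and ip_address are the ones used elsewhere in this class (stop_server, check_servers_state):

expert.start_server()

print(expert.config.get("InstanceId"))   # needed later by stop_server()
print(expert.config.get("ip_address"))   # needed by check_servers_state()
print(expert.config["logs"][-1])         # 'starting expert server <model_id>'
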
stop_server()

Stops the EC2 server.

Source code in app/servers/vllm_ec2_server.py
def stop_server(self):
    """Stops the Ec2 server. 
    """
    response = stop_ec2_instance(
        self.aws_engine, 
        self.config['InstanceId']
    )
    return response
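
Since a dedicated GPU instance keeps billing while it runs, a reasonable pattern is to stop it in a finally block so it is released even when inference fails; a usage sketch, not part of the class:

try:
    expert.start_server()
    expert.start_inference_endpoint()
    print(expert.get_response("Hello!"))
finally:
    # Always stop the EC2 instance, even if an earlier step raised.
    response = expert.stop_server()
    print(response)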