  File "/opt/conda/lib/python3.9/site-packages/text_generation_server/server.py", line 124, in serve_inner
    model = get_model(model_id, revision, sharded, quantize, trust_remote_code)
  File "/opt/conda/lib/python3.9/site-packages/text_generation_server/models/__init__.py", line 208, in get_model
    raise NotImplementedError("sharded is not supported for this model")
NotImplementedError: sharded is not supported for this model rank=0
2023-07-12T14:57:37.591548Z ERROR text_generation_launcher: Shard 1 failed to start:
Traceback (most recent call last):
  File "/opt/conda/bin/text-generation-server", line 8, in <module>
    sys.exit(app())
  File "/opt/conda/lib/python3.9/site-packages/text_generation_server/cli.py", line 67, in serve
    server.serve(model_id, revision, sharded, quantize, trust_remote_code, uds_path)
  File "/opt/conda/lib/python3.9/site-packages/text_generation_server/server.py", line 155, in serve
    asyncio.run(serve_inner(model_id, revision, sharded, quantize, trust_remote_code))
  File "/opt/conda/lib/python3.9/asyncio/runners.py", line 44, in run
    return loop.run_until_complete(main)
  File "/opt/conda/lib/python3.9/asyncio/base_events.py", line 647, in run_until_complete
    return future.result()
  File "/opt/conda/lib/python3.9/site-packages/text_generation_server/server.py", line 124, in serve_inner
    model = get_model(model_id, revision, sharded, quantize, trust_remote_code)
  File "/opt/conda/lib/python3.9/site-packages/text_generation_server/models/__init__.py", line 208, in get_model
    raise NotImplementedError("sharded is not supported for this model")
NotImplementedError: sharded is not supported for this model
2023-07-12T14:57:37.591613Z  INFO text_generation_launcher: Shutting down shards
2023-07-12T14:57:37.656495Z  INFO text_generation_launcher: Shard 0 terminated
Error: ShardCannotStart
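
Every shard dies with the same NotImplementedError in get_model: the server was started with sharded=True, but the model implementation in this container refuses to run sharded, so the launcher shuts the remaining shards down and exits with ShardCannotStart. TGI starts one shard per requested GPU, and in the SageMaker TGI container the SM_NUM_GPUS environment variable sets that shard count, so any value above 1 implies sharded (tensor-parallel) mode. The CDK stack that produced the failing endpoint is shown below; note SM_NUM_GPUS: '4' on an ml.g5.2xlarge, an instance type that has a single A10G GPU.
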
import * as cdk from 'aws-cdk-lib';
import * as ec2 from 'aws-cdk-lib/aws-ec2';
import * as lambda from 'aws-cdk-lib/aws-lambda';
import { Construct } from 'constructs';
import { ChatBotBackendStack } from './chatbot-backend/chatbot-backend-stack';
import {
  LargeLanguageModel,
  ModelKind,
  ContainerImages,
} from './large-language-model';

export interface ChatBotStackProps extends cdk.StackProps {
  vpc: ec2.Vpc;
  semanticSearchApi: lambda.Function | null;
  maxParallelLLMQueries: number;
}

export class ChatBotStack extends cdk.Stack {
  constructor(scope: Construct, id: string, props: ChatBotStackProps) {
    super(scope, id, {
      description: 'AWS LLM CHATBOT (uksb-1tupboc16)',
      ...props,
    });

    const { vpc, semanticSearchApi, maxParallelLLMQueries } = props;
    const largeLanguageModels = this.createLLMs({ vpc });

    new ChatBotBackendStack(this, 'ChatBotBackendStack', {
      vpc,
      semanticSearchApi,
      largeLanguageModels,
      maxParallelLLMQueries,
    });
  }

  createLLMs({ vpc }: { vpc: ec2.Vpc }) {
    const falcon7bInstruct = new LargeLanguageModel(
      this,
      'tiiuae-falcon7b-instruct',
      {
        vpc,
        region: this.region,
        model: {
          kind: ModelKind.Container,
          modelId: 'tiiuae/falcon-7b-instruct',
          container: ContainerImages.HF_PYTORCH_LLM_TGI_INFERENCE_LATEST,
          instanceType: 'ml.g5.2xlarge',
          env: {
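            // SM_NUM_GPUS is passed through to TGI as the shard count (NUM_SHARD);
            // any value greater than 1 makes the launcher try to shard the model,
            // and ml.g5.2xlarge only has a single A10G GPU.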
            SM_NUM_GPUS: '4',
          },
        },
      }
    );
    // ...

    return [falcon7bInstruct];
  }
}
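
A minimal sketch of a corrected definition follows, assuming the only change needed is to stop requesting sharding: SM_NUM_GPUS is set to '1' to match the single GPU on ml.g5.2xlarge, and everything else stays as in the stack above.

// Sketch only: identical to the definition above except for SM_NUM_GPUS.
// With a single shard the launcher never takes the sharded code path,
// so falcon-7b-instruct starts without the NotImplementedError.
const falcon7bInstruct = new LargeLanguageModel(
  this,
  'tiiuae-falcon7b-instruct',
  {
    vpc,
    region: this.region,
    model: {
      kind: ModelKind.Container,
      modelId: 'tiiuae/falcon-7b-instruct',
      container: ContainerImages.HF_PYTORCH_LLM_TGI_INFERENCE_LATEST,
      instanceType: 'ml.g5.2xlarge',
      env: {
        SM_NUM_GPUS: '1', // one shard, no tensor parallelism
      },
    },
  }
);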