背景
Ollama启动后即可提供模型调用的服务;其命令行操作方式与Docker类似。
观察发现,用不了多久Ollama的模型就会消失。
其实这是因为Ollama会自动卸载不活跃的模型以释放资源;由于模型加载极快,需要调用时又能迅速重新载入。
代码
from flask import Flask, request, Response
from openai import OpenAI
app = Flask(__name__)
client = OpenAI(
    base_url="http://172.**.**.35:11434/v1/",
    api_key="ollama"
)
def event_stream(question):
    response = client.chat.completions.create(
        messages=[
            {"role": "system", "content": "你是一个人工智能助手"},
            {"role": "user", "content": question}
        ],
        model="qwen2.5:latest",
        stream=True
    )
    # 输出数据到前端
    for chunk in response:
        if chunk.choices[0].finish_reason is not None:
            data = '[Done]'
        else:
            data = chunk.choices[0].delta.content
        yield 'data: %s\n\n' % data.replace("\n", "<br/>")
@app.route("/")
def index():
    return "Welcome to WhatsRAG!"
@app.route("/whatsrag/chat/stream", methods=["GET"])
def chat_stream():
    question = request.args.get('question')
    print(question)
    return Response(event_stream(question), content_type='text/event-stream')
app.run(debug=True, host="0.0.0.0", port="12345")
				

学习如何调用大模型。