diff --git a/README.md b/README.md index 84c969dab9..f9af6fbf9b 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ SynapseML (previously known as MMLSpark), is an open-source library that simplif With SynapseML, you can build scalable and intelligent systems to solve challenges in domains such as anomaly detection, computer vision, deep learning, text analytics, and others. SynapseML can train and evaluate models on single-node, multi-node, and elastically resizable clusters of computers. This lets you scale your work without wasting resources. SynapseML is usable across Python, R, Scala, Java, and .NET. Furthermore, its API abstracts over a wide variety of databases, file systems, and cloud data stores to simplify experiments no matter where data is located. -SynapseML requires Scala 2.12, Spark 3.4+, and Python 3.8+. +SynapseML requires Scala 2.12, Spark 3.5+, and Python 3.8+. | Topics | Links | | :------ | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | @@ -117,48 +117,32 @@ In Microsoft Fabric notebooks SynapseML is already installed. To change the vers ### Synapse Analytics -In Azure Synapse notebooks please place the following in the first cell of your notebook. +In Azure Synapse notebooks please place the following in the first cell of your notebook. -- For Spark 3.5 Pools: +- For Spark 4.0 Pools: ```bash %%configure -f { "name": "synapseml", "conf": { - "spark.jars.packages": "com.microsoft.azure:synapseml_2.12:1.1.0", + "spark.jars.packages": "com.microsoft.azure:synapseml_2.13:1.1.0-spark4.0", "spark.jars.repositories": "https://mmlspark.azureedge.net/maven", - "spark.jars.excludes": "org.scala-lang:scala-reflect,org.apache.spark:spark-tags_2.12,org.scalactic:scalactic_2.12,org.scalatest:scalatest_2.12,com.fasterxml.jackson.core:jackson-databind", + "spark.jars.excludes": "org.scala-lang:scala-reflect,org.apache.spark:spark-tags_2.13,org.scalactic:scalactic_2.13,org.scalatest:scalatest_2.13,com.fasterxml.jackson.core:jackson-databind", "spark.yarn.user.classpath.first": "true", "spark.sql.parquet.enableVectorizedReader": "false" } } ``` -- For Spark 3.4 Pools: - -```bash -%%configure -f -{ - "name": "synapseml", - "conf": { - "spark.jars.packages": "com.microsoft.azure:synapseml_2.12:1.0.15", - "spark.jars.repositories": "https://mmlspark.azureedge.net/maven", - "spark.jars.excludes": "org.scala-lang:scala-reflect,org.apache.spark:spark-tags_2.12,org.scalactic:scalactic_2.12,org.scalatest:scalatest_2.12,com.fasterxml.jackson.core:jackson-databind", - "spark.yarn.user.classpath.first": "true", - "spark.sql.parquet.enableVectorizedReader": "false" - } -} -``` - -- For Spark 3.3 Pools: +- For Spark 3.5 Pools: ```bash %%configure -f { "name": "synapseml", "conf": { - "spark.jars.packages": "com.microsoft.azure:synapseml_2.12:0.11.4-spark3.3", + "spark.jars.packages": "com.microsoft.azure:synapseml_2.12:1.1.0", "spark.jars.repositories": "https://mmlspark.azureedge.net/maven", "spark.jars.excludes": "org.scala-lang:scala-reflect,org.apache.spark:spark-tags_2.12,org.scalactic:scalactic_2.12,org.scalatest:scalatest_2.12,com.fasterxml.jackson.core:jackson-databind", "spark.yarn.user.classpath.first": "true", @@ -167,8 +151,6 @@ In Azure Synapse notebooks please place the following in the first cell of your } ``` - - To install at the pool level instead of the notebook level [add the spark properties listed above to the pool configuration](https://techcommunity.microsoft.com/t5/azure-synapse-analytics-blog/how-to-set-spark-pyspark-custom-configs-in-synapse-workspace/ba-p/2114434). ### Databricks diff --git a/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAI.scala b/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAI.scala index c50509ed5a..eb6e80a5bc 100644 --- a/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAI.scala +++ b/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAI.scala @@ -16,31 +16,6 @@ import spray.json.DefaultJsonProtocol._ import scala.language.existentials -trait HasPromptInputs extends HasServiceParams { - val prompt: ServiceParam[String] = new ServiceParam[String]( - this, "prompt", "The text to complete", isRequired = false) - - def getPrompt: String = getScalarParam(prompt) - - def setPrompt(v: String): this.type = setScalarParam(prompt, v) - - def getPromptCol: String = getVectorParam(prompt) - - def setPromptCol(v: String): this.type = setVectorParam(prompt, v) - - val batchPrompt: ServiceParam[Seq[String]] = new ServiceParam[Seq[String]]( - this, "batchPrompt", "Sequence of prompts to complete", isRequired = false) - - def getBatchPrompt: Seq[String] = getScalarParam(batchPrompt) - - def setBatchPrompt(v: Seq[String]): this.type = setScalarParam(batchPrompt, v) - - def getBatchPromptCol: String = getVectorParam(batchPrompt) - - def setBatchPromptCol(v: String): this.type = setVectorParam(batchPrompt, v) - -} - trait HasMessagesInput extends Params { val messagesCol: Param[String] = new Param[String]( this, "messagesCol", "The column messages to generate chat completions for," + diff --git a/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAICompletion.scala b/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAICompletion.scala deleted file mode 100644 index eacd5527d7..0000000000 --- a/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAICompletion.scala +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright (C) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. See LICENSE in project root for information. - -package com.microsoft.azure.synapse.ml.services.openai - -import com.microsoft.azure.synapse.ml.logging.{FeatureNames, SynapseMLLogging} -import com.microsoft.azure.synapse.ml.param.AnyJsonFormat.anyFormat -import com.microsoft.azure.synapse.ml.services.{HasCognitiveServiceInput, HasInternalJsonOutputParser} -import org.apache.http.entity.{AbstractHttpEntity, ContentType, StringEntity} -import org.apache.spark.ml.ComplexParamsReadable -import org.apache.spark.ml.util._ -import org.apache.spark.sql.{functions => F, Row} -import org.apache.spark.sql.types._ -import spray.json.DefaultJsonProtocol._ -import spray.json._ - -import scala.language.existentials - -object OpenAICompletion extends ComplexParamsReadable[OpenAICompletion] - -class OpenAICompletion(override val uid: String) extends OpenAIServicesBase(uid) - with HasOpenAITextParams with HasPromptInputs with HasCognitiveServiceInput - with HasInternalJsonOutputParser with SynapseMLLogging with HasTextOutput { - logClass(FeatureNames.AiServices.OpenAI) - - def this() = this(Identifiable.randomUID("OpenAICompletion")) - - def urlPath: String = "" - - override private[ml] def internalServiceType: String = "openai" - - setDefault(apiVersion -> Left("2024-02-01")) - - override def setCustomServiceName(v: String): this.type = { - setUrl(s"https://$v.openai.azure.com/" + urlPath.stripPrefix("/")) - } - - override protected def prepareUrlRoot: Row => String = { row => - s"${getUrl}openai/deployments/${getValue(row, deploymentName)}/completions" - } - - override protected[openai] def prepareEntity: Row => Option[AbstractHttpEntity] = { - r => - lazy val optionalParams: Map[String, Any] = getOptionalParams(r) - getValueOpt(r, prompt) - .map(prompt => getStringEntity(prompt, optionalParams)) - .orElse(getValueOpt(r, batchPrompt) - .map(batchPrompt => getStringEntity(batchPrompt, optionalParams))) - .orElse(throw new IllegalArgumentException( - "Please set one of prompt, batchPrompt, indexPrompt or batchIndexPrompt.")) - } - - override val subscriptionKeyHeaderName: String = "api-key" - - override def shouldSkip(row: Row): Boolean = - super.shouldSkip(row) || - (emptyParamData(row, prompt) && emptyParamData(row, batchPrompt)) - - override def responseDataType: DataType = CompletionResponse.schema - - private[this] def getStringEntity[A](prompt: A, optionalParams: Map[String, Any]): StringEntity = { - val fullPayload = optionalParams.updated("prompt", prompt) - new StringEntity(fullPayload.toJson.compactPrint, ContentType.APPLICATION_JSON) - } - - override private[openai] def getOutputMessageText(outputColName: String): org.apache.spark.sql.Column = { - F.element_at(F.col(outputColName).getField("choices"), 1).getField("text") - } - -} diff --git a/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAIPrompt.scala b/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAIPrompt.scala index fc8fa15ff3..32c632c74a 100644 --- a/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAIPrompt.scala +++ b/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAIPrompt.scala @@ -20,7 +20,6 @@ import org.apache.spark.ml.param.{BooleanParam, Param, ParamMap, ParamValidators import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.{Column, DataFrame, Dataset, Row, functions => F, types => T} -import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.expressions.UserDefinedFunction import org.apache.spark.sql.functions.{col, typedLit, udf} import org.apache.spark.sql.types.{DataType, StructField, StructType} @@ -108,7 +107,7 @@ class OpenAIPrompt(override val uid: String) extends Transformer set(postProcessingOptions, v.asScala.toMap) val dropPrompt = new BooleanParam( - this, "dropPrompt", "whether to drop the column of prompts after templating (when using legacy models)") + this, "dropPrompt", "whether to drop the column of prompts after templating") def getDropPrompt: Boolean = $(dropPrompt) @@ -229,32 +228,25 @@ class OpenAIPrompt(override val uid: String) extends Transformer createMessagesUDF: UserDefinedFunction, attachmentsColumn: Column ): (DataFrame, String, OpenAIServicesBase with HasTextOutput) = { - service match { - case c: OpenAICompletion => - if (isSet(responseFormat)) { - throw new IllegalArgumentException("responseFormat is not supported for Completion.") - } - import com.microsoft.azure.synapse.ml.core.schema.DatasetExtensions._ - val promptColName = df.withDerivativeCol("prompt") - (df.withColumn(promptColName, promptCol), promptColName, c.setPromptCol(promptColName)) - - case c: HasMessagesInput => - if (isSet(responseFormat)) { - // Pass through responseFormat without forcing a single shape here. - // Each service validates according to its API (chat_completions vs responses). - c match { - case cc: OpenAIChatCompletion => cc.setResponseFormat(getResponseFormat) - case resp: OpenAIResponses => resp.setResponseFormat(getResponseFormat) - } - } - val messageColName = getMessagesCol + if (isSet(responseFormat)) { + service match { + case cc: OpenAIChatCompletion => cc.setResponseFormat(getResponseFormat) + case resp: OpenAIResponses => resp.setResponseFormat(getResponseFormat) + case _ => // AIFoundryChatCompletion does not currently expose responseFormat. + } + } - ( - df.withColumn(messageColName, createMessagesUDF(promptCol, attachmentsColumn)), - messageColName, - c.setMessagesCol(messageColName) - ) + val messageColName = getMessagesCol + val configuredService = service match { + case c: HasMessagesInput => c.setMessagesCol(messageColName) + case other => other } + + ( + df.withColumn(messageColName, createMessagesUDF(promptCol, attachmentsColumn)), + messageColName, + configuredService + ) } private def usageMappingFor(service: OpenAIServicesBase with HasTextOutput) @@ -491,21 +483,12 @@ class OpenAIPrompt(override val uid: String) extends Transformer private[openai] def hasAIFoundryModel: Boolean = this.isDefined(model) - //deployment name can be set by user, it doesn't have to match with model name - private val legacyModels = Set("ada", "babbage", "curie", "davinci", - "text-ada-001", "text-babbage-001", "text-curie-001", "text-davinci-002", - "text-davinci-003", "code-cushman-001", "code-davinci-002") - private def getOpenAIChatService: OpenAIServicesBase with HasTextOutput = { val completion: OpenAIServicesBase with HasTextOutput = if (hasAIFoundryModel) { new AIFoundryChatCompletion() - } - else if (legacyModels.contains(getDeploymentName)) { - new OpenAICompletion() - } - else { + } else { // Use the apiType parameter to decide between chat_completions and responses getApiType match { case "responses" => new OpenAIResponses() @@ -530,8 +513,6 @@ class OpenAIPrompt(override val uid: String) extends Transformer chatCompletion.prepareEntity(r) case chatCompletion: OpenAIChatCompletion => chatCompletion.prepareEntity(r) - case completion: OpenAICompletion => - completion.prepareEntity(r) } } @@ -556,8 +537,6 @@ class OpenAIPrompt(override val uid: String) extends Transformer chatCompletion.transformSchema(schema.add(getMessagesCol, StructType(Seq()))) case chatCompletion: OpenAIChatCompletion => chatCompletion.transformSchema(schema.add(getMessagesCol, StructType(Seq()))) - case completion: OpenAICompletion => - completion.transformSchema(schema) } val outputDataType: DataType = { diff --git a/cognitive/src/test/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAICompletionSuite.scala b/cognitive/src/test/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAICompletionSuite.scala deleted file mode 100644 index 809fce9247..0000000000 --- a/cognitive/src/test/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAICompletionSuite.scala +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright (C) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. See LICENSE in project root for information. - -package com.microsoft.azure.synapse.ml.services.openai - -import com.microsoft.azure.synapse.ml.Secrets -import com.microsoft.azure.synapse.ml.Secrets.getAccessToken -import com.microsoft.azure.synapse.ml.core.test.base.Flaky -import com.microsoft.azure.synapse.ml.core.test.fuzzing.{TestObject, TransformerFuzzing} -import org.apache.spark.ml.util.MLReadable -import org.apache.spark.sql.{DataFrame, Row} -import org.scalactic.Equality - -class OpenAICompletionSuite extends TransformerFuzzing[OpenAICompletion] with OpenAIAPIKey with Flaky { - - import spark.implicits._ - - override def beforeAll(): Unit = { - val aadToken = getAccessToken("https://cognitiveservices.azure.com/") - println(s"Triggering token creation early ${aadToken.length}") - super.beforeAll() - } - - def newCompletion: OpenAICompletion = new OpenAICompletion() - .setDeploymentName(deploymentName) - .setCustomServiceName(openAIServiceName) - .setMaxTokens(200) - .setOutputCol("out") - .setSubscriptionKey(openAIAPIKey) - - lazy val promptCompletion: OpenAICompletion = newCompletion.setPromptCol("prompt") - lazy val batchPromptCompletion: OpenAICompletion = newCompletion.setBatchPromptCol("batchPrompt") - - lazy val df: DataFrame = Seq( - "Once upon a time", - "Best programming language award goes to", - "SynapseML is " - ).toDF("prompt") - - lazy val promptDF: DataFrame = Seq( - "Once upon a time", - "Best programming language award goes to", - "SynapseML is " - ).toDF("prompt") - - lazy val batchPromptDF: DataFrame = Seq( - Seq( - "This is a test", - "Now is the time", - "Knock, knock") - ).toDF("batchPrompt") - - ignore("Basic Usage") { - testCompletion(promptCompletion, promptDF) - } - - ignore("Basic usage with AAD auth") { - val aadToken = getAccessToken("https://cognitiveservices.azure.com/") - - val completion = new OpenAICompletion() - .setAADToken(aadToken) - .setDeploymentName(deploymentName) - .setCustomServiceName(openAIServiceName) - .setPromptCol("prompt") - .setOutputCol("out") - - testCompletion(completion, promptDF) - } - - ignore("Batch Prompt") { - testCompletion(batchPromptCompletion, batchPromptDF) - } - - def testCompletion(completion: OpenAICompletion, df: DataFrame, requiredLength: Int = 10): Unit = { - val fromRow = CompletionResponse.makeFromRowConverter - completion.transform(df).collect().foreach(r => - fromRow(r.getAs[Row]("out")).choices.foreach(c => - assert(c.text.length > requiredLength))) - } - - - override def assertDFEq(df1: DataFrame, df2: DataFrame)(implicit eq: Equality[DataFrame]): Unit = { - super.assertDFEq(df1.drop("out"), df2.drop("out"))(eq) - } - - override def testObjects(): Seq[TestObject[OpenAICompletion]] = - Seq(new TestObject(newCompletion, df)) - - override def reader: MLReadable[_] = OpenAICompletion - -} diff --git a/docs/Explore Algorithms/OpenAI/OpenAI.ipynb b/docs/Explore Algorithms/OpenAI/OpenAI.ipynb index 03b3646f94..1693fd7e7a 100644 --- a/docs/Explore Algorithms/OpenAI/OpenAI.ipynb +++ b/docs/Explore Algorithms/OpenAI/OpenAI.ipynb @@ -19,7 +19,7 @@ "\n", "The key prerequisites for this quickstart include a working Azure OpenAI resource, and an Apache Spark cluster with SynapseML installed. We suggest creating a Synapse workspace, but an Azure Databricks, HDInsight, or Spark on Kubernetes, or even a python environment with the `pyspark` package will work. \n", "\n", - "1. An Azure OpenAI resource – request access [here](https://customervoice.microsoft.com/Pages/ResponsePage.aspx?id=v4j5cvGGr0GRqy180BHbR7en2Ais5pxKtso_Pz4b1_xUOFA5Qk1UWDRBMjg0WFhPMkIzTzhKQ1dWNyQlQCN0PWcu) before [creating a resource](https://docs.microsoft.com/en-us/azure/cognitive-services/openai/how-to/create-resource?pivots=web-portal#create-a-resource)\n", + "1. An Azure OpenAI resource \u2013 request access [here](https://customervoice.microsoft.com/Pages/ResponsePage.aspx?id=v4j5cvGGr0GRqy180BHbR7en2Ais5pxKtso_Pz4b1_xUOFA5Qk1UWDRBMjg0WFhPMkIzTzhKQ1dWNyQlQCN0PWcu) before [creating a resource](https://docs.microsoft.com/en-us/azure/cognitive-services/openai/how-to/create-resource?pivots=web-portal#create-a-resource)\n", "1. [Create a Synapse workspace](https://docs.microsoft.com/en-us/azure/synapse-analytics/get-started-create-workspace)\n", "1. [Create a serverless Apache Spark pool](https://docs.microsoft.com/en-us/azure/synapse-analytics/get-started-analyze-spark#create-a-serverless-apache-spark-pool)\n" ] @@ -257,236 +257,6 @@ ")" ] }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## (Legacy) Create the OpenAICompletion Apache Spark Client\n", - "\n", - "To apply the OpenAI Completion service to your dataframe you created, create an OpenAICompletion object, which serves as a distributed client. Parameters of the service can be set either with a single value, or by a column of the dataframe with the appropriate setters on the `OpenAICompletion` object. Here we're setting `maxTokens` to 200. A token is around four characters, and this limit applies to the sum of the prompt and the result. We're also setting the `promptCol` parameter with the name of the prompt column in the dataframe." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from synapse.ml.services.openai import OpenAICompletion\n", - "\n", - "completion = (\n", - " OpenAICompletion()\n", - " .setSubscriptionKey(key)\n", - " .setDeploymentName(deployment_name)\n", - " .setCustomServiceName(service_name)\n", - " .setMaxTokens(200)\n", - " .setPromptCol(\"prompt\")\n", - " .setErrorCol(\"error\")\n", - " .setOutputCol(\"completions\")\n", - ")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## (Legacy) Transform the dataframe with the OpenAICompletion Client\n", - "\n", - "After creating the dataframe and the completion client, you can transform your input dataset and add a column called `completions` with all of the information the service adds. Select just the text for simplicity." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from pyspark.sql.functions import col\n", - "\n", - "completed_df = completion.transform(df).cache()\n", - "display(\n", - " completed_df.select(\n", - " col(\"prompt\"),\n", - " col(\"error\"),\n", - " col(\"completions.choices.text\").getItem(0).alias(\"text\"),\n", - " ).show(truncate=False)\n", - ")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Your output should look something like this. The completion text will be different from the sample.\n", - "\n", - "| **prompt** \t| **error** \t| **text** \t|\n", - "|:----------------------------:\t|:----------:\t|:-------------------------------------------------------------------------------------------------------------------------------------:\t|\n", - "| Hello my name is \t| null \t| Makaveli I'm eighteen years old and I want to be a rapper when I grow up I love writing and making music I'm from Los Angeles, CA \t|\n", - "| The best code is code thats \t| null \t| understandable This is a subjective statement, and there is no definitive answer. \t|\n", - "| SynapseML is \t| null \t| A machine learning algorithm that is able to learn how to predict the future outcome of events. \t|" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Improve throughput with request batching for OpenAICompletion\n", - "\n", - "The example makes several requests to the service, one for each prompt. To complete multiple prompts in a single request, use batch mode. First, in the OpenAICompletion object, instead of setting the Prompt column to \"Prompt\", specify \"batchPrompt\" for the BatchPrompt column.\n", - "To do so, create a dataframe with a list of prompts per row.\n", - "\n", - "As of this writing there's currently a limit of 20 prompts in a single request, and a hard limit of 2048 \"tokens\", or approximately 1500 words." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "batch_df = spark.createDataFrame(\n", - " [\n", - " ([\"The time has come\", \"Pleased to\", \"Today stocks\", \"Here's to\"],),\n", - " ([\"The only thing\", \"Ask not what\", \"Every litter\", \"I am\"],),\n", - " ]\n", - ").toDF(\"batchPrompt\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Next we create the OpenAICompletion object. Rather than setting the prompt column, set the batchPrompt column if your column is of type `Array[String]`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "batch_completion = (\n", - " OpenAICompletion()\n", - " .setSubscriptionKey(key)\n", - " .setDeploymentName(deployment_name)\n", - " .setCustomServiceName(service_name)\n", - " .setMaxTokens(200)\n", - " .setBatchPromptCol(\"batchPrompt\")\n", - " .setErrorCol(\"error\")\n", - " .setOutputCol(\"completions\")\n", - ")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In the call to transform, a request will be made per row. Since there are multiple prompts in a single row, each request is sent with all prompts in that row. The results contain a row for each row in the request." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "completed_batch_df = batch_completion.transform(batch_df).cache()\n", - "display(completed_batch_df.show(truncate=False))" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Using an automatic minibatcher\n", - "\n", - "If your data is in column format, you can transpose it to row format using SynapseML's `FixedMiniBatcherTransformer`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from pyspark.sql.types import StringType\n", - "from synapse.ml.stages import FixedMiniBatchTransformer\n", - "from synapse.ml.core.spark import FluentAPI\n", - "\n", - "completed_autobatch_df = (\n", - " df.coalesce(\n", - " 1\n", - " ) # Force a single partition so that our little 4-row dataframe makes a batch of size 4, you can remove this step for large datasets\n", - " .mlTransform(FixedMiniBatchTransformer(batchSize=4))\n", - " .withColumnRenamed(\"prompt\", \"batchPrompt\")\n", - " .mlTransform(batch_completion)\n", - ")\n", - "\n", - "display(completed_autobatch_df.show(truncate=False))" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Prompt engineering for translation\n", - "\n", - "The Azure OpenAI service can solve many different natural language tasks through [prompt engineering](https://docs.microsoft.com/en-us/azure/cognitive-services/openai/how-to/completions). Here, we show an example of prompting for language translation:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "translate_df = spark.createDataFrame(\n", - " [\n", - " (\"Japanese: Ookina hako English: Big box Japanese: Midori takoEnglish:\",),\n", - " (\n", - " \"French: Quel heure et il au Montreal? English: What time is it in Montreal? French: Ou est le poulet? English:\",\n", - " ),\n", - " ]\n", - ").toDF(\"prompt\")\n", - "\n", - "display(completion.transform(translate_df).show(truncate=False))" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Prompt for question answering\n", - "\n", - "Here, we prompt GPT-3 for general-knowledge question answering:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "qa_df = spark.createDataFrame(\n", - " [\n", - " (\n", - " \"Q: Where is the Grand Canyon?A: The Grand Canyon is in Arizona.Q: What is the weight of the Burj Khalifa in kilograms?A:\",\n", - " )\n", - " ]\n", - ").toDF(\"prompt\")\n", - "\n", - "display(completion.transform(qa_df).show(truncate=False))" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -508,7 +278,7 @@ "| Type | How to set | Requires nested schema | Notes |\n", "|-------------|------------------------|------------------------|-------|\n", "| text | String(\"text\") | No | Raw string output |\n", - "| json_object | String(\"json_object\") | No | Model attempts well‑formed JSON (not strictly validated) |\n", + "| json_object | String(\"json_object\") | No | Model attempts well\u2011formed JSON (not strictly validated) |\n", "| json_schema | Dict/Map only | Yes | Strict; reject bare string or JSON string form |\n", "\n", "Below we request a single field `answer` as structured JSON." @@ -920,4 +690,4 @@ }, "nbformat": 4, "nbformat_minor": 0 -} +} \ No newline at end of file diff --git a/environment.yml b/environment.yml index 47aef99806..c068c563b9 100644 --- a/environment.yml +++ b/environment.yml @@ -46,7 +46,7 @@ dependencies: - transformers==4.49.0 - huggingface-hub==0.26.0 - langchain==0.0.152 - - openai==0.27.5 + - openai==1.99.5 - black==22.3.0 - black[jupyter]==22.3.0 - mistletoe