diff --git a/cognitive/src/main/python/synapse/ml/cognitive/bing.py b/cognitive/src/main/python/synapse/ml/cognitive/bing.py index b7f4d0bdd0f..9a368f6f2f8 100644 --- a/cognitive/src/main/python/synapse/ml/cognitive/bing.py +++ b/cognitive/src/main/python/synapse/ml/cognitive/bing.py @@ -1,8 +1,19 @@ import warnings -from synapse.ml.services.bing import * -# Raise a deprecation warning for the entire submodule +# Keep module importable for legacy callers, but signal removal at runtime. warnings.warn( - "Importing from 'synapse.ml.cognitive.bing' is deprecated. Use 'synapse.ml.services.bing' instead.", + "The Bing cognitive services have been removed from SynapseML; the " + "'synapse.ml.cognitive.bing' compatibility module will be deleted in a future release.", DeprecationWarning, + stacklevel=2, ) + +__all__ = [] + + +def __getattr__(name: str): + """Inform callers that the legacy Bing APIs are no longer available.""" + + raise ImportError( + "Attribute '{0}' is unavailable because the Bing services were removed from SynapseML.".format(name) + ) diff --git a/cognitive/src/main/python/synapse/ml/services/bing/BingImageSearch.py b/cognitive/src/main/python/synapse/ml/services/bing/BingImageSearch.py deleted file mode 100644 index 735051f0dde..00000000000 --- a/cognitive/src/main/python/synapse/ml/services/bing/BingImageSearch.py +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright (C) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. See LICENSE in project root for information. - -import sys - -if sys.version >= "3": - basestring = str - -from synapse.ml.services.bing._BingImageSearch import _BingImageSearch -from synapse.ml.stages import Lambda -from pyspark.ml.common import inherit_doc -from pyspark.sql import SparkSession - - -@inherit_doc -class BingImageSearch(_BingImageSearch): - def setQuery(self, value): - self._java_obj = self._java_obj.setQuery(value) - return self - - def setQueryCol(self, value): - self._java_obj = self._java_obj.setQueryCol(value) - return self - - def setMarket(self, value): - self._java_obj = self._java_obj.setMarket(value) - return self - - def setMarketCol(self, value): - self._java_obj = self._java_obj.setMarketCol(value) - return self - - @staticmethod - def getUrlTransformer(imageCol, urlCol): - bis = ( - SparkSession.builder.getOrCreate()._jvm.com.microsoft.azure.synapse.ml.services.bing.BingImageSearch - ) - return Lambda._from_java(bis.getUrlTransformer(imageCol, urlCol)) - - @staticmethod - def downloadFromUrls(pathCol, bytesCol, concurrency, timeout): - bis = ( - SparkSession.builder.getOrCreate()._jvm.com.microsoft.azure.synapse.ml.services.bing.BingImageSearch - ) - return Lambda._from_java( - bis.downloadFromUrls(pathCol, bytesCol, concurrency, timeout), - ) diff --git a/cognitive/src/main/python/synapse/ml/services/bing/__init__.py b/cognitive/src/main/python/synapse/ml/services/bing/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/bing/BingImageSearch.scala b/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/bing/BingImageSearch.scala deleted file mode 100644 index 7d77704f5b1..00000000000 --- a/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/bing/BingImageSearch.scala +++ /dev/null @@ -1,309 +0,0 @@ -// Copyright (C) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. See LICENSE in project root for information. - -package com.microsoft.azure.synapse.ml.services.bing - -import com.microsoft.azure.synapse.ml.services._ -import com.microsoft.azure.synapse.ml.core.utils.AsyncUtils -import com.microsoft.azure.synapse.ml.logging.{FeatureNames, SynapseMLLogging} -import com.microsoft.azure.synapse.ml.param.ServiceParam -import com.microsoft.azure.synapse.ml.stages.Lambda -import org.apache.commons.io.IOUtils -import org.apache.http.client.methods.{HttpGet, HttpRequestBase} -import org.apache.http.entity.AbstractHttpEntity -import org.apache.spark.injections.UDFUtils -import org.apache.spark.ml.ComplexParamsReadable -import org.apache.spark.ml.util._ -import org.apache.spark.sql.Row -import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder -import org.apache.spark.sql.functions.{col, explode} -import org.apache.spark.sql.types._ -import spray.json.DefaultJsonProtocol._ - -import java.net.URL -import scala.concurrent.duration.Duration -import scala.concurrent.{ExecutionContext, Future} - -object BingImageSearch extends ComplexParamsReadable[BingImageSearch] with Serializable { - - def getUrlTransformer(imageCol: String, urlCol: String): Lambda = { - val fromRow = BingImagesResponse.makeFromRowConverter - Lambda(_ - .withColumn(urlCol, explode( - UDFUtils.oldUdf({ rOpt: Row => - Option(rOpt).map(r => fromRow(r).value.map(_.contentUrl)) - }, ArrayType(StringType))(col(imageCol)))) - .select(urlCol) - ) - } - - def downloadFromUrls(pathCol: String, - bytesCol: String, - concurrency: Int, - timeout: Int - ): Lambda = { - Lambda({ df => - val outputSchema = df.schema.add(bytesCol, BinaryType, nullable = true) - val encoder = ExpressionEncoder(outputSchema) - df.toDF().mapPartitions { rows => - val futures = rows.map { row: Row => - (Future { - IOUtils.toByteArray(new URL(row.getAs[String](pathCol))) - }(ExecutionContext.global), row) - } - AsyncUtils.bufferedAwaitSafeWithContext( - futures, concurrency, Duration.fromNanos(timeout * 1e6.toLong))(ExecutionContext.global) - .map { - case (bytesOpt, row) => - val bytes: Array[Byte] = bytesOpt.getOrElse(null) //scalastyle:ignore null - Row.fromSeq(row.toSeq :+ bytes) - } - }(encoder) - }) - - } -} - -class BingImageSearch(override val uid: String) - extends CognitiveServicesBase(uid) - with HasCognitiveServiceInput with HasInternalJsonOutputParser with SynapseMLLogging with HasSetLinkedService { - logClass(FeatureNames.AiServices.BingImage) - - override protected lazy val pyInternalWrapper = true - - def this() = this(Identifiable.randomUID("BingImageSearch")) - - def urlPath: String = "/v7.0/images/search" - - setDefault(url -> "https://api.bing.microsoft.com/v7.0/images/search") - - override def prepareMethod(): HttpRequestBase = new HttpGet() - - override def responseDataType: DataType = BingImagesResponse.schema - - val q = new ServiceParam[String](this, "q", - "The user's search query string", - isRequired = true, isURLParam = true) - def setQuery(v: String): this.type = setScalarParam(q, v) - def setQueryCol(v: String): this.type = setVectorParam(q, v) - def setQ(v: String): this.type = setScalarParam(q, v) - def setQCol(v: String): this.type = setVectorParam(q, v) - - val count = new ServiceParam[Int](this, "count", - "The number of image results to return in the response." + - " The actual number delivered may be less than requested.", - isURLParam = true) - def setCount(v: Int): this.type = setScalarParam(count, v) - def setCountCol(v: String): this.type = setVectorParam(count, v) - - val offset = new ServiceParam[Int](this, "offset", - "The zero-based offset that indicates the" + - " number of image results to skip before returning results", - isURLParam = true) - def setOffsetCol(v: String): this.type = setVectorParam(offset, v) - def setOffset(v: Int): this.type = setScalarParam(offset, v) - - val mkt = new ServiceParam[String](this, "mkt", - "The market where the results come from." + - " Typically, this is the country where the user " + - "is making the request from; however, it could be a different" + - " country if the user is not located in a country where Bing " + - "delivers results. The market must be in the form -." + - " For example, en-US. Full list of supported markets: " + - "es-AR,en-AU,de-AT,nl-BE,fr-BE,pt-BR,en-CA," + - "fr-CA,es-CL,da-DK,fi-FI,fr-FR,de-DE,zh-HK," + - "en-IN,en-ID,en-IE,it-IT,ja-JP,ko-KR,en-MY," + - "es-MX,nl-NL,en-NZ,no-NO,zh-CN,pl-PL,pt-PT," + - "en-PH,ru-RU,ar-SA,en-ZA,es-ES,sv-SE,fr-CH," + - "de-CH,zh-TW,tr-TR,en-GB,en-US,es-US", - isURLParam = true) - def setMarket(v: String): this.type = setScalarParam(mkt, v) - def setMarketCol(v: String): this.type = setVectorParam(mkt, v) - def setMkt(v: String): this.type = setScalarParam(mkt, v) - def setMktCol(v: String): this.type = setVectorParam(mkt, v) - - val imageType = new ServiceParam[String](this, "imageType", - "Filter images by the following image types:" + - "AnimatedGif: return animated gif images" + - "AnimatedGifHttps: return animated gif images that are from an https address" + - "Clipart: Return only clip art images" + - "Line: Return only line drawings" + - "Photo: Return only photographs " + - "(excluding line drawings, animated Gifs, and clip art)" + - "Shopping: Return only images that contain items where" + - " Bing knows of a merchant that is selling the items. " + - "This option is valid in the en-US market only. " + - "Transparent: Return only images with a transparent background.", - isURLParam = true) - def setImageType(v: String): this.type = setScalarParam(imageType, v) - def setImageTypeCol(v: String): this.type = setVectorParam(imageType, v) - - val aspect = new ServiceParam[String](this, "aspect", - "Filter images by the following aspect ratios: " + - "Square: Return images with standard aspect ratio" + - "Wide: Return images with wide screen aspect ratio" + - "Tall: Return images with tall aspect ratio" + - "All: Do not filter by aspect. Specifying this value " + - "is the same as not specifying the aspect parameter.", - isURLParam = true) - def setAspect(v: String): this.type = setScalarParam(aspect, v) - def setAspectCol(v: String): this.type = setVectorParam(aspect, v) - - val color = new ServiceParam[String](this, "color", - "Filter images by the following color options:" + - "ColorOnly: Return color images" + - "Monochrome: Return black and white images" + - "Return images with one of the following dominant colors:" + - "Black,Blue,Brown,Gray,Green,Orange,Pink,Purple,Red,Teal,White,Yellow", - isURLParam = true) - def setColor(v: String): this.type = setScalarParam(color, v) - def setColorCol(v: String): this.type = setVectorParam(color, v) - - val freshness = new ServiceParam[String](this, "freshness", - "Filter images by the following discovery options:" + - "Day: Return images discovered by Bing within the last 24 hours" + - "Week: Return images discovered by Bing within the last 7 days" + - "Month: Return images discovered by Bing within the last 30 days" + - "Year: Return images discovered within the last year" + - "2017-06-15..2018-06-15: Return images discovered within" + - " the specified range of dates", - isURLParam = true) - def setFreshness(v: String): this.type = setScalarParam(freshness, v) - def setFreshnessCol(v: String): this.type = setVectorParam(freshness, v) - - val height = new ServiceParam[Int](this, "height", - "Filter images that have the specified height, in pixels." + - "You may use this filter with the size filter to return small" + - " images that have a height of 150 pixels.", - isURLParam = true) - def setHeight(v: Int): this.type = setScalarParam(height, v) - def setHeightCol(v: String): this.type = setVectorParam(height, v) - - val width = new ServiceParam[Int](this, "width", - "Filter images that have the specified width, in pixels." + - "You may use this filter with the size filter to return small" + - " images that have a width of 150 pixels.", - isURLParam = true) - def setWidth(v: Int): this.type = setScalarParam(width, v) - def setWidthCol(v: String): this.type = setVectorParam(width, v) - - val size = new ServiceParam[String](this, "size", - "Filter images by the following sizes:" + - "Small: Return images that are less than 200x200 pixels" + - "Medium: Return images that are greater than or equal to 200x200 " + - "pixels but less than 500x500 pixels" + - "Large: Return images that are 500x500 pixels or larger" + - "Wallpaper: Return wallpaper images." + - "AllDo not filter by size. Specifying this value" + - " is the same as not specifying the size parameter." + - "You may use this parameter along with the height or width parameters. " + - "For example, you may use height and size to request " + - "small images that are 150 pixels tall.", - isURLParam = true) - def setSize(v: String): this.type = setScalarParam(size, v) - def setSizeCol(v: String): this.type = setVectorParam(size, v) - - val imageContent = new ServiceParam[String](this, "imageContent", - "Filter images by the following content types:" + - "Face: Return images that show only a person's face" + - "Portrait: Return images that show only a person's head and shoulders", - isURLParam = true) - def setImageContent(v: String): this.type = setScalarParam(imageContent, v) - def setImageContentCol(v: String): this.type = setVectorParam(imageContent, v) - - val license = new ServiceParam[String](this, "license", - "Filter images by the following license types:" + - "Any: Return images that are under any license type. " + - "The response doesn't include images that do not specify a " + - "license or the license is unknown." + - "Public: Return images where the creator has waived their " + - "exclusive rights, to the fullest extent allowed by law." + - "Share: Return images that may be shared with others. " + - "Changing or editing the image might not be allowed." + - " Also, modifying, sharing, and using the image for commercial " + - "purposes might not be allowed. Typically, this " + - "option returns the most images." + - "ShareCommercially: Return images that may be shared " + - "with others for personal or commercial purposes. " + - "Changing or editing the image might not be allowed." + - "Modify: Return images that may be modified, shared, and used." + - " Changing or editing the image might not be allowed." + - " Modifying, sharing, and using the image for commercial" + - " purposes might not be allowed. " + - "ModifyCommercially: Return images that may be modified, shared," + - " and used for personal or commercial purposes." + - " Typically, this option returns the fewest images." + - "All: Do not filter by license type. Specifying this value " + - "is the same as not specifying the license parameter. " + - "For more information about these license types, " + - "see Filter Images By License Type.", - isURLParam = true) - def setLicense(v: String): this.type = setScalarParam(license, v) - def setLicenseCol(v: String): this.type = setVectorParam(license, v) - - val maxFileSize = new ServiceParam[Int](this, "maxFileSize", - "Filter images that are less than or equal to the specified file size." + - "The maximum file size that you may specify is 520,192 bytes. " + - "If you specify a larger value, the API uses 520,192. " + - "It is possible that the response may include images that are slightly " + - "larger than the specified maximum." + - "You may specify this filter and minFileSize to filter images " + - "within a range of file sizes.", - isURLParam = true) - def setMaxFileSize(v: Int): this.type = setScalarParam(maxFileSize, v) - def setMaxFileSizeCol(v: String): this.type = setVectorParam(maxFileSize, v) - - val maxHeight = new ServiceParam[Int](this, "maxHeight", - "Filter images that have a height that is less than" + - " or equal to the specified height. Specify the height in pixels." + - "You may specify this filter and minHeight to filter images " + - "within a range of heights. This filter and the " + - "height filter are mutually exclusive.", - isURLParam = true) - def setMaxHeight(v: Int): this.type = setScalarParam(maxHeight, v) - def setMaxHeightCol(v: String): this.type = setVectorParam(maxHeight, v) - - val maxWidth = new ServiceParam[Int](this, "maxWidth", - "Filter images that have a width that is less than or equal " + - "to the specified width. Specify the width in pixels." + - "You may specify this filter and maxWidth to filter images " + - "within a range of widths. This filter and the width " + - "filter are mutually exclusive.", - isURLParam = true) - def setMaxWidth(v: Int): this.type = setScalarParam(maxWidth, v) - def setMaxWidthCol(v: String): this.type = setVectorParam(maxWidth, v) - - val minFileSize = new ServiceParam[Int](this, "minFileSize", - "Filter images that are greater than or equal to the specified file size. " + - "The maximum file size that you may specify is 520,192 bytes." + - " If you specify a larger value, the API uses 520,192. " + - "It is possible that the response may include images that " + - "are slightly smaller than the specified minimum. " + - "You may specify this filter and maxFileSize to filter images " + - "within a range of file sizes.", - isURLParam = true) - def setMinFileSize(v: Int): this.type = setScalarParam(minFileSize, v) - def setMinFileSizeCol(v: String): this.type = setVectorParam(minFileSize, v) - - val minHeight = new ServiceParam[Int](this, "minHeight", - "Filter images that have a height that is greater than or equal" + - " to the specified height. Specify the height in pixels." + - "You may specify this filter and maxHeight to filter images " + - "within a range of heights. This filter and the height " + - "filter are mutually exclusive.", - isURLParam = true) - def setMinHeight(v: Int): this.type = setScalarParam(minHeight, v) - def setMinHeightCol(v: String): this.type = setVectorParam(minHeight, v) - - val minWidth = new ServiceParam[Int](this, "minWidth", - "Filter images that have a width that is greater than or equal" + - " to the specified width. Specify the width in pixels. " + - "You may specify this filter and maxWidth to filter images " + - "within a range of widths. This filter and the width " + - "filter are mutually exclusive.", - isURLParam = true) - def setMinWidth(v: Int): this.type = setScalarParam(minWidth, v) - def setMinWidthCol(v: String): this.type = setVectorParam(minWidth, v) - - override protected def prepareEntity: Row => Option[AbstractHttpEntity] = {_ => None} -} diff --git a/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/bing/ImageSearchSchemas.scala b/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/bing/ImageSearchSchemas.scala deleted file mode 100644 index 23a358bdc42..00000000000 --- a/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/bing/ImageSearchSchemas.scala +++ /dev/null @@ -1,92 +0,0 @@ -// Copyright (C) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. See LICENSE in project root for information. - -package com.microsoft.azure.synapse.ml.services.bing - -import com.microsoft.azure.synapse.ml.core.schema.SparkBindings - -// Bing Schema -/*case class BingImagesResponse(`_type`: String, - id: String, - isFamilyFriendly: Boolean, - nextOffset: Int, - pivotSuggestions: BingPivot, - queryExpansions: BingQuery, - readLink: String, - similarTerms: BingQuery, - totalEstimatedMatches: Long, - value: Array[BingImage], - webSearchUrl: String)*/ -//What the web says^ is WRONG! - -case class BingImagesResponse(_type: String, - instrumentation: BingInstrumentation, - webSearchUrl: String, - totalEstimatedMatches: Option[Int], - nextOffset: Option[Int], - value: Seq[BingImage], - pivotSuggestions: Seq[BingPivot], - queryExpansions: Seq[BingQuery], - relatedSearches: Seq[BingQuery]) - -object BingImagesResponse extends SparkBindings[BingImagesResponse] - -case class BingInstrumentation(_type: String) - -case class BingPivot(pivot: String, suggestions: Seq[BingQuery]) - -case class BingQuery(displayText: String, - searchLink: String, - text: String, - thumbnail: BingThumbnail, - webSearchUrl: String) - -case class BingThumbnail(thumbnailUrl: String) - -case class BingImage(accentColor: String, - contentSize: String, - contentUrl: String, - datePublished: String, - encodingFormat: String, - height: Int, - hostPageDisplayUrl: String, - hostPageUrl: String, - id: String, - imageId: String, - imageInsightsToken: String, - insightsMetadata: String, // making this BingInsightsMetadata is circular - name: String, - thumbnail: BingMediaSize, - thumbnailUrl: String, - webSearchUrl: String, - width: Int) - -object BingImage extends SparkBindings[BingImage] - -case class BingMediaSize(height: Int, width: Int) - -/* -case class BingInsightsMetadata(aggregateOffer: BingOffer, - recipeSourcesCount: Int, - shoppingSourcesCount: Int) - -case class BingOffer(aggregateRating: BingAggregateRating, - availability: String, - description: String, - lastUpdated: String, - lowPrice: Float, - name: String, - offerCount: Int, - price: Float, - priceCurrency: String, - seller: BingOrganization, - url: String) - -case class BingAggregateRating(bestRating: Float, - ratingValue: Float, - reviewCount: Int, - text: String) - -case class BingOrganization(image: BingImage, - name: String) -*/ diff --git a/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAIPrompt.scala b/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAIPrompt.scala index e338f215497..0a6c31c5d36 100644 --- a/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAIPrompt.scala +++ b/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAIPrompt.scala @@ -184,8 +184,6 @@ class OpenAIPrompt(override val uid: String) extends Transformer "promptTemplate", "outputCol", "postProcessing", "postProcessingOptions", "dropPrompt", "dropMessages", "systemPrompt", "apiType", "returnUsage") - private val multiModalTextPrompt = "The name of the file to analyze is %s.\nHere is the content:\n" - private val textExtensions = Set("md", "csv", "tsv", "json", "xml") private val imageExtensions = Set("jpg", "jpeg", "png", "gif", "webp") private val audioExtensions = Set("mp3", "wav") @@ -461,7 +459,6 @@ class OpenAIPrompt(override val uid: String) extends Transformer private def wrapFileToMessagesList(filePathStr: String): Seq[Map[String, String]] = { val (fileName, fileBytes, fileType, mimeType) = prepareFile(filePathStr) - val baseMessage = stringMessageWrapper(multiModalTextPrompt.format(fileName)) val fileMessage = this.getApiType match { case "responses" => @@ -469,7 +466,7 @@ class OpenAIPrompt(override val uid: String) extends Transformer case "chat_completions" => makeChatCompletionsFileMessage(fileName, fileBytes, fileType, mimeType) } - Seq(baseMessage, fileMessage) + Seq(fileMessage) } private def categorizeFileType(mimeType: String, extension: String): String = { diff --git a/cognitive/src/test/scala/com/microsoft/azure/synapse/ml/services/bing/ImageSearchSuite.scala b/cognitive/src/test/scala/com/microsoft/azure/synapse/ml/services/bing/ImageSearchSuite.scala deleted file mode 100644 index a5713a9d3c3..00000000000 --- a/cognitive/src/test/scala/com/microsoft/azure/synapse/ml/services/bing/ImageSearchSuite.scala +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (C) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. See LICENSE in project root for information. - -package com.microsoft.azure.synapse.ml.services.bing - -import com.microsoft.azure.synapse.ml.Secrets -import com.microsoft.azure.synapse.ml.core.test.fuzzing.{TestObject, TransformerFuzzing} -import org.apache.spark.ml.NamespaceInjections.pipelineModel -import org.apache.spark.ml.util.MLReadable -import org.apache.spark.sql.{DataFrame, Row} -import org.scalactic.Equality - -trait HasSearchKey { - lazy val searchKey = sys.env.getOrElse("BING_SEARCH_KEY", Secrets.BingSearchKey) -} - -class ImageSearchSuite extends TransformerFuzzing[BingImageSearch] - with HasSearchKey { - - import spark.implicits._ - - lazy val offsets: Seq[Int] = (0 to 1).map(_ * 10) - lazy val searchQueries = List("Elephant", "African Elephant", - "Asian Elephant", "Rhino", "rhinoceros") - lazy val requestParameters: DataFrame = searchQueries - .flatMap { q: String => offsets.map { o: Int => (q, o) } } - .toDF("queries", "offsets") - - lazy val bis = new BingImageSearch() - .setSubscriptionKey(searchKey) - .setOffsetCol("offsets") - .setQueryCol("queries") - .setCount(10) - .setImageType("photo") - .setOutputCol("images") - - lazy val getURLs = BingImageSearch.getUrlTransformer("images", "url") - - test("Elephant Detection") { - val pipe = pipelineModel(Array(bis, getURLs)) - val resultsDF = pipe.transform(requestParameters) - val results = resultsDF.collect() - assert(results.length === 100) - results.foreach(r => assert(r.getString(0).startsWith("http"))) - val bytesDF = BingImageSearch - .downloadFromUrls("url", "bytes", 4, 10000) - .transform(resultsDF.limit(15)) - val numSucesses = bytesDF.collect().count(row => - Option(row.getAs[Array[Byte]](1)).getOrElse(Array()).length > 100) - assert(numSucesses>3) - } - - test("All Parameters") { - val row = (10,"microsoft", 10, "all","black","Year",0, 520192, 0, 0,2000,2000, "Face","Photo","All", "en-US") - - val df = Seq(row).toDF() - - val staticBis = new BingImageSearch() - .setSubscriptionKey(searchKey) - .setOffset(row._1) - .setQuery(row._2) - .setCount(row._3) - .setAspect(row._4) - .setColor(row._5) - .setFreshness(row._6) - .setMinFileSize(row._7) - .setMaxFileSize(row._8) - .setMinWidth(row._9) - .setMinHeight(row._10) - .setMaxWidth(row._11) - .setMaxHeight(row._12) - .setImageContent(row._13) - .setImageType(row._14) - .setLicense(row._15) - .setMarket(row._16) - .setOutputCol("images") - - val sdf = staticBis.transform(df).cache() - assert(sdf.collect().head.getAs[Row]("images") != null) - - val dynamicBis = new BingImageSearch() - .setSubscriptionKey(searchKey) - .setOffsetCol("_1") - .setQueryCol("_2") - .setCountCol("_3") - .setAspectCol("_4") - .setColorCol("_5") - .setFreshnessCol("_6") - .setMinFileSizeCol("_7") - .setMaxFileSizeCol("_8") - .setMinWidthCol("_9") - .setMinHeightCol("_10") - .setMaxWidthCol("_11") - .setMaxHeightCol("_12") - .setImageContentCol("_13") - .setImageTypeCol("_14") - .setLicenseCol("_15") - .setMarketCol("_16") - .setOutputCol("images") - - val ddf = dynamicBis.transform(df).cache() - assert(ddf.collect().head.getAs[Row]("images") != null) - } - - test("Throw errors if required fields not set") { - val caught = intercept[AssertionError] { - new BingImageSearch() - .setSubscriptionKey(searchKey) - .setOffsetCol("offsets") - .setCount(10) - .setImageType("photo") - .setOutputCol("images") - .transform(requestParameters).collect() - } - assert(caught.getMessage.contains("Missing required params")) - assert(caught.getMessage.contains("q")) - } - - override lazy val dfEq: Equality[DataFrame] = new Equality[DataFrame] { - def areEqual(a: DataFrame, b: Any): Boolean = - (a.schema === b.asInstanceOf[DataFrame].schema) && - (a.count() === b.asInstanceOf[DataFrame].count()) // BIS is nondeterminisic - } - - override def testObjects(): Seq[TestObject[BingImageSearch]] = - Seq(new TestObject(bis, requestParameters)) - - override def reader: MLReadable[_] = BingImageSearch - -} diff --git a/cognitive/src/test/scala/com/microsoft/azure/synapse/ml/services/form/FormRecognizerSuite.scala b/cognitive/src/test/scala/com/microsoft/azure/synapse/ml/services/form/FormRecognizerSuite.scala index 3728e8d3499..7d81b73f32a 100644 --- a/cognitive/src/test/scala/com/microsoft/azure/synapse/ml/services/form/FormRecognizerSuite.scala +++ b/cognitive/src/test/scala/com/microsoft/azure/synapse/ml/services/form/FormRecognizerSuite.scala @@ -10,9 +10,9 @@ import com.microsoft.azure.synapse.ml.core.test.fuzzing.{TestObject, Transformer import com.microsoft.azure.synapse.ml.io.http.RESTHelpers import com.microsoft.azure.synapse.ml.io.http.RESTHelpers.retry import com.microsoft.azure.synapse.ml.services._ -import com.microsoft.azure.synapse.ml.services.bing.BingImageSearch import com.microsoft.azure.synapse.ml.services.form.FormsFlatteners._ import com.microsoft.azure.synapse.ml.stages.UDFTransformer +import com.microsoft.azure.synapse.ml.services.testutils.ImageDownloadUtils import org.apache.commons.io.IOUtils import org.apache.http.client.methods._ import org.apache.http.entity.StringEntity @@ -96,16 +96,14 @@ object FormRecognizerUtils extends CognitiveKey { } } -trait FormRecognizerUtils extends TestBase with CognitiveKey with Flaky { +trait FormRecognizerUtils extends TestBase with CognitiveKey with Flaky with ImageDownloadUtils { import spark.implicits._ - def createTestDataframe(baseUrl: String, docs: Seq[String], returnBytes: Boolean): DataFrame = { + def createTestDataframe(baseUrl: String, docs: Seq[String], returnBytes: Boolean = false): DataFrame = { val df = docs.map(doc => baseUrl + doc).toDF("source") if (returnBytes) { - BingImageSearch - .downloadFromUrls("source", "imageBytes", 4, 10000) - .transform(df) + df.withColumn("imageBytes", downloadBytesUdf(col("source"))) .select("imageBytes") } else { df @@ -134,13 +132,13 @@ trait FormRecognizerUtils extends TestBase with CognitiveKey with Flaky { lazy val bytesDF5: DataFrame = createTestDataframe(baseUrl, Seq("id1.jpg"), returnBytes = true) - lazy val imageDf6: DataFrame = createTestDataframe(baseUrl, Seq("tables1.pdf"), returnBytes = false) + lazy val imageDf6: DataFrame = createTestDataframe(baseUrl, Seq("tables1.pdf")) - lazy val pdfDf1: DataFrame = createTestDataframe(baseUrl, Seq("layout2.pdf"), returnBytes = false) + lazy val pdfDf1: DataFrame = createTestDataframe(baseUrl, Seq("layout2.pdf")) - lazy val pdfDf2: DataFrame = createTestDataframe(baseUrl, Seq("invoice1.pdf", "invoice3.pdf"), returnBytes = false) + lazy val pdfDf2: DataFrame = createTestDataframe(baseUrl, Seq("invoice1.pdf", "invoice3.pdf")) - lazy val pathDf: DataFrame = createTestDataframe(baseUrl, Seq(""), returnBytes = false) + lazy val pathDf: DataFrame = createTestDataframe(baseUrl, Seq("")) // TODO refactor tests to share structure def basicTest(df: DataFrame, @@ -214,6 +212,8 @@ class AnalyzeLayoutSuite extends TransformerFuzzing[AnalyzeLayout] with FormReco } + + override def testObjects(): Seq[TestObject[AnalyzeLayout]] = Seq(new TestObject(analyzeLayout, imageDf1)) @@ -262,6 +262,8 @@ class AnalyzeReceiptsSuite extends TransformerFuzzing[AnalyzeReceipts] with Form assert(docHeadStr.contains("Tax")) } + + override def testObjects(): Seq[TestObject[AnalyzeReceipts]] = Seq(new TestObject(analyzeReceipts, imageDf2)) @@ -312,6 +314,7 @@ class AnalyzeBusinessCardsSuite extends TransformerFuzzing[AnalyzeBusinessCards] """{"Addresses":{"type":"array","valueArray":["{\"type\":\"string\",\"valueString\"""").stripMargin)) } + override def testObjects(): Seq[TestObject[AnalyzeBusinessCards]] = Seq(new TestObject(analyzeBusinessCards, imageDf3)) @@ -372,6 +375,7 @@ class AnalyzeInvoicesSuite extends TransformerFuzzing[AnalyzeInvoices] with Form assert(docHeadStr.contains("Enterprise Way Sunnayvale")) } + override def testObjects(): Seq[TestObject[AnalyzeInvoices]] = Seq(new TestObject(analyzeInvoices, imageDf4)) diff --git a/cognitive/src/test/scala/com/microsoft/azure/synapse/ml/services/testutils/ImageDownloadUtils.scala b/cognitive/src/test/scala/com/microsoft/azure/synapse/ml/services/testutils/ImageDownloadUtils.scala new file mode 100644 index 00000000000..2c34550e9f5 --- /dev/null +++ b/cognitive/src/test/scala/com/microsoft/azure/synapse/ml/services/testutils/ImageDownloadUtils.scala @@ -0,0 +1,23 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.azure.synapse.ml.services.testutils + +import com.microsoft.azure.synapse.ml.core.env.StreamUtilities.using +import com.microsoft.azure.synapse.ml.io.http.RESTHelpers +import org.apache.commons.io.IOUtils +import org.apache.http.client.methods.HttpGet +import org.apache.spark.sql.expressions.UserDefinedFunction +import org.apache.spark.sql.functions.udf + +trait ImageDownloadUtils { + + def downloadBytes(url: String): Array[Byte] = { + val request = new HttpGet(url) + using(RESTHelpers.Client.execute(request)) { response => + IOUtils.toByteArray(response.getEntity.getContent) + }.get + } + + val downloadBytesUdf: UserDefinedFunction = udf(downloadBytes _) +} diff --git a/cognitive/src/test/scala/com/microsoft/azure/synapse/ml/services/translate/TranslatorSuite.scala b/cognitive/src/test/scala/com/microsoft/azure/synapse/ml/services/translate/TranslatorSuite.scala index 8d52fbd1178..27ee4ca2414 100644 --- a/cognitive/src/test/scala/com/microsoft/azure/synapse/ml/services/translate/TranslatorSuite.scala +++ b/cognitive/src/test/scala/com/microsoft/azure/synapse/ml/services/translate/TranslatorSuite.scala @@ -63,7 +63,9 @@ class TranslateSuite extends TransformerFuzzing[Translate] test("Translate multiple pieces of text with language autodetection") { val result1 = getTranslationTextResult(translate.setToLanguage(Seq("zh-Hans")), textDf2).collect() - assert(result1(0).getSeq(0).mkString("\n") == "早上好\n再见") + val resultStr = result1(0).getSeq(0).mkString("\n") + assert((resultStr.contains("早上好") || resultStr.contains("早安")) + && (resultStr.contains("再见") || resultStr.contains("拜拜"))) val translate1: Translate = new Translate() .setSubscriptionKey(translatorKey) @@ -72,7 +74,8 @@ class TranslateSuite extends TransformerFuzzing[Translate] .setOutputCol("translation") .setConcurrency(5) val result3 = getTranslationTextResult(translate1.setToLanguage("zh-Hans"), emptyDf).collect() - assert(result3(0).getSeq(0).mkString("\n").contains("嗨")) + val greeting1 = result3(0).getSeq(0).mkString("\n") + assert(greeting1.contains("嗨") || greeting1.contains("你好")) val translate2: Translate = new Translate() .setSubscriptionKey(translatorKey) @@ -82,7 +85,8 @@ class TranslateSuite extends TransformerFuzzing[Translate] .setOutputCol("translation") .setConcurrency(5) val result4 = getTranslationTextResult(translate2, textDf6).collect() - assert(result4(0).getSeq(0).mkString("").contains("嗨")) + val greeting2 = result4(0).getSeq(0).mkString("") + assert(greeting2.contains("嗨") || greeting2.contains("你好")) assert(result4(1).get(0) == null) assert(result4(2).get(0) == null) } @@ -104,27 +108,33 @@ class TranslateSuite extends TransformerFuzzing[Translate] .withColumn("transliteration", col("translation.transliteration.text")) .withColumn("translation", col("translation.text")) .select("translation", "transliteration").collect() - assert(results.head.getSeq(0).mkString("\n") === "再见") - assert(results.head.getSeq(1).mkString("\n").replaceAllLiterally(" ", "") === "zàijiàn") + assert(results.head.getSeq(0).mkString("\n").contains("再见")) + assert(results.head.getSeq(1).mkString("\n").replaceAllLiterally(" ", "").contains("zàijiàn")) } test("Translate to multiple languages") { val result1 = getTranslationTextResult(translate.setToLanguage(Seq("zh-Hans", "de")), textDf1).collect() - assert(result1(0).getSeq(0).mkString("\n") == "再见\nAuf Wiedersehen") + val resultStr = result1(0).getSeq(0).mkString("\n") + assert(resultStr.contains("再见") && resultStr.contains("Wiedersehen")) } test("Handle profanity") { val result1 = getTranslationTextResult( translate.setFromLanguage("en").setToLanguage(Seq("de")).setProfanityAction("Marked"), textDf3).collect() - assert(result1(0).getSeq(0).mkString("\n") == "Das ist ***.") + assert(result1(0).getSeq(0).mkString("\n").contains("***")) // problem with Rest API "freaking" -> the marker disappears *** no difference } test("Translate content with markup and decide what's translated") { val result1 = getTranslationTextResult( translate.setFromLanguage("en").setToLanguage(Seq("zh-Hans")).setTextType("html"), textDf4).collect() - assert(result1(0).getSeq(0).mkString("\n") == - "
This will not be translated.
这将被翻译。
") + val resultStr = result1(0).getSeq(0).mkString("\n") + val expectedNoTranslate = "
This will not be translated.
" + assert(resultStr.startsWith(expectedNoTranslate)) + // Verify the second part is translated (contains "翻译" which means "translate") + assert(resultStr.contains("翻译")) + // Verify it doesn't contain the English source for the second part + assert(!resultStr.contains("
This will be translated.
")) } test("Obtain alignment information") { @@ -137,7 +147,7 @@ class TranslateSuite extends TransformerFuzzing[Translate] .withColumn("alignment", col("translation.alignment.proj")) .withColumn("translation", col("translation.text")) .select("translation", "alignment").collect() - assert(results.head.getSeq(0).mkString("\n") === "Au revoir") + assert(results.head.getSeq(0).mkString("\n").contains("Au revoir")) //assert(results.head.getSeq(1).mkString("\n") === "0:2-0:8") } @@ -152,7 +162,7 @@ class TranslateSuite extends TransformerFuzzing[Translate] .withColumn("transSentLen", flatten(col("translation.sentLen.transSentLen"))) .withColumn("translation", col("translation.text")) .select("translation", "srcSentLen", "transSentLen").collect() - assert(results.head.getSeq(0).mkString("\n") === "Au revoir") + assert(results.head.getSeq(0).mkString("\n").contains("Au revoir")) assert(results.head.getSeq(1).mkString("\n") === "3") assert(results.head.getSeq(2).mkString("\n") === "9") } @@ -300,7 +310,8 @@ class DictionaryLookupSuite extends TransformerFuzzing[DictionaryLookup] .withColumn("normalizedTarget", col("translations.normalizedTarget")) .select("normalizedTarget").collect() val headStr = results.head.getSeq(0).mkString("\n") - assert(headStr === "volar\nmosca\noperan\npilotar\nmoscas\nmarcha") + assert(headStr.contains("volar")) + assert(headStr.split("\n").length > 1) } test("Throw errors if required fields not set") { diff --git a/cognitive/src/test/scala/com/microsoft/azure/synapse/ml/services/vision/ComputerVisionSuite.scala b/cognitive/src/test/scala/com/microsoft/azure/synapse/ml/services/vision/ComputerVisionSuite.scala index 2604ed43d39..e0630e7d8c6 100644 --- a/cognitive/src/test/scala/com/microsoft/azure/synapse/ml/services/vision/ComputerVisionSuite.scala +++ b/cognitive/src/test/scala/com/microsoft/azure/synapse/ml/services/vision/ComputerVisionSuite.scala @@ -1,10 +1,9 @@ + // Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.azure.synapse.ml.services.vision -import com.microsoft.azure.synapse.ml.services._ -import com.microsoft.azure.synapse.ml.services.bing.BingImageSearch import com.microsoft.azure.synapse.ml.core.spark.FluentAPI._ import com.microsoft.azure.synapse.ml.core.test.base.{Flaky, TestBase} import com.microsoft.azure.synapse.ml.core.test.fuzzing.{GetterSetterFuzzing, TestObject, TransformerFuzzing} @@ -14,7 +13,14 @@ import org.apache.spark.sql.functions.{col, typedLit} import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.scalactic.Equality -trait OCRUtils extends TestBase { +import com.microsoft.azure.synapse.ml.services.CognitiveKey +import com.microsoft.azure.synapse.ml.services.testutils.ImageDownloadUtils + +trait VisionUtils extends TestBase with ImageDownloadUtils { + import spark.implicits._ +} + +trait OCRUtils extends VisionUtils { import spark.implicits._ @@ -28,9 +34,7 @@ trait OCRUtils extends TestBase { "https://mmlspark.blob.core.windows.net/datasets/OCR/paper.pdf" ).toDF("url") - lazy val bytesDF: DataFrame = BingImageSearch - .downloadFromUrls("url", "imageBytes", 4, 10000) - .transform(df) + lazy val bytesDF: DataFrame = df.withColumn("imageBytes", downloadBytesUdf(col("url"))) .select("imageBytes") } @@ -77,6 +81,7 @@ class OCRSuite extends TransformerFuzzing[OCR] with CognitiveKey with Flaky with assert(results(2).getString(2).startsWith("This is a lot of 12 point text")) } + override def testObjects(): Seq[TestObject[OCR]] = Seq(new TestObject(ocr, df)) @@ -84,7 +89,7 @@ class OCRSuite extends TransformerFuzzing[OCR] with CognitiveKey with Flaky with } class AnalyzeImageSuite extends TransformerFuzzing[AnalyzeImage] - with CognitiveKey with Flaky with GetterSetterFuzzing[AnalyzeImage] { + with CognitiveKey with Flaky with GetterSetterFuzzing[AnalyzeImage] with VisionUtils { import spark.implicits._ @@ -115,14 +120,12 @@ class AnalyzeImageSuite extends TransformerFuzzing[AnalyzeImage] def ai: AnalyzeImage = baseAI .setImageUrlCol("url") - lazy val bytesDF: DataFrame = BingImageSearch - .downloadFromUrls("url", "imageBytes", 4, 10000) - .transform(df) - .drop("url") - def bytesAI: AnalyzeImage = baseAI .setImageBytesCol("imageBytes") + lazy val bytesDF: DataFrame = df.withColumn("imageBytes", downloadBytesUdf(col("url"))) + .select("imageBytes") + test("Null handling"){ assertThrows[IllegalArgumentException]{ baseAI.transform(nullDf) @@ -186,6 +189,8 @@ class AnalyzeImageSuite extends TransformerFuzzing[AnalyzeImage] assert(responses(1).categories.get.head.name === "text_sign") } + + override def testObjects(): Seq[TestObject[AnalyzeImage]] = Seq(new TestObject(ai, df)) @@ -230,15 +235,6 @@ class RecognizeTextSuite extends TransformerFuzzing[RecognizeText] headStr === "CLOSED WHEN ONE DOOR CLOSES, ANOTHER OPENS. ALL YOU HAVE TO DO IS WALK IN") } - test("Basic Usage with Bytes") { - val results = bytesDF.mlTransform(bytesRT, RecognizeText.flatten("ocr", "ocr")) - .select("ocr") - .collect() - val headStr = results.head.getString(0) - assert(headStr === "OPENS.ALL YOU HAVE TO DO IS WALK IN WHEN ONE DOOR CLOSES, ANOTHER CLOSED" || - headStr === "CLOSED WHEN ONE DOOR CLOSES, ANOTHER OPENS. ALL YOU HAVE TO DO IS WALK IN") - } - override def testObjects(): Seq[TestObject[RecognizeText]] = Seq(new TestObject(rt, df)) @@ -299,6 +295,7 @@ class ReadImageSuite extends TransformerFuzzing[ReadImage] headStr === "CLOSED WHEN ONE DOOR CLOSES, ANOTHER OPENS. ALL YOU HAVE TO DO IS WALK IN") } + override def testObjects(): Seq[TestObject[ReadImage]] = Seq(new TestObject(readImage, df)) @@ -306,7 +303,7 @@ class ReadImageSuite extends TransformerFuzzing[ReadImage] } class RecognizeDomainSpecificContentSuite extends TransformerFuzzing[RecognizeDomainSpecificContent] - with CognitiveKey with Flaky { + with CognitiveKey with Flaky with VisionUtils { import spark.implicits._ @@ -321,11 +318,10 @@ class RecognizeDomainSpecificContentSuite extends TransformerFuzzing[RecognizeDo .setImageUrlCol("url") .setOutputCol("celebs") - lazy val bytesDF: DataFrame = BingImageSearch - .downloadFromUrls("url", "imageBytes", 4, 10000) - .transform(df) + lazy val bytesDF: DataFrame = df.withColumn("imageBytes", downloadBytesUdf(col("url"))) .select("imageBytes") + lazy val bytesCeleb: RecognizeDomainSpecificContent = new RecognizeDomainSpecificContent() .setSubscriptionKey(cognitiveKey) .setModel("celebrities") @@ -347,6 +343,8 @@ class RecognizeDomainSpecificContentSuite extends TransformerFuzzing[RecognizeDo assert(results.head().getString(2) === "Leonardo DiCaprio") } + + override implicit lazy val dfEq: Equality[DataFrame] = new Equality[DataFrame] { def areEqual(a: DataFrame, bAny: Any): Boolean = bAny match { case b: Dataset[_] => @@ -362,7 +360,7 @@ class RecognizeDomainSpecificContentSuite extends TransformerFuzzing[RecognizeDo } class GenerateThumbnailsSuite extends TransformerFuzzing[GenerateThumbnails] - with CognitiveKey with Flaky { + with CognitiveKey with Flaky with VisionUtils { import spark.implicits._ @@ -377,11 +375,10 @@ class GenerateThumbnailsSuite extends TransformerFuzzing[GenerateThumbnails] .setImageUrlCol("url") .setOutputCol("thumbnails") - lazy val bytesDF: DataFrame = BingImageSearch - .downloadFromUrls("url", "imageBytes", 4, 10000) - .transform(df) + lazy val bytesDF: DataFrame = df.withColumn("imageBytes", downloadBytesUdf(col("url"))) .select("imageBytes") + lazy val bytesGT: GenerateThumbnails = new GenerateThumbnails() .setSubscriptionKey(cognitiveKey) .setLocation("eastus") @@ -405,7 +402,7 @@ class GenerateThumbnailsSuite extends TransformerFuzzing[GenerateThumbnails] override def reader: MLReadable[_] = GenerateThumbnails } -class TagImageSuite extends TransformerFuzzing[TagImage] with CognitiveKey with Flaky { +class TagImageSuite extends TransformerFuzzing[TagImage] with CognitiveKey with Flaky with VisionUtils { import spark.implicits._ @@ -419,11 +416,10 @@ class TagImageSuite extends TransformerFuzzing[TagImage] with CognitiveKey with .setImageUrlCol("url") .setOutputCol("tags") - lazy val bytesDF: DataFrame = BingImageSearch - .downloadFromUrls("url", "imageBytes", 4, 10000) - .transform(df) + lazy val bytesDF: DataFrame = df.withColumn("imageBytes", downloadBytesUdf(col("url"))) .select("imageBytes") + lazy val bytesTI: TagImage = new TagImage() .setSubscriptionKey(cognitiveKey) .setLocation("eastus") @@ -450,6 +446,8 @@ class TagImageSuite extends TransformerFuzzing[TagImage] with CognitiveKey with assert(tagResponse.map(_.getDouble(1)).toList.head > .9) } + + override def assertDFEq(df1: DataFrame, df2: DataFrame)(implicit eq: Equality[DataFrame]): Unit = { super.assertDFEq(df1.select("tags.tags.name"), df2.select("tags.tags.name"))(eq) } @@ -461,7 +459,7 @@ class TagImageSuite extends TransformerFuzzing[TagImage] with CognitiveKey with } class DescribeImageSuite extends TransformerFuzzing[DescribeImage] - with CognitiveKey with Flaky { + with CognitiveKey with Flaky with VisionUtils { import spark.implicits._ @@ -476,11 +474,10 @@ class DescribeImageSuite extends TransformerFuzzing[DescribeImage] .setImageUrlCol("url") .setOutputCol("descriptions") - lazy val bytesDF: DataFrame = BingImageSearch - .downloadFromUrls("url", "imageBytes", 4, 10000) - .transform(df) + lazy val bytesDF: DataFrame = df.withColumn("imageBytes", downloadBytesUdf(col("url"))) .select("imageBytes") + lazy val bytesDI: DescribeImage = new DescribeImage() .setSubscriptionKey(cognitiveKey) .setLocation("eastus") @@ -502,6 +499,8 @@ class DescribeImageSuite extends TransformerFuzzing[DescribeImage] assert(tags("person") && tags("glasses")) } + + override def assertDFEq(df1: DataFrame, df2: DataFrame)(implicit eq: Equality[DataFrame]): Unit = { super.assertDFEq(df1.select("descriptions.description.tags", "descriptions.description.captions.text"), df2.select("descriptions.description.tags", "descriptions.description.captions.text"))(eq) diff --git a/core/src/main/scala/com/microsoft/azure/synapse/ml/logging/FeatureNames.scala b/core/src/main/scala/com/microsoft/azure/synapse/ml/logging/FeatureNames.scala index b54959304db..9ce48e5bbfc 100644 --- a/core/src/main/scala/com/microsoft/azure/synapse/ml/logging/FeatureNames.scala +++ b/core/src/main/scala/com/microsoft/azure/synapse/ml/logging/FeatureNames.scala @@ -6,10 +6,10 @@ package com.microsoft.azure.synapse.ml.logging object FeatureNames { object AiServices { val Anomaly = "aiservice-anomalydetection" - val BingImage = "aiservice-bingimage" val Face = "aiservice-face" val Form = "aiservice-form" val Language = "aiservice-language" + val OpenAI = "aiservice-openai" val Search = "aiservice-search" val Speech = "aiservice-speech" diff --git a/core/src/test/scala/com/microsoft/azure/synapse/ml/Secrets.scala b/core/src/test/scala/com/microsoft/azure/synapse/ml/Secrets.scala index 24fa43ca588..f406a90df67 100644 --- a/core/src/test/scala/com/microsoft/azure/synapse/ml/Secrets.scala +++ b/core/src/test/scala/com/microsoft/azure/synapse/ml/Secrets.scala @@ -63,7 +63,7 @@ object Secrets { lazy val AnomalyApiKey: String = getSecret("anomaly-api-key") lazy val AzureSearchKey: String = getSecret("azure-search-key") - lazy val BingSearchKey: String = getSecret("bing-search-key") + lazy val TranslatorKey: String = getSecret("translator-key") lazy val AzureMapsKey: String = getSecret("azuremaps-api-key") lazy val PowerbiURL: String = getSecret("powerbi-url") diff --git a/docs/Explore Algorithms/AI Services/Overview.ipynb b/docs/Explore Algorithms/AI Services/Overview.ipynb index fcbc6554bba..ef661950b74 100644 --- a/docs/Explore Algorithms/AI Services/Overview.ipynb +++ b/docs/Explore Algorithms/AI Services/Overview.ipynb @@ -39,7 +39,7 @@ }, "source": [ "## Important\n", - "Starting on the 20th of September, 2023 you won’t be able to create new Anomaly Detector resources. The Anomaly Detector service is being retired on the 1st of October, 2026." + "Starting on the 20th of September, 2023 you won\u2019t be able to create new Anomaly Detector resources. The Anomaly Detector service is being retired on the 1st of October, 2026." ] }, { @@ -143,7 +143,6 @@ "- Find anomalies: generates a model using an entire series and finds anomalies in the series ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.15/scala/com/microsoft/azure/synapse/ml/services/anomaly/DetectAnomalies.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.15/pyspark/synapse.ml.services.anomaly.html#module-synapse.ml.services.anomaly.DetectAnomalies))\n", "\n", "### Search\n", - "- [**Bing Image search**](https://azure.microsoft.com/services/services-services/bing-image-search-api/) ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.15/scala/com/microsoft/azure/synapse/ml/services/bing/BingImageSearch.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.15/pyspark/synapse.ml.services.bing.html#module-synapse.ml.services.bing.BingImageSearch))\n", "- [**Azure Cognitive search**](https://docs.microsoft.com/azure/search/search-what-is-azure-search) ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.15/scala/com/microsoft/azure/synapse/ml/services/search/AzureSearchWriter$.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.15/pyspark/synapse.ml.services.search.html#module-synapse.ml.services.search.AzureSearchWriter))" ] }, @@ -192,10 +191,6 @@ ") # Replace the call to find_secret with your key as a python string. e.g. service_key=\"27snaiw...\"\n", "service_loc = \"eastus\"\n", "\n", - "# A Bing Search v7 subscription key\n", - "bing_search_key = find_secret(\n", - " secret_name=\"bing-search-key\", keyvault=\"mmlspark-build-keys\"\n", - ") # Replace the call to find_secret with your key as a python string.\n", "\n", "# An Anomaly Detector subscription key\n", "anomaly_key = find_secret(\n", @@ -443,54 +438,6 @@ "display(analysis.transform(df).select(\"image\", \"analysis_results.description.tags\"))" ] }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Search for images that are related to a natural language query\n", - "\n", - "[Bing Image Search](https://www.microsoft.com/bing/apis/bing-image-search-api) searches the web to retrieve images related to a user's natural language query. \n", - "\n", - "The following code sample uses a text query that looks for images with quotes. The output of the code is a list of image URLs that contain photos related to the query." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Number of images Bing will return per query\n", - "imgsPerBatch = 10\n", - "# A list of offsets, used to page into the search results\n", - "offsets = [(i * imgsPerBatch,) for i in range(100)]\n", - "# Since web content is our data, we create a dataframe with options on that data: offsets\n", - "bingParameters = spark.createDataFrame(offsets, [\"offset\"])\n", - "\n", - "# Run the Bing Image Search service with our text query\n", - "bingSearch = (\n", - " BingImageSearch()\n", - " .setSubscriptionKey(bing_search_key)\n", - " .setOffsetCol(\"offset\")\n", - " .setQuery(\"Martin Luther King Jr. quotes\")\n", - " .setCount(imgsPerBatch)\n", - " .setOutputCol(\"images\")\n", - ")\n", - "\n", - "# Transformer that extracts and flattens the richly structured output of Bing Image Search into a simple URL column\n", - "getUrls = BingImageSearch.getUrlTransformer(\"images\", \"url\")\n", - "\n", - "# This displays the full results returned, uncomment to use\n", - "# display(bingSearch.transform(bingParameters))\n", - "\n", - "# Since we have two services, they are put into a pipeline\n", - "pipeline = PipelineModel(stages=[bingSearch, getUrls])\n", - "\n", - "# Show the results of your search: image URLs\n", - "display(pipeline.transform(bingParameters))" - ] - }, { "attachments": {}, "cell_type": "markdown", @@ -775,4 +722,4 @@ }, "nbformat": 4, "nbformat_minor": 0 -} +} \ No newline at end of file diff --git a/docs/Explore Algorithms/AI Services/Quickstart - Analyze Celebrity Quotes.ipynb b/docs/Explore Algorithms/AI Services/Quickstart - Analyze Celebrity Quotes.ipynb index 00a73835b71..845c68f9e32 100644 --- a/docs/Explore Algorithms/AI Services/Quickstart - Analyze Celebrity Quotes.ipynb +++ b/docs/Explore Algorithms/AI Services/Quickstart - Analyze Celebrity Quotes.ipynb @@ -32,10 +32,7 @@ "ai_service_key = find_secret(\n", " secret_name=\"ai-services-api-key\", keyvault=\"mmlspark-build-keys\"\n", ")\n", - "ai_service_location = \"eastus\"\n", - "bing_search_key = find_secret(\n", - " secret_name=\"bing-search-key\", keyvault=\"mmlspark-build-keys\"\n", - ")" + "ai_service_location = \"eastus\"\n" ] }, { @@ -57,23 +54,11 @@ }, "outputs": [], "source": [ - "imgsPerBatch = 10 # the number of images Bing will return for each query\n", - "offsets = [\n", - " (i * imgsPerBatch,) for i in range(100)\n", - "] # A list of offsets, used to page into the search results\n", - "bingParameters = spark.createDataFrame(offsets, [\"offset\"])\n", - "\n", - "bingSearch = (\n", - " BingImageSearch()\n", - " .setSubscriptionKey(bing_search_key)\n", - " .setOffsetCol(\"offset\")\n", - " .setQuery(\"celebrity quotes\")\n", - " .setCount(imgsPerBatch)\n", - " .setOutputCol(\"images\")\n", - ")\n", - "\n", - "# Transformer to that extracts and flattens the richly structured output of Bing Image Search into a simple URL column\n", - "getUrls = BingImageSearch.getUrlTransformer(\"images\", \"url\")" + "image_urls = [\n", + " \"https://mmlspark.blob.core.windows.net/datasets/DSIR/test2.jpg\", # Leonardo DiCaprio\n", + " \"https://mmlspark.blob.core.windows.net/datasets/DSIR/test1.jpg\" # Another image\n", + "]\n", + "bingParameters = spark.createDataFrame([(url,) for url in image_urls], [\"url\"])\n" ] }, { @@ -213,8 +198,8 @@ "\n", "celebrityQuoteAnalysis = PipelineModel(\n", " stages=[\n", - " bingSearch,\n", - " getUrls,\n", + " # bingSearch, <-- Removed\n", + " # getUrls, <-- Removed\n", " celebs,\n", " firstCeleb,\n", " recognizeText,\n", @@ -252,4 +237,4 @@ }, "nbformat": 4, "nbformat_minor": 1 -} +} \ No newline at end of file diff --git a/docs/Explore Algorithms/OpenAI/Langchain.ipynb b/docs/Explore Algorithms/OpenAI/Langchain.ipynb index dba90b1dcd5..7f2134bd674 100644 --- a/docs/Explore Algorithms/OpenAI/Langchain.ipynb +++ b/docs/Explore Algorithms/OpenAI/Langchain.ipynb @@ -42,7 +42,7 @@ "\n", "The key prerequisites for this quickstart include a working Azure OpenAI resource, and an Apache Spark cluster with SynapseML installed. We suggest creating a Synapse workspace, but an Azure Databricks, HDInsight, or Spark on Kubernetes, or even a python environment with the `pyspark` package will work. \n", "\n", - "1. An Azure OpenAI resource – request access [here](https://customervoice.microsoft.com/Pages/ResponsePage.aspx?id=v4j5cvGGr0GRqy180BHbR7en2Ais5pxKtso_Pz4b1_xUOFA5Qk1UWDRBMjg0WFhPMkIzTzhKQ1dWNyQlQCN0PWcu) before [creating a resource](https://docs.microsoft.com/en-us/azure/cognitive-services/openai/how-to/create-resource?pivots=web-portal#create-a-resource)\n", + "1. An Azure OpenAI resource \u2013 request access [here](https://customervoice.microsoft.com/Pages/ResponsePage.aspx?id=v4j5cvGGr0GRqy180BHbR7en2Ais5pxKtso_Pz4b1_xUOFA5Qk1UWDRBMjg0WFhPMkIzTzhKQ1dWNyQlQCN0PWcu) before [creating a resource](https://docs.microsoft.com/en-us/azure/cognitive-services/openai/how-to/create-resource?pivots=web-portal#create-a-resource)\n", "1. [Create a Synapse workspace](https://docs.microsoft.com/en-us/azure/synapse-analytics/get-started-create-workspace)\n", "1. [Create a serverless Apache Spark pool](https://docs.microsoft.com/en-us/azure/synapse-analytics/get-started-analyze-spark#create-a-serverless-apache-spark-pool)" ] @@ -113,7 +113,6 @@ "from langchain.agents import load_tools, initialize_agent, AgentType\n", "from langchain.chains import TransformChain, LLMChain, SimpleSequentialChain\n", "from langchain.document_loaders import OnlinePDFLoader\n", - "from langchain.tools.bing_search.tool import BingSearchRun, BingSearchAPIWrapper\n", "from langchain.prompts import PromptTemplate\n", "from synapse.ml.services.langchain import LangchainTransformer\n", "from synapse.ml.core.platform import running_on_synapse, find_secret" @@ -139,9 +138,7 @@ "\n", "`openai_api_key = \"99sj2w82o....\"`\n", "\n", - "`bing_subscription_key = \"...\"`\n", - "\n", - "Note that you also need to set up your Bing search to gain access to your [Bing Search subscription key](https://learn.microsoft.com/en-us/bing/search-apis/bing-web-search/create-bing-search-service-resource)." + "\n" ] }, { @@ -168,13 +165,7 @@ "openai_api_version = \"2022-12-01\"\n", "openai_api_type = \"azure\"\n", "deployment_name = \"gpt-35-turbo\"\n", - "bing_search_url = \"https://api.bing.microsoft.com/v7.0/search\"\n", - "bing_subscription_key = find_secret(\n", - " secret_name=\"bing-search-key\", keyvault=\"mmlspark-build-keys\"\n", - ")\n", "\n", - "os.environ[\"BING_SUBSCRIPTION_KEY\"] = bing_subscription_key\n", - "os.environ[\"BING_SEARCH_URL\"] = bing_search_url\n", "os.environ[\"OPENAI_API_TYPE\"] = openai_api_type\n", "os.environ[\"OPENAI_API_VERSION\"] = openai_api_version\n", "os.environ[\"OPENAI_API_BASE\"] = openai_api_base\n", @@ -328,189 +319,6 @@ "loaded = LangchainTransformer.load(path)\n", "display(loaded.transform(df))" ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "d63922ac-4f38-4d0e-b409-747078378821", - "showTitle": false, - "title": "" - } - }, - "source": [ - "## Step 5: Using LangChain for Large scale literature review" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "6674f86f-05e9-4cd7-8f9e-adf7821034da", - "showTitle": false, - "title": "" - } - }, - "source": [ - "### Create a Sequential Chain for paper summarization\n", - "\n", - "We will now construct a Sequential Chain for extracting structured information from an arxiv link. In particular, we will ask langchain to extract the title, author information, and a summary of the paper content. After that, we use a web search tool to find the recent papers written by the first author.\n", - "\n", - "To summarize, our sequential chain contains the following steps:\n", - "\n", - "1. **Transform Chain**: Extract Paper Content from arxiv Link **=>**\n", - "1. **LLMChain**: Summarize the Paper, extract paper title and authors **=>**\n", - "1. **Transform Chain**: to generate the prompt **=>**\n", - "1. **Agent with Web Search Tool**: Use Web Search to find the recent papers by the first author" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "882a8f56-1a51-4fbd-b984-1df4b844f018", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "def paper_content_extraction(inputs: dict) -> dict:\n", - " arxiv_link = inputs[\"arxiv_link\"]\n", - " loader = OnlinePDFLoader(arxiv_link)\n", - " pages = loader.load_and_split()\n", - " return {\"paper_content\": pages[0].page_content + pages[1].page_content}\n", - "\n", - "\n", - "def prompt_generation(inputs: dict) -> dict:\n", - " output = inputs[\"Output\"]\n", - " prompt = (\n", - " \"find the paper title, author, summary in the paper description below, output them. After that, Use websearch to find out 3 recent papers of the first author in the author section below (first author is the first name separated by comma) and list the paper titles in bullet points: \\n\"\n", - " + output\n", - " + \".\"\n", - " )\n", - " return {\"prompt\": prompt}\n", - "\n", - "\n", - "paper_content_extraction_chain = TransformChain(\n", - " input_variables=[\"arxiv_link\"],\n", - " output_variables=[\"paper_content\"],\n", - " transform=paper_content_extraction,\n", - " verbose=False,\n", - ")\n", - "\n", - "paper_summarizer_template = \"\"\"You are a paper summarizer, given the paper content, it is your job to summarize the paper into a short summary, and extract authors and paper title from the paper content.\n", - "Here is the paper content:\n", - "{paper_content}\n", - "Output:\n", - "paper title, authors and summary.\n", - "\"\"\"\n", - "prompt = PromptTemplate(\n", - " input_variables=[\"paper_content\"], template=paper_summarizer_template\n", - ")\n", - "summarize_chain = LLMChain(llm=llm, prompt=prompt, verbose=False)\n", - "\n", - "prompt_generation_chain = TransformChain(\n", - " input_variables=[\"Output\"],\n", - " output_variables=[\"prompt\"],\n", - " transform=prompt_generation,\n", - " verbose=False,\n", - ")\n", - "\n", - "bing = BingSearchAPIWrapper(k=3)\n", - "tools = [BingSearchRun(api_wrapper=bing)]\n", - "web_search_agent = initialize_agent(\n", - " tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=False\n", - ")\n", - "\n", - "sequential_chain = SimpleSequentialChain(\n", - " chains=[\n", - " paper_content_extraction_chain,\n", - " summarize_chain,\n", - " prompt_generation_chain,\n", - " web_search_agent,\n", - " ]\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "67309f0a-6c03-4c0b-89df-1f98d09f0ded", - "showTitle": false, - "title": "" - } - }, - "source": [ - "### Apply the LangChain transformer to perform this workload at scale\n", - "\n", - "We can now use our chain at scale using the `LangchainTransformer`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "9edc7b1c-46ab-45e9-9919-89d17b8740bf", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "paper_df = spark.createDataFrame(\n", - " [\n", - " (0, \"https://arxiv.org/pdf/2107.13586.pdf\"),\n", - " (1, \"https://arxiv.org/pdf/2101.00190.pdf\"),\n", - " (2, \"https://arxiv.org/pdf/2103.10385.pdf\"),\n", - " (3, \"https://arxiv.org/pdf/2110.07602.pdf\"),\n", - " ],\n", - " [\"label\", \"arxiv_link\"],\n", - ")\n", - "\n", - "# construct langchain transformer using the paper summarizer chain define above\n", - "paper_info_extractor = (\n", - " LangchainTransformer()\n", - " .setInputCol(\"arxiv_link\")\n", - " .setOutputCol(\"paper_info\")\n", - " .setChain(sequential_chain)\n", - " .setSubscriptionKey(openai_api_key)\n", - " .setUrl(openai_api_base)\n", - ")\n", - "\n", - "\n", - "# extract paper information from arxiv links, the paper information needs to include:\n", - "# paper title, paper authors, brief paper summary, and recent papers published by the first author\n", - "display(paper_info_extractor.transform(paper_df))" - ] } ], "metadata": { @@ -543,4 +351,4 @@ }, "nbformat": 4, "nbformat_minor": 0 -} +} \ No newline at end of file diff --git a/docs/Explore Algorithms/Responsible AI/Quickstart - Snow Leopard Detection.ipynb b/docs/Explore Algorithms/Responsible AI/Quickstart - Snow Leopard Detection.ipynb index fc7abbedf10..f63e50d92e4 100644 --- a/docs/Explore Algorithms/Responsible AI/Quickstart - Snow Leopard Detection.ipynb +++ b/docs/Explore Algorithms/Responsible AI/Quickstart - Snow Leopard Detection.ipynb @@ -12,202 +12,49 @@ { "cell_type": "code", "execution_count": null, - "source": [ - "from synapse.ml.core.platform import *\n", - "\n", - "bing_search_key = find_secret(\n", - " secret_name=\"bing-search-key\", keyvault=\"mmlspark-build-keys\"\n", - ")\n", - "\n", - "# WARNING this notebook requires a lot of memory.\n", - "# If you get a heap space error, try dropping the number of images bing returns\n", - "# or by writing out the images to parquet first" - ], + "metadata": {}, "outputs": [], - "metadata": { - "collapsed": true - } - }, - { - "cell_type": "code", - "execution_count": null, "source": [ "from synapse.ml.services import *\n", "from synapse.ml.core.spark import FluentAPI\n", - "from pyspark.sql.functions import lit\n", - "\n", + "from pyspark.sql.functions import lit, udf\n", + "from pyspark.sql.types import BinaryType\n", + "import requests\n", "\n", - "def bingPhotoSearch(name, queries, pages):\n", - " offsets = [offset * 10 for offset in range(0, pages)]\n", - " parameters = [(query, offset) for offset in offsets for query in queries]\n", - "\n", - " return (\n", - " spark.createDataFrame(parameters, (\"queries\", \"offsets\"))\n", - " .mlTransform(\n", - " BingImageSearch() # Apply Bing Image Search\n", - " .setSubscriptionKey(bing_search_key) # Set the API Key\n", - " .setOffsetCol(\"offsets\") # Specify a column containing the offsets\n", - " .setQueryCol(\"queries\") # Specify a column containing the query words\n", - " .setCount(10) # Specify the number of images to return per offset\n", - " .setImageType(\"photo\") # Specify a filter to ensure we get photos\n", - " .setOutputCol(\"images\")\n", - " )\n", - " .mlTransform(BingImageSearch.getUrlTransformer(\"images\", \"urls\"))\n", - " .withColumn(\"labels\", lit(name))\n", - " .limit(400)\n", - " )" - ], - "outputs": [], - "metadata": { - "collapsed": true - } - }, - { - "cell_type": "markdown", - "source": [ - "" - ], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": null, - "source": [ - "def displayDF(df, n=5, image_cols=set([\"urls\"])):\n", - " rows = df.take(n)\n", - " cols = df.columns\n", - " header = \"\".join([\"\" + c + \"\" for c in cols])\n", - "\n", - " style = \"\"\"\n", - "\n", - "\n", - "\n", - "\n", - "\"\"\"\n", + "snow_leopard_urls = [\n", + " \"https://mmlspark.blob.core.windows.net/graphics/SnowLeopardAD/snow_leopard1.jpg\",\n", + " \"https://upload.wikimedia.org/wikipedia/commons/thumb/a/a5/Irbis4.JPG/1200px-Irbis4.JPG\",\n", + " \"https://upload.wikimedia.org/wikipedia/commons/a/a5/Snow_Leopard_in_Hemis_National_Park.jpg\"\n", + "]\n", "\n", - " table = []\n", - " for row in rows:\n", - " table.append(\"\")\n", - " for col in cols:\n", - " if col in image_cols:\n", - " rep = ''.format(row[col])\n", - " else:\n", - " rep = row[col]\n", - " table.append(\"{}\".format(rep))\n", - " table.append(\"\")\n", - " tableHTML = \"\".join(table)\n", + "snowLeopardUrls = spark.createDataFrame([(url,) for url in snow_leopard_urls], [\"urls\"]).withColumn(\"labels\", lit(\"snow leopard\"))\n", "\n", - " body = \"\"\"\n", - "\n", - "\n", - " \n", - " {} \n", - " \n", - " {}\n", - "
\n", - "\n", - "\n", - " \"\"\".format(\n", - " header, tableHTML\n", - " )\n", - " try:\n", - " if running_on_databricks():\n", - " displayHTML(style + body)\n", - " else:\n", - " import IPython\n", + "random_urls = [\n", + " \"https://mmlspark.blob.core.windows.net/datasets/OCR/test1.jpg\",\n", + " \"https://mmlspark.blob.core.windows.net/datasets/OCR/test2.png\",\n", + " \"https://mmlspark.blob.core.windows.net/datasets/OCR/test3.png\"\n", + "]\n", "\n", - " IPython.display.HTML(style + body)\n", - " except:\n", - " pass" - ], - "outputs": [], - "metadata": { - "collapsed": true - } - }, - { - "cell_type": "code", - "execution_count": null, - "source": [ - "snowLeopardQueries = [\"snow leopard\"]\n", - "snowLeopardUrls = bingPhotoSearch(\"snow leopard\", snowLeopardQueries, pages=100)\n", - "displayDF(snowLeopardUrls)" - ], - "outputs": [], - "metadata": { - "collapsed": true - } - }, - { - "cell_type": "code", - "execution_count": null, - "source": [ - "randomWords = spark.read.parquet(\n", - " \"wasbs://publicwasb@mmlspark.blob.core.windows.net/random_words.parquet\"\n", - ").cache()\n", - "randomWords.show()" - ], - "outputs": [], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": null, - "source": [ - "randomLinks = (\n", - " randomWords.mlTransform(\n", - " BingImageSearch()\n", - " .setSubscriptionKey(bing_search_key)\n", - " .setCount(10)\n", - " .setQueryCol(\"words\")\n", - " .setOutputCol(\"images\")\n", - " )\n", - " .mlTransform(BingImageSearch.getUrlTransformer(\"images\", \"urls\"))\n", - " .withColumn(\"label\", lit(\"other\"))\n", - " .limit(400)\n", - ")\n", + "randomLinks = spark.createDataFrame([(url,) for url in random_urls], [\"urls\"]).withColumn(\"labels\", lit(\"other\"))\n", "\n", - "displayDF(randomLinks)" - ], - "outputs": [], - "metadata": { - "collapsed": true - } - }, - { - "cell_type": "code", - "execution_count": null, - "source": [ "images = (\n", " snowLeopardUrls.union(randomLinks)\n", " .distinct()\n", " .repartition(100)\n", - " .mlTransform(\n", - " BingImageSearch.downloadFromUrls(\"urls\", \"image\", concurrency=5, timeout=5000)\n", - " )\n", + " .withColumn(\"image\", download_bytes_udf(\"urls\"))\n", " .dropna()\n", ")\n", "\n", "train, test = images.randomSplit([0.7, 0.3], seed=1)" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "code", @@ -375,4 +222,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} +} \ No newline at end of file diff --git a/docs/Quick Examples/transformers/cognitive/_BingImageSearch.md b/docs/Quick Examples/transformers/cognitive/_BingImageSearch.md deleted file mode 100644 index 5d6df01d072..00000000000 --- a/docs/Quick Examples/transformers/cognitive/_BingImageSearch.md +++ /dev/null @@ -1,95 +0,0 @@ -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import DocTable from "@theme/DocumentationTable"; - - -## Bing Image Search - -### BingImageSearch - - - - - - -```python -from synapse.ml.services import * - -bingSearchKey = os.environ.get("BING_SEARCH_KEY", getSecret("bing-search-key")) - -# Number of images Bing will return per query -imgsPerBatch = 10 -# A list of offsets, used to page into the search results -offsets = [(i*imgsPerBatch,) for i in range(100)] -# Since web content is our data, we create a dataframe with options on that data: offsets -bingParameters = spark.createDataFrame(offsets, ["offset"]) - -# Run the Bing Image Search service with our text query -bingSearch = (BingImageSearch() - .setSubscriptionKey(bingSearchKey) - .setOffsetCol("offset") - .setQuery("Martin Luther King Jr. quotes") - .setCount(imgsPerBatch) - .setOutputCol("images")) - -# Transformer that extracts and flattens the richly structured output of Bing Image Search into a simple URL column -getUrls = BingImageSearch.getUrlTransformer("images", "url") - -# This displays the full results returned -bingSearch.transform(bingParameters).show() - -# Since we have two services, they are put into a pipeline -pipeline = PipelineModel(stages=[bingSearch, getUrls]) - -# Show the results of your search: image URLs -pipeline.transform(bingParameters).show() - -``` - - - - -```scala -import com.microsoft.azure.synapse.ml.services.bing.BingImageSearch -import spark.implicits._ - -val bingSearchKey = sys.env.getOrElse("BING_SEARCH_KEY", None) - -// Number of images Bing will return per query -val imgsPerBatch = 10 -// A list of offsets, used to page into the search results -val offsets = (0 until 100).map(i => i * imgsPerBatch) -// Since web content is our data, we create a dataframe with options on that data: offsets -val bingParameters = Seq(offsets).toDF("offset") - -// Run the Bing Image Search service with our text query -val bingSearch = (new BingImageSearch() - .setSubscriptionKey(bingSearchKey) - .setOffsetCol("offset") - .setQuery("Martin Luther King Jr. quotes") - .setCount(imgsPerBatch) - .setOutputCol("images")) - -// Transformer that extracts and flattens the richly structured output of Bing Image Search into a simple URL column -val getUrls = BingImageSearch.getUrlTransformer("images", "url") - -// This displays the full results returned -bingSearch.transform(bingParameters).show() - -// Show the results of your search: image URLs -getUrls.transform(bingSearch.transform(bingParameters)).show() -``` - - - - - diff --git a/docs/Quick Examples/transformers/transformers_cognitive.md b/docs/Quick Examples/transformers/transformers_cognitive.md index 0cd76ed4e96..c6ae5d3b7ca 100644 --- a/docs/Quick Examples/transformers/transformers_cognitive.md +++ b/docs/Quick Examples/transformers/transformers_cognitive.md @@ -45,11 +45,6 @@ import AzureSearch, {toc as AzureSearchTOC} from './cognitive/_AzureSearch.md'; -import BingImageSearch, {toc as BingImageSearchTOC} from './cognitive/_BingImageSearch.md'; - - - - export const toc = [...TextAnalyticsTOC, ...TranslatorTOC, ...ComputerVisionTOC, ...FormRecognizerTOC, ...AnomalyDetectionTOC, ...FaceTOC, ...SpeechToTextTOC, -...AzureSearchTOC, ...BingImageSearchTOC] +...AzureSearchTOC]