diff --git a/Document-Processing-toc.html b/Document-Processing-toc.html index 597f4b0e44..069bf05570 100644 --- a/Document-Processing-toc.html +++ b/Document-Processing-toc.html @@ -172,53 +172,66 @@
  • Data Extraction
  • diff --git a/Document-Processing/Data-Extraction/NET/Assemblies-required-data-extraction.md b/Document-Processing/Data-Extraction/NET/Assemblies-required.md similarity index 71% rename from Document-Processing/Data-Extraction/NET/Assemblies-required-data-extraction.md rename to Document-Processing/Data-Extraction/NET/Assemblies-required.md index 42c6442151..20da70fe17 100644 --- a/Document-Processing/Data-Extraction/NET/Assemblies-required-data-extraction.md +++ b/Document-Processing/Data-Extraction/NET/Assemblies-required.md @@ -1,6 +1,6 @@ --- -title: Assemblies required for DataExtraction | Syncfusion -description: This section details the Syncfusion assemblies required to configure and run Data Extraction seamlessly in .NET projects. +title: Assemblies required for Data Extraction | Syncfusion +description: This section details the Syncfusion assemblies required to configure and run Data Extraction seamlessly in .NET projects. platform: document-processing control: DataExtraction documentation: UG @@ -11,6 +11,7 @@ keywords: Assemblies ## Assemblies for Smart Data Extractor The following assemblies need to be referenced in your application based on the platform. + @@ -22,10 +23,10 @@ The following assemblies need to be referenced in your application based on the - +
    {{'WPF'| markdownify }}, - {{'Windows Forms'| markdownify }} and {{'ASP.NET MVC'|  - markdownify }} + {{'Windows Forms'| markdownify }} and {{'ASP.NET MVC'| markdownify }} + Syncfusion.SmartDataExtractor.Base
    Syncfusion.Compression.Base
    Syncfusion.ImagePreProcessor.Base
    Syncfusion.OCRProcessor.Base
    @@ -42,6 +43,7 @@ The following assemblies need to be referenced in your application based on the and {{'.NET Platforms'| markdownify }}
    + Syncfusion.SmartDataExtractor.Portable
    Syncfusion.Compression.Portable
    Syncfusion.ImagePreProcessor.Portable
    Syncfusion.OCRProcessor.Portable
    @@ -58,6 +60,7 @@ The following assemblies need to be referenced in your application based on the {{'.NET Multi-platform App UI (.NET MAUI)'| markdownify }}
    + Syncfusion.SmartDataExtractor.NET
    Syncfusion.Compression.NET
    Syncfusion.ImagePreProcessor.NET
    Syncfusion.OCRProcessor.NET
    @@ -75,6 +78,7 @@ The following assemblies need to be referenced in your application based on the ## Assemblies for Smart Table Extractor The following assemblies need to be referenced in your application based on the platform. + @@ -90,6 +94,7 @@ The following assemblies need to be referenced in your application based on the markdownify }}
    + Syncfusion.SmartTableExtractor.Base
    Syncfusion.Compression.Base
    Syncfusion.ImagePreProcessor.Base
    Syncfusion.OCRProcessor.Base
    @@ -104,6 +109,7 @@ The following assemblies need to be referenced in your application based on the and {{'.NET Platforms'| markdownify }}
    + Syncfusion.SmartTableExtractor.Portable
    Syncfusion.Compression.Portable
    Syncfusion.ImagePreProcessor.Portable
    Syncfusion.OCRProcessor.Portable
    @@ -118,6 +124,7 @@ The following assemblies need to be referenced in your application based on the {{'.NET Multi-platform App UI (.NET MAUI)'| markdownify }}
    + Syncfusion.SmartTableExtractor.NET
    Syncfusion.Compression.NET
    Syncfusion.ImagePreProcessor.NET
    Syncfusion.OCRProcessor.NET
    @@ -138,48 +145,45 @@ The following assemblies need to be referenced in your application based on the - - - + + - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + +
    Platform(s)AssemblyDependent AssembliesPlatform(s)Assembly
    Windows Forms, WPF and ASP.NET MVC5Syncfusion.SmartFormRecognizer.Base -
      -
    • Syncfusion.Pdf.Base
    • -
    • Syncfusion.Compression.Base
    • -
    • Syncfusion.PdfToImageConverter.Base
    • -
    -
    Blazor, .NET Core and .NET PlatformsSyncfusion.SmartFormRecognizer.Portable -
      -
    • Syncfusion.Pdf.Portable
    • -
    • Syncfusion.Compression.Portable
    • -
    • Syncfusion.PdfToImageConverter.Portable
    • -
    -
    Syncfusion.SmartFormRecognizer.NET -
      -
    • Syncfusion.Pdf.NET
    • -
    • Syncfusion.Compression.NET
    • -
    • Syncfusion.PdfToImageConverter.NET
    • -
    -
    + {{'WPF'| markdownify }}, + {{'Windows Forms'| markdownify }} and {{'ASP.NET MVC'| markdownify }} + + Syncfusion.SmartFormRecognizer.Base
    + Syncfusion.Compression.Base
    + Syncfusion.Pdf.Base
    + Syncfusion.PdfToImageConverter.Base
    +
    + {{'.NET Core'| markdownify }} and {{'.NET Platforms'| markdownify }} + + Syncfusion.SmartFormRecognizer.Portable
    + Syncfusion.Compression.Portable
    + Syncfusion.Pdf.Portable
    + Syncfusion.PdfToImageConverter.Portable
    +
    + {{'.NET Multi-platform App UI (.NET MAUI)'| markdownify }} + + Syncfusion.SmartFormRecognizer.NET
    + Syncfusion.Compression.NET
    + Syncfusion.Pdf.NET
    + Syncfusion.PdfToImageConverter.NET
    +
    diff --git a/Document-Processing/Data-Extraction/NET/Extract-Data-in-ASP-NET-Core.md b/Document-Processing/Data-Extraction/NET/Extract-Data-in-ASP-NET-Core.md index 77742f2197..56b7e10b20 100644 --- a/Document-Processing/Data-Extraction/NET/Extract-Data-in-ASP-NET-Core.md +++ b/Document-Processing/Data-Extraction/NET/Extract-Data-in-ASP-NET-Core.md @@ -1,6 +1,6 @@ --- title: Extract data in ASP.NET Core | Syncfusion -description: Learn how to extract data from pdf in ASP.NET Core with easy steps using Syncfusion .NET Core Data extraction library. +description: Learn how to extract data from PDF in ASP.NET Core with step‑by‑step guidance using Syncfusion .NET Core Data extraction library. platform: document-processing control: SmartDataExtractor documentation: UG @@ -10,18 +10,181 @@ documentation: UG The Syncfusion® Smart Data Extractor is a .NET library used to extract structured data and document elements from PDFs and images in ASP.NET Core applications. -To include the .NET Core Smart Data Extractor library into your ASP.NET Core application, please refer to the [NuGet Package Required](https://help.syncfusion.com/document-processing/data-extraction/smart-data-extractor/net/nuget-packages-required) or [Assemblies Required](https://help.syncfusion.com/document-processing/data-extraction/smart-data-extractor/net/assemblies-required) documentation. 
+To quickly get started with extracting structured data from PDF and image files in ASP.NET Core using the Syncfusion® Smart Data Extractor library, refer to this video tutorial: +{% youtube "https://www.youtube.com/watch?v=fxijc3BgDTY" %} +To include the .NET Core Smart Data Extractor library into your ASP.NET Core application, please refer to the [NuGet Packages Required](https://help.syncfusion.com/document-processing/data-extraction/smart-data-extractor/net/nuget-packages-required) or [Assemblies Required](https://help.syncfusion.com/document-processing/data-extraction/smart-data-extractor/net/assemblies-required) documentation. -## Steps to Extract Data form PDF in ASP.NET Core application + +## Steps to Extract Data from PDF in ASP.NET Core application {% tabcontents %} {% tabcontent Visual Studio %} -{% include_relative tabcontent-support/Extract-Data-in-ASP-NET-Core-Visual-Studio.md %} + +**Prerequisites**: + +* Install .NET SDK: Ensure that you have the .NET SDK installed on your system. You can download it from the [.NET Downloads page](https://dotnet.microsoft.com/en-us/download). +* Install Visual Studio: Download and install Visual Studio from the [official website](https://visualstudio.microsoft.com/downloads/). + + + +Step 1: Create a new C# ASP.NET Core Web Application project. + ![Create ASP.NET Core Web application in Visual Studio](GettingStarted_images/ASPNETCoreProjectCreation.png) + +Step 2: In configuration window, name your project and click Next. + ![Select Web Application pattern](GettingStarted_images/ASPNETCoreConfiguration.png) + ![Select version](GettingStarted_images/ASPNETCoreAdditionalInfo.png) + + +Step 3: Install the [Syncfusion.SmartDataExtractor.Net.Core](https://www.nuget.org/packages/Syncfusion.SmartDataExtractor.Net.Core/) package as reference to your ASP.NET Core applications from [NuGet.org](https://www.nuget.org/). 
+ ![Install SmartDataExtractor .NET Core NuGet package](GettingStarted_images/ASPNETCore_Nuget.png) + +Step 4: A default controller named HomeController.cs is added on creation of ASP.NET Core project. Include the following namespaces in that HomeController.cs file. + +{% highlight c# tabtitle="C#" %} + + using Syncfusion.SmartDataExtractor; + using System.Diagnostics; + using System.Text; + +{% endhighlight %} + +Step 5: A default action method named Index will be present in HomeController.cs. Right click on Index method and select Go To View where you will be directed to its associated view page Index.cshtml. Add a new button in the Index.cshtml as shown below. +{% highlight c# tabtitle="C#" %} + +@{ + Html.BeginForm("ExtractData", "Home", FormMethod.Get); + { +
    + +
    + } + Html.EndForm(); +} + +{% endhighlight %} + +Step 6: Add a new action method named `ExtractData` in HomeController.cs and include the following code example to extract data as JSON using the **DataExtractor** class. Then use the **ExtractDataAsJson** method of the DataExtractor object to process the input and export the results in JSON format. + +{% highlight c# tabtitle="C#" %} + +// Open the input PDF file as a stream. +using (FileStream stream = new FileStream(Path.GetFullPath("Input.pdf"), FileMode.Open, FileAccess.Read)) +{ + // Initialize the Smart Data Extractor. + DataExtractor extractor = new DataExtractor(); + // Extract form data as JSON. + string data = extractor.ExtractDataAsJson(stream); + // Convert JSON string into a MemoryStream for download. + MemoryStream outputStream = new MemoryStream(Encoding.UTF8.GetBytes(data)); + // Reset stream position. + outputStream.Position = 0; + // Return JSON file as download in browser. + FileStreamResult fileStreamResult = new FileStreamResult(outputStream, "application/json"); + fileStreamResult.FileDownloadName = "Output.json"; + return fileStreamResult; +} + +{% endhighlight %} + +Step 7: Build the project. + Click on Build > Build Solution or press Ctrl + Shift + B to build the project. + +Step 8: Run the project. + Click the Start button (green arrow) or press F5 to run the app. + {% endtabcontent %} {% tabcontent Visual Studio Code %} -{% include_relative tabcontent-support/Extract-Data-in-ASP-NET-Core-VS-Code.md %} + +**Prerequisites**: + +* Install .NET SDK: Ensure that you have the .NET SDK installed on your system. You can download it from the [.NET Downloads page](https://dotnet.microsoft.com/en-us/download). +* Install Visual Studio Code: Download and install Visual Studio Code from the [official website](https://code.visualstudio.com/download). 
+* Install C# Extension for VS Code: Open Visual Studio Code, go to the Extensions view (Ctrl+Shift+X), and search for 'C#'. Install the official [C# extension provided by Microsoft](https://marketplace.visualstudio.com/items?itemName=ms-dotnettools.csharp). + +Step 1: Open the terminal (Ctrl+` ) and run the following command to create a C# ASP.NET Core Web Application project. + +``` +dotnet new mvc -n ExtractDataASPNETCoreAPP +``` +Step 2: Replace **ExtractDataASPNETCoreAPP** with your desired project name. + +Step 3: Navigate to the project directory using the following command + +``` +cd ExtractDataASPNETCoreAPP +``` +Step 4: Use the following command in the terminal to add the [Syncfusion.SmartDataExtractor.Net.Core ](https://www.nuget.org/packages/Syncfusion.SmartDataExtractor.Net.Core) package to your project. + +``` +dotnet add package Syncfusion.SmartDataExtractor.Net.Core +``` + +Step 5: A default controller named HomeController.cs gets added on creation of ASP.NET Core project. Include the following namespaces in that HomeController.cs file. + +{% highlight c# tabtitle="C#" %} + + using Syncfusion.SmartDataExtractor; + using System.Diagnostics; + using System.Text; + +{% endhighlight %} + +Step 6: A default action method named Index will be present in HomeController.cs. Right-click on Index method and select Go To View where you will be directed to its associated view page Index.cshtml. Add a new button in the Index.cshtml as shown below. + +{% highlight c# tabtitle="C#" %} + +@{ + Html.BeginForm("ExtractData", "Home", FormMethod.Get); + { +
    + +
    + } + Html.EndForm(); +} + +{% endhighlight %} + +Step 7: Add a new action method named `ExtractData` in HomeController.cs and include the following code example to extract data as JSON using the **DataExtractor** class. Then use the **ExtractDataAsJson** method of the DataExtractor object to process the input and export the results in JSON format. + +{% highlight c# tabtitle="C#" %} + +// Open the input PDF file as a stream. +using (FileStream stream = new FileStream(Path.GetFullPath("Input.pdf"), FileMode.Open, FileAccess.Read)) +{ + // Initialize the Smart Data Extractor. + DataExtractor extractor = new DataExtractor(); + // Extract form data as JSON. + string data = extractor.ExtractDataAsJson(stream); + // Convert JSON string into a MemoryStream for download. + MemoryStream outputStream = new MemoryStream(Encoding.UTF8.GetBytes(data)); + // Reset stream position. + outputStream.Position = 0; + // Return JSON file as download in browser. + FileStreamResult fileStreamResult = new FileStreamResult(outputStream, "application/json"); + fileStreamResult.FileDownloadName = "Output.json"; + return fileStreamResult; +} + +{% endhighlight %} + +Step 8: Build the project. + +Run the following command in terminal to build the project. + +``` +dotnet build +``` + +Step 9: Run the project. + +Run the following command in terminal to run the project. + +``` +dotnet run +``` {% endtabcontent %} {% endtabcontents %} @@ -30,5 +193,3 @@ You can download a complete working sample from [GitHub](https://github.com/Sync By executing the program, you will get the PDF document as follows. ![ASP.Net Core output JSON document](GettingStarted_images/JSON_Output.png) - -Click [here](https://www.syncfusion.com/document-sdk/net-pdf-data-extraction) to explore the rich set of Syncfusion® Data Extraction library features. 
\ No newline at end of file diff --git a/Document-Processing/Data-Extraction/NET/Extract-Data-in-ASP-NET-MVC.md b/Document-Processing/Data-Extraction/NET/Extract-Data-in-ASP-NET-MVC.md index bdc267e22d..645f914fc5 100644 --- a/Document-Processing/Data-Extraction/NET/Extract-Data-in-ASP-NET-MVC.md +++ b/Document-Processing/Data-Extraction/NET/Extract-Data-in-ASP-NET-MVC.md @@ -1,6 +1,6 @@ --- title: Extract Data in ASP.NET MVC Application | Syncfusion -description: Learn how to extract data in an ASP.NET MVC application with easy steps using the Syncfusion Data Extraction library. +description: Learn how to extract data in an ASP.NET MVC application with step‑by‑step guidance using the Syncfusion Data Extraction library. platform: document-processing control: SmartDataExtractor documentation: UG @@ -8,7 +8,7 @@ keywords: Assemblies --- -# Extracting Data in ASP.NET MVC +# Extract Data in ASP.NET MVC The Syncfusion® Smart Data Extractor is a .NET library used to extract structured data and document elements from PDFs and images in ASP.NET MVC applications. @@ -17,11 +17,11 @@ The Syncfusion® Smart Data Extractor is a .NET library used to ex Step 1: Create a new C# ASP.NET Web Application (.NET Framework) project. ![Create ASP.NET MVC application](GettingStarted_images/MVC_ProjectCreation.png) -Step 2: In the project configuration windows, name your project and select Create. +Step 2: In the project configuration window, name your project and select Create. ![Configuration window1](GettingStarted_images/MVC_Data_ProjectCreation.png) ![Configuration window2](GettingStarted_images/CreateASPNetMVCProject_DataExtractor.png) -Step 3: Install [Syncfusion.SmartDataExtractor.AspNet.Mvc5](https://www.nuget.org/packages/Syncfusion.SmartDataExtractor.AspNet.Mvc5) NuGet package as reference to your .NET applications from [NuGet.org](https://www.nuget.org/). 
+Step 3: Install [Syncfusion.SmartDataExtractor.AspNet.Mvc5](https://www.nuget.org/packages/Syncfusion.SmartDataExtractor.AspNet.Mvc5) NuGet package as reference to your .NET application from [NuGet.org](https://www.nuget.org/). ![NuGet package installation](GettingStarted_images/MVC_DataExtractorNuget.png) Step 4: Include the following namespaces in the HomeController.cs file. @@ -51,7 +51,7 @@ Step 5: Add a new button in the Index.cshtml as shown below. {% endhighlight %} -Step 6: Add a new action method named ExtractData in `HomeController.cs` and include the below code example to extract data from a PDF document using the **ExtractDataAsJson** method in the **DataExtractor** class. +Step 6: Add a new action method named ExtractData in `HomeController.cs` and include the following code example to extract data from a PDF document using the **ExtractDataAsJson** method in the **DataExtractor** class. {% highlight c# tabtitle="C#" %} @@ -76,9 +76,9 @@ using (FileStream stream = new FileStream(inputPath, FileMode.Open, FileAccess.R {% endhighlight %} By executing the program, you will get the PDF document as follows. -![HTML to PDF output document](GettingStarted_images/JSON_Output.png) +![ASP.NET MVC output JSON document](GettingStarted_images/JSON_Output.png) -A complete working sample can be downloaded from [Github](https://github.com/SyncfusionExamples/PDF-Examples/tree/master/Data-Extraction/Getting-Started/ASP.NETMVC/Extract_Data). +A complete working sample can be downloaded from [GitHub](https://github.com/SyncfusionExamples/PDF-Examples/tree/master/Data-Extraction/Getting-Started/ASP.NETMVC/Extract_Data). Click [here](https://www.syncfusion.com/document-sdk/net-pdf-data-extraction) to explore the rich set of Syncfusion® Data Extraction library features. 
\ No newline at end of file diff --git a/Document-Processing/Data-Extraction/NET/Extract-Data-in-Blazor.md b/Document-Processing/Data-Extraction/NET/Extract-Data-in-Blazor.md index 7089fb1eb4..eee106fa53 100644 --- a/Document-Processing/Data-Extraction/NET/Extract-Data-in-Blazor.md +++ b/Document-Processing/Data-Extraction/NET/Extract-Data-in-Blazor.md @@ -21,10 +21,10 @@ The Syncfusion® Smart Data Extractor is a .NET library used to ex **Prerequisites**: * Install .NET SDK: Ensure that you have the .NET SDK installed on your system. You can download it from the [.NET Downloads page](https://dotnet.microsoft.com/en-us/download). -* Install Visual Studio: Download and install Visual Studio Code from the [official website](https://code.visualstudio.com/download). +* Install Visual Studio: Download and install Visual Studio from the [official website](https://visualstudio.microsoft.com/downloads/). -Step 1: Create a new C# Blazor Web app project. +Step 1: Create a new C# Blazor Web App project. * Select "Blazor Web App" from the template and click **Next**. ![Create Blazor Web App application in Visual Studio](GettingStarted_images/Blazor_ProjectCreation.png) @@ -45,7 +45,7 @@ To **Extract Data from PDF in a Blazor Web App Server**, install [Syncfusion.Sma Step 3: Create a Razor file named `Home.razor` in the `Pages` folder, which is located inside the `Components` folder. -Include the following namespaces in the file: +Add the following directives and service injections in the file: {% tabs %} {% highlight c# tabtitle="C#" %} @@ -180,7 +180,7 @@ public static class FileUtils Step 10: Add JavaScript function to `App.razor`. -Add the following JavaScript function in the `App.razor` file located in the `Pages` folder. +Add the following JavaScript function in the `App.razor` file located in the `Components` folder. 
{% tabs %} @@ -241,7 +241,7 @@ Click the Start button (green arrow) or press F5 to run the applicati Upon executing the program, the **JSON document** will be generated as follows. -![Data Extraction in Blazor Web App Server](GettingStarted_images/JSON_Output.png) +![Blazor Web App Server output JSON document](GettingStarted_images/JSON_Output.png) {% endtabcontent %} @@ -254,7 +254,7 @@ Upon executing the program, the **JSON document** will be generated as follows. * Open Visual Studio Code and install the [C# for Visual Studio Code extension](https://marketplace.visualstudio.com/items?itemName=ms-dotnettools.csharp) from the Extensions Marketplace. -Step 1: Create a new C# Blazor Web app project. +Step 1: Create a new C# Blazor Web App project. * Open the command palette by pressing Ctrl+Shift+P and type **.NET:New Project** and enter. * Choose the **Blazor Web App** template. @@ -275,7 +275,7 @@ dotnet add package Syncfusion.SmartDataExtractor.NET.Core Step 3: Create a Razor file named `Home.razor` in the `Pages` folder, which is located inside the `Components` folder. -Include the following namespaces in the file: +Add the following directives and service injections in the file: {% tabs %} {% highlight c# tabtitle="C#" %} @@ -415,7 +415,7 @@ public static class FileUtils Step 10: Add JavaScript function to `App.razor`. -Add the following JavaScript function in the `App.razor` file located in the `Pages` folder. +Add the following JavaScript function in the `App.razor` file located in the `Components` folder. {% tabs %} @@ -484,7 +484,7 @@ dotnet run Upon executing the program, the **JSON document** will be generated as follows. 
-![Data Extraction in Blazor Web App Server](GettingStarted_images/JSON_Output.png) +![Blazor Web App Server output JSON document](GettingStarted_images/JSON_Output.png) {% endtabcontent %} diff --git a/Document-Processing/Data-Extraction/NET/Extract-Data-in-Console.md b/Document-Processing/Data-Extraction/NET/Extract-Data-in-Console.md index 31bfabd3f2..cac2eb2884 100644 --- a/Document-Processing/Data-Extraction/NET/Extract-Data-in-Console.md +++ b/Document-Processing/Data-Extraction/NET/Extract-Data-in-Console.md @@ -14,11 +14,132 @@ The Syncfusion® Smart Data Extractor is a .NET library used to ex {% tabcontents %} {% tabcontent Visual Studio %} -{% include_relative tabcontent-support/Extract-Data-in-Console-Visual-Studio.md %} + +**Prerequisites**: + +* Install .NET SDK: Ensure that you have the .NET SDK installed on your system. You can download it from the [.NET Downloads page](https://dotnet.microsoft.com/en-us/download). +* Install Visual Studio: Download and install Visual Studio from the [official website](https://visualstudio.microsoft.com/downloads/). + +Step 1: Create a new C# Console Application project. +![Console sample creation](GettingStarted_images/ConsoleCreation.png) + +Step 2: Name the project. +![Name the application](GettingStarted_images/ConsoleName.png) + +Step 3: Install the [Syncfusion.SmartDataExtractor.Net.Core](https://www.nuget.org/packages/Syncfusion.SmartDataExtractor.Net.Core) NuGet package as reference to your .NET Standard applications from [NuGet.org](https://www.nuget.org). +![NET Core NuGet package](GettingStarted_images/ConsoleCoreNuget.png) + +Step 4: Include the following namespaces in the *Program.cs* file. + +{% highlight c# tabtitle="C#" %} + +using System.IO; +using Syncfusion.Pdf.Parsing; +using Syncfusion.SmartDataExtractor; + +{% endhighlight %} + +Step 5: Include the following code snippet in *Program.cs* to extract data from a PDF file. 
+ +{% tabs %} + +{% highlight c# tabtitle="C# [Cross-platform]" %} + +//Open the input PDF file as a stream. +using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess.Read)) +{ + //Initialize the Smart Data Extractor. + DataExtractor extractor = new DataExtractor(); + //Extract data as JSON. + string data = extractor.ExtractDataAsJson(stream); + //Save the extracted JSON data into an output file. + File.WriteAllText("Output.json", data, Encoding.UTF8); +} + +{% endhighlight %} + +{% endtabs %} + +Step 6: Build the project. + +Click on Build > Build Solution or press Ctrl + Shift + B to build the project. + +Step 7: Run the project. + +Click the Start button (green arrow) or press F5 to run the app. + {% endtabcontent %} {% tabcontent Visual Studio Code %} -{% include_relative tabcontent-support/Extract-Data-in-Console-VS-Code.md %} +**Prerequisites**: + +* Install .NET SDK: Ensure that you have the .NET SDK installed on your system. You can download it from the [.NET Downloads page](https://dotnet.microsoft.com/en-us/download). +* Install Visual Studio Code: Download and install Visual Studio Code from the [official website](https://code.visualstudio.com/download). +* Install C# Extension for VS Code: Open Visual Studio Code, go to the Extensions view (Ctrl+Shift+X), and search for 'C#'. Install the official [C# extension provided by Microsoft](https://marketplace.visualstudio.com/items?itemName=ms-dotnettools.csharp). + + + +Step 1: Open the terminal (Ctrl+` ) and run the following command to create a new .NET Core console application project. + +``` +dotnet new console -n ExtractDataConsoleApp +``` +Step 2: Replace **ExtractDataConsoleApp** with your desired project name. 
+ +Step 3: Navigate to the project directory using the following command + +``` +cd ExtractDataConsoleApp +``` +Step 4: Use the following command in the terminal to add the [Syncfusion.SmartDataExtractor.Net.Core](https://www.nuget.org/packages/Syncfusion.SmartDataExtractor.Net.Core) package to your project. + +``` +dotnet add package Syncfusion.SmartDataExtractor.Net.Core +``` + +Step 5: Include the following namespaces in the *Program.cs* file. + +{% highlight c# tabtitle="C#" %} + +using System.IO; +using Syncfusion.Pdf.Parsing; +using Syncfusion.SmartDataExtractor; + +{% endhighlight %} + +Step 6: Include the following code snippet in *Program.cs* to extract data from a PDF file. + +{% highlight c# tabtitle="C# [Cross-platform]" %} + +//Open the input PDF file as a stream. +using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess.Read)) +{ + //Initialize the Smart Data Extractor. + DataExtractor extractor = new DataExtractor(); + //Extract data as JSON. + string data = extractor.ExtractDataAsJson(stream); + //Save the extracted JSON data into an output file. + File.WriteAllText("Output.json", data, Encoding.UTF8); +} + +{% endhighlight %} + +Step 7: Build the project. + +Run the following command in terminal to build the project. + +``` +dotnet build +``` + +Step 8: Run the project. + +Run the following command in terminal to run the project. + +``` +dotnet run +``` + {% endtabcontent %} {% endtabcontents %} @@ -35,7 +156,7 @@ The following steps illustrates Extracting Data from PDF document in console app **Prerequisites**: * Install .NET SDK: Ensure that you have the .NET SDK installed on your system. You can download it from the [.NET Downloads page](https://dotnet.microsoft.com/en-us/download). -* Install Visual Studio: Download and install Visual Studio Code from the [official website](https://code.visualstudio.com/download). 
+* Install Visual Studio: Download and install Visual Studio from the [official website](https://visualstudio.microsoft.com/downloads/). **Steps to Extract Data from PDF using .NET Framework** @@ -86,6 +207,6 @@ Click the Start button (green arrow) or press F5 to run the app. You can download a complete working sample from [GitHub](https://github.com/SyncfusionExamples/PDF-Examples/tree/master/Data-Extraction/Getting-Started/Console/.NETFramework/Extract_Data). By executing the program, you will get the PDF document as follows. -![Console output PDF document](GettingStarted_images/JSON_Output.png) +![Console output JSON document](GettingStarted_images/JSON_Output.png) diff --git a/Document-Processing/Data-Extraction/NET/Extract-Data-in-MAUI.md b/Document-Processing/Data-Extraction/NET/Extract-Data-in-MAUI.md index ebf035bac7..846bebc344 100644 --- a/Document-Processing/Data-Extraction/NET/Extract-Data-in-MAUI.md +++ b/Document-Processing/Data-Extraction/NET/Extract-Data-in-MAUI.md @@ -8,9 +8,9 @@ keywords: Assemblies --- -# Extract Data from PDF in MAUI Application +# Extract Data from PDF in .NET MAUI Application -The Syncfusion® Smart Data Extractor is a .NET library used to extract structured data and document elements from PDFs and images in NET MAUI applications. +The Syncfusion® Smart Data Extractor is a .NET library used to extract structured data and document elements from PDFs and images in .NET MAUI applications. ## Steps to Extract Data from PDF in .NET MAUI @@ -38,7 +38,7 @@ Step 3: Install the [Syncfusion.SmartDataExtractor.NET](https://www.nuget.org/pa N> If you reference Syncfusion® assemblies from trial setup or from the NuGet feed, you also have to add "Syncfusion.Licensing" assembly reference and include a license key in your projects. Please refer to this [link](https://help.syncfusion.com/common/essential-studio/licensing/overview) to know about registering a Syncfusion® license key in your application to use our components. 
-Step 3: Add a new button to the **MainPage.xaml** as shown below. +Step 4: Add a new button to the **MainPage.xaml** as shown below. {% tabs %} @@ -72,7 +72,7 @@ Step 3: Add a new button to the **MainPage.xaml** as shown below. {% endtabs %} -Step 4: Include the following namespaces in the **MainPage.xaml.cs** file. +Step 5: Include the following namespaces in the **MainPage.xaml.cs** file. {% tabs %} @@ -85,7 +85,7 @@ using Syncfusion.SmartDataExtractor; {% endtabs %} -Step 5: Add a new action method **OnExtractDataClicked** in MainPage.xaml.cs and include the below code snippet to **Extract Data from PDF**. +Step 6: Add a new action method **OnExtractDataClicked** in MainPage.xaml.cs and include the following code snippet to **Extract Data from PDF**. {% tabs %} @@ -108,14 +108,14 @@ await DisplayAlert("Success", $"Extracted data saved to {outputPath}", "OK"); {% endtabs %} -Step 6: Run the Application. +Step 7: Run the Application. 1. Select the target framework, device or emulator. 2. Press F5 to run the application. -By executing the program, you will get the **JSON File** as follows. +By executing the program, you will get the **JSON file** as follows. -![Extract Data in .NET MAUI](GettingStarted_images/JSON_Output.png) +![.NET MAUI output JSON document](GettingStarted_images/JSON_Output.png) Click [here](https://www.syncfusion.com/document-sdk/net-pdf-data-extraction) to explore the rich set of Syncfusion® Data Extraction library features. @@ -141,13 +141,13 @@ Step 1: Create a new C# .NET MAUI app project. * Select the project location, type the project name and press enter. * Then choose **Create project**. -Step 2: To **Extract Data from PDF Document in .NET MAUI app**, install [Syncfusion.SmartDataExtractor.NET.Core](https://www.nuget.org/packages/Syncfusion.SmartDataExtractor.NET.Core) to the MAUI project. 
+Step 2: To **Extract Data from PDF Document in .NET MAUI app**, install [Syncfusion.SmartDataExtractor.NET](https://www.nuget.org/packages/Syncfusion.SmartDataExtractor.NET) to the MAUI project. * Press Ctrl + ` (backtick) to open the integrated terminal in Visual Studio Code. * Ensure you're in the project root directory where your .csproj file is located. -* Run the command `dotnet add package Syncfusion.SmartDataExtractor.NET.Core` to install the NuGet package. +* Run the command `dotnet add package Syncfusion.SmartDataExtractor.NET` to install the NuGet package. ``` -dotnet add package Syncfusion.SmartDataExtractor.NET.Core +dotnet add package Syncfusion.SmartDataExtractor.NET ``` Step 3: Add a new button to the **MainPage.xaml** as shown below. @@ -195,7 +195,7 @@ using Syncfusion.SmartDataExtractor; {% endtabs %} -Step 5: Add a new action method **OnExtractDataClicked** in MainPage.xaml.cs and include the below code snippet to **Extract Data from PDF**. +Step 5: Add a new action method **OnExtractDataClicked** in MainPage.xaml.cs and include the following code snippet to **Extract Data from PDF**. {% tabs %} @@ -224,9 +224,9 @@ Step 6: Run the Application. 1. Select the target framework, device or emulator. 2. Press F5 to run the application. -By executing the program, you will get the **JSON File** as follows. +By executing the program, you will get the **JSON file** as follows. 
-![Extract Data in .NET MAUI](GettingStarted_images/JSON_Output.png) +![.NET MAUI output JSON document](GettingStarted_images/JSON_Output.png) {% endtabcontent %} diff --git a/Document-Processing/Data-Extraction/NET/Extract-Data-in-WPF.md b/Document-Processing/Data-Extraction/NET/Extract-Data-in-WPF.md index 9d5e2bed83..27c9316c41 100644 --- a/Document-Processing/Data-Extraction/NET/Extract-Data-in-WPF.md +++ b/Document-Processing/Data-Extraction/NET/Extract-Data-in-WPF.md @@ -1,8 +1,8 @@ --- title: Extract Data in WPF Application | Syncfusion -description: Learn how to extract data in a WPF application with easy steps using the Syncfusion Smart Data Extractor library. +description: Learn how to extract data in a WPF application with step‑by‑step guidance using the Syncfusion Smart Data Extractor library. platform: document-processing -control: Smart Data Extractor +control: SmartDataExtractor documentation: UG keywords: Assemblies @@ -17,7 +17,7 @@ The Syncfusion® Smart Data Extractor is a .NET library used to ex Step 1: Create a new WPF application project. ![Create WPF sample](GettingStarted_images/WPF_ProjectCreation.png) -In project configuration window, name your project and select Create. +In the project configuration window, name your project and select Create. ![WPF configuration window](GettingStarted_images/WPF_Data_ProjectName.png) Step 2: Install the [Syncfusion.SmartDataExtractor.WPF](https://www.nuget.org/packages/Syncfusion.SmartDataExtractor.WPF) NuGet package as a reference to your WPF application [NuGet.org](https://www.nuget.org/). @@ -36,7 +36,7 @@ using System.Windows; {% endhighlight %} -Step 4: Add a new button in MainWindow.xaml to Extract data from PDF document as follows. +Step 4: Add a new button in MainWindow.xaml to extract data from a PDF document as follows. 
{% highlight c# tabtitle="C#" %} @@ -50,7 +50,7 @@ Step 4: Add a new button in MainWindow.xaml to Extract data from PDF document as {% endhighlight %} -Step 5: Add the following code in `btnCreate_Click` to extract data from a PDF document using the **ExtractDataAsJson** method in the **DataExtractor** class. The extracted content will be saved as a JSON file +Step 5: Add the following code in `ExtractButton_Click` to extract data from a PDF document using the **ExtractDataAsJson** method in the **DataExtractor** class. The extracted content will be saved as a JSON file. {% highlight c# tabtitle="C#" %} @@ -67,9 +67,9 @@ using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess {% endhighlight %} -By executing the program, you will get the PDF document as follows. -![Convert HTMLToPDF WPF output](GettingStarted_images/JSON_Output.png) +By executing the program, you will get the JSON document as follows. +![WPF output JSON document](GettingStarted_images/JSON_Output.png) -A complete working sample can be downloaded from [Github](https://github.com/SyncfusionExamples/PDF-Examples/tree/master/Data-Extraction/Getting-Started/WPF/Extract_Data). +A complete working sample can be downloaded from [GitHub](https://github.com/SyncfusionExamples/PDF-Examples/tree/master/Data-Extraction/Getting-Started/WPF/Extract_Data). Click [here](https://www.syncfusion.com/document-sdk/net-pdf-data-extraction) to explore the rich set of Syncfusion®Data Extraction library features. 
\ No newline at end of file diff --git a/Document-Processing/Data-Extraction/NET/Extract-Data-in-Windows-Forms.md b/Document-Processing/Data-Extraction/NET/Extract-Data-in-Windows-Forms.md index e72b136033..52866531f2 100644 --- a/Document-Processing/Data-Extraction/NET/Extract-Data-in-Windows-Forms.md +++ b/Document-Processing/Data-Extraction/NET/Extract-Data-in-Windows-Forms.md @@ -49,7 +49,7 @@ using Syncfusion.SmartDataExtractor; {% endtabs %} -Step 5: Add a new button in **Form1.Designer.cs** to create Word file as follows. +Step 5: Add a new button in **Form1.Designer.cs** to extract data from PDF as follows. {% tabs %} @@ -85,7 +85,7 @@ Text = "Extract Data from PDF"; {% endtabs %} -Step 6: Add the following code in **btnExtract_Click** to **Extract data PDF** with simple text. +Step 6: Add the following code in **btnExtract_Click** to extract data from PDF. {% tabs %} @@ -116,7 +116,7 @@ Click the Start button (green arrow) or press F5 to run the app. By executing the program, you will get the **JSON** as follows. 
-![Word to PDF in Windows Forms](GettingStarted_images/JSON_Output.png) +![Windows Forms output JSON document](GettingStarted_images/JSON_Output.png) {% endtabcontent %} diff --git a/Document-Processing/Data-Extraction/NET/tabcontent-support/Images/ASPNETCoreAdditionalInfo.png b/Document-Processing/Data-Extraction/NET/GettingStarted_images/ASPNETCoreAdditionalInfo.png similarity index 100% rename from Document-Processing/Data-Extraction/NET/tabcontent-support/Images/ASPNETCoreAdditionalInfo.png rename to Document-Processing/Data-Extraction/NET/GettingStarted_images/ASPNETCoreAdditionalInfo.png diff --git a/Document-Processing/Data-Extraction/NET/tabcontent-support/Images/ASPNETCoreConfiguration.png b/Document-Processing/Data-Extraction/NET/GettingStarted_images/ASPNETCoreConfiguration.png similarity index 100% rename from Document-Processing/Data-Extraction/NET/tabcontent-support/Images/ASPNETCoreConfiguration.png rename to Document-Processing/Data-Extraction/NET/GettingStarted_images/ASPNETCoreConfiguration.png diff --git a/Document-Processing/Data-Extraction/NET/tabcontent-support/Images/ASPNETCoreProjectCreation.png b/Document-Processing/Data-Extraction/NET/GettingStarted_images/ASPNETCoreProjectCreation.png similarity index 100% rename from Document-Processing/Data-Extraction/NET/tabcontent-support/Images/ASPNETCoreProjectCreation.png rename to Document-Processing/Data-Extraction/NET/GettingStarted_images/ASPNETCoreProjectCreation.png diff --git a/Document-Processing/Data-Extraction/NET/tabcontent-support/Images/ASPNETCore_Nuget.png b/Document-Processing/Data-Extraction/NET/GettingStarted_images/ASPNETCore_Nuget.png similarity index 100% rename from Document-Processing/Data-Extraction/NET/tabcontent-support/Images/ASPNETCore_Nuget.png rename to Document-Processing/Data-Extraction/NET/GettingStarted_images/ASPNETCore_Nuget.png diff --git a/Document-Processing/Data-Extraction/NET/tabcontent-support/Images/ConsoleCoreNuget.png 
b/Document-Processing/Data-Extraction/NET/GettingStarted_images/ConsoleCoreNuget.png similarity index 100% rename from Document-Processing/Data-Extraction/NET/tabcontent-support/Images/ConsoleCoreNuget.png rename to Document-Processing/Data-Extraction/NET/GettingStarted_images/ConsoleCoreNuget.png diff --git a/Document-Processing/Data-Extraction/NET/tabcontent-support/Images/ConsoleCreation.png b/Document-Processing/Data-Extraction/NET/GettingStarted_images/ConsoleCreation.png similarity index 100% rename from Document-Processing/Data-Extraction/NET/tabcontent-support/Images/ConsoleCreation.png rename to Document-Processing/Data-Extraction/NET/GettingStarted_images/ConsoleCreation.png diff --git a/Document-Processing/Data-Extraction/NET/tabcontent-support/Images/ConsoleName.png b/Document-Processing/Data-Extraction/NET/GettingStarted_images/ConsoleName.png similarity index 100% rename from Document-Processing/Data-Extraction/NET/tabcontent-support/Images/ConsoleName.png rename to Document-Processing/Data-Extraction/NET/GettingStarted_images/ConsoleName.png diff --git a/Document-Processing/Data-Extraction/NET/tabcontent-support/Images/ConsoleNuget.png b/Document-Processing/Data-Extraction/NET/GettingStarted_images/ConsoleNuget.png similarity index 100% rename from Document-Processing/Data-Extraction/NET/tabcontent-support/Images/ConsoleNuget.png rename to Document-Processing/Data-Extraction/NET/GettingStarted_images/ConsoleNuget.png diff --git a/Document-Processing/Data-Extraction/NET/tabcontent-support/Images/Framework-name.png b/Document-Processing/Data-Extraction/NET/GettingStarted_images/Framework-name.png similarity index 100% rename from Document-Processing/Data-Extraction/NET/tabcontent-support/Images/Framework-name.png rename to Document-Processing/Data-Extraction/NET/GettingStarted_images/Framework-name.png diff --git a/Document-Processing/Data-Extraction/NET/Nuget-packages-required-data-extraction.md 
b/Document-Processing/Data-Extraction/NET/Nuget-packages-required.md similarity index 91% rename from Document-Processing/Data-Extraction/NET/Nuget-packages-required-data-extraction.md rename to Document-Processing/Data-Extraction/NET/Nuget-packages-required.md index 2a3f94f5c4..3a4a72874f 100644 --- a/Document-Processing/Data-Extraction/NET/Nuget-packages-required-data-extraction.md +++ b/Document-Processing/Data-Extraction/NET/Nuget-packages-required.md @@ -122,9 +122,9 @@ Windows UI (WinUI)
    -## NuGet Packages Required for SmartFormRecognizer +## NuGet Packages Required for Smart Form Recognizer -To work with SmartFormRecognizer, the following NuGet packages need to be installed in your application. +To work with Smart Form Recognizer, the following NuGet packages need to be installed in your application from [nuget.org](https://www.nuget.org/). @@ -146,7 +146,7 @@ Windows Forms WPF @@ -170,4 +170,3 @@ ASP.NET MVC5
    -{{'[Syncfusion.SmartFormRecognizer.WPF.nupkg](https://www.nuget.org/packages/Syncfusion.SmartFormRecognizer.wpf)'| markdownify }} +{{'[Syncfusion.SmartFormRecognizer.Wpf.nupkg](https://www.nuget.org/packages/Syncfusion.SmartFormRecognizer.Wpf)'| markdownify }}
    -N> The above mentioned NuGet packages are available in [nuget.org](https://www.nuget.org/). diff --git a/Document-Processing/Data-Extraction/NET/conversions/image-to-json.md b/Document-Processing/Data-Extraction/NET/conversions/image-to-json.md deleted file mode 100644 index 6a41f6502f..0000000000 --- a/Document-Processing/Data-Extraction/NET/conversions/image-to-json.md +++ /dev/null @@ -1,67 +0,0 @@ ---- -title: Extract Image to JSON in C# | Smart Data Extractor | Syncfusion -description: Learn how to extract structured data from images as JSON in C# using the Syncfusion® Smart Data Extractor library for .NET applications. -platform: document-processing -control: SmartDataExtractor -documentation: UG -keywords: Data Extraction, Image to JSON, Smart Data Extractor, Assemblies, NuGet Packages ---- - -# Image to JSON Extraction - -JavaScript Object Notation (JSON) is a lightweight data‑interchange format that is easy for humans to read and write, and simple for machines to parse and generate. The Syncfusion® Smart Data Extractor library extracts structured information from images and outputs the content as JSON. It analyzes text blocks, tables, headers, and form fields to preserve structure, enabling developers to integrate image to JSON extraction into their applications. - -## Assemblies and NuGet packages required - -Refer to the following links for the assemblies and NuGet packages required on different platforms to extract data as a JSON file using the Smart Data Extractor library. 
- -* [Assemblies required for Image to JSON Extraction](https://help.syncfusion.com/document-processing/data-extraction/smart-data-extractor/net/assemblies-required) -* [NuGet packages required for Image to JSON Extraction](https://help.syncfusion.com/document-processing/data-extraction/smart-data-extractor/net/nuget-packages-required) - -## Extract Data as JSON from an Image - -To extract structured data from an image document using the **ExtractDataAsJson** method of the **DataExtractor** class, refer to the following code examples. - -{% tabs %} - -{% highlight c# tabtitle="C# [Cross-platform]" %} - -using System.IO; -using Syncfusion.SmartDataExtractor; -using System.Text; - -//Open the input image file as a stream. -using (FileStream stream = new FileStream("Image.png", FileMode.Open, FileAccess.Read)) -{ - //Initialize the Data Extractor. - DataExtractor extractor = new DataExtractor(); - //Extract data as JSON from the image stream. - string data = extractor.ExtractDataAsJson(stream); - //Save the extracted JSON data into an output file. - File.WriteAllText("Output.json", data, Encoding.UTF8); -} - -{% endhighlight %} - -{% highlight c# tabtitle="C# [Windows-specific]" %} - -using System.IO; -using Syncfusion.SmartDataExtractor; -using System.Text; - -//Open the input image file as a stream. -using (FileStream stream = new FileStream("Image.png", FileMode.Open, FileAccess.Read)) -{ - //Initialize the Data Extractor. - DataExtractor extractor = new DataExtractor(); - //Extract data as JSON from the image stream. - string data = extractor.ExtractDataAsJson(stream); - //Save the extracted JSON data into an output file. - File.WriteAllText("Output.json", data, Encoding.UTF8); -} - -{% endhighlight %} - -{% endtabs %} - -You can download a complete working sample from [GitHub](https://github.com/SyncfusionExamples/PDF-Examples/tree/master/Data-Extraction/Smart-Data-Extractor/Extract-data-as-JSON-from-an-Image/.NET). 
\ No newline at end of file diff --git a/Document-Processing/Data-Extraction/NET/conversions/image-to-md.md b/Document-Processing/Data-Extraction/NET/conversions/image-to-md.md deleted file mode 100644 index 4126ed3ce4..0000000000 --- a/Document-Processing/Data-Extraction/NET/conversions/image-to-md.md +++ /dev/null @@ -1,69 +0,0 @@ ---- -title: Extract Image to Markdown in C# | Smart Data Extractor | Syncfusion -description: Learn how to extract structured content from images as Markdown in C# using the Syncfusion® Smart Data Extractor library for .NET applications. -platform: document-processing -control: SmartDataExtractor -documentation: UG -keywords: Assemblies ---- - -# Image to Markdown Extraction - -Markdown is a lightweight markup language that adds formatting elements to plain text documents. The Syncfusion® Smart Data Extractor library extracts structured information from scanned images and outputs the content as Markdown (MD). It analyzes text blocks, tables, headers, and form fields to preserve layout and formatting, enabling developers to integrate image to Markdown extraction into their applications. - -## Assemblies and NuGet packages required - -Refer to the following links for the assemblies and NuGet packages required on different platforms to extract data as a Markdown file using the Smart Data Extractor library. 
- -* [Assemblies required for Image to Markdown Extraction](https://help.syncfusion.com/document-processing/data-extraction/smart-data-extractor/net/assemblies-required) -* [NuGet packages required for Image to Markdown Extraction](https://help.syncfusion.com/document-processing/data-extraction/smart-data-extractor/net/nuget-packages-required) - -## Extract Data as Markdown from Image - -To extract form fields across a PDF document using the **ExtractDataAsMarkdown** method of the **DataExtractor** class, refer to the following code example: - -{% tabs %} - -{% highlight c# tabtitle="C# [Cross-platform]" %} - -using System.IO; -using Syncfusion.SmartDataExtractor; -using System.Text; - -//Open the input image file as a stream. -using (FileStream stream = new FileStream("InputImage.jpg", FileMode.Open, FileAccess.Read)) -{ - //Initialize the Smart Data Extractor. - DataExtractor extractor = new DataExtractor(); - - //Extract data as Markdown. - string data = extractor.ExtractDataAsMarkdown(stream); - - //Save the extracted Markdown data into an output file. - File.WriteAllText("Output.md", data, Encoding.UTF8); -} - -{% endhighlight %} - -{% highlight c# tabtitle="C# [Windows-specific]" %} - -using System.IO; -using Syncfusion.SmartDataExtractor; -using System.Text; - -//Open the input image file as a stream. -using (FileStream stream = new FileStream("InputImage.jpg", FileMode.Open, FileAccess.Read)) -{ - //Initialize the Smart Data Extractor. - DataExtractor extractor = new DataExtractor(); - - //Extract data as Markdown. - string data = extractor.ExtractDataAsMarkdown(stream); - - //Save the extracted Markdown data into an output file. 
- File.WriteAllText("Output.md", data, Encoding.UTF8); -} - -{% endhighlight %} - -{% endtabs %} \ No newline at end of file diff --git a/Document-Processing/Data-Extraction/NET/conversions/pdf-to-json.md b/Document-Processing/Data-Extraction/NET/conversions/pdf-to-json.md index a37333a5e6..4c4711f37b 100644 --- a/Document-Processing/Data-Extraction/NET/conversions/pdf-to-json.md +++ b/Document-Processing/Data-Extraction/NET/conversions/pdf-to-json.md @@ -18,7 +18,7 @@ Refer to the following links for the assemblies and NuGet packages required on d * [Assemblies required for PDF to JSON Extraction](https://help.syncfusion.com/document-processing/data-extraction/smart-data-extractor/net/assemblies-required) * [NuGet packages required for PDF to JSON Extraction](https://help.syncfusion.com/document-processing/data-extraction/smart-data-extractor/net/nuget-packages-required) -## Extract Data as JSON from PDF Document +## Extract Data as JSON from PDF or Image To extract form fields across a PDF document using the **ExtractDataAsJson** method of the **DataExtractor** class, refer to the following code example: @@ -66,6 +66,8 @@ using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess {% endtabs %} +N> If you want to extract data from an image instead of a PDF, replace the input stream with the image file (for example, Input.jpg or Input.png). The rest of the code remains unchanged. + You can download a complete working sample from [GitHub](https://github.com/SyncfusionExamples/PDF-Examples/tree/master/Data-Extraction/Smart-Data-Extractor/Extract-data-as-JSON-from-PDF/.NET). 
diff --git a/Document-Processing/Data-Extraction/NET/conversions/pdf-to-markdown.md b/Document-Processing/Data-Extraction/NET/conversions/pdf-to-markdown.md index 6408a08d19..a9157274f4 100644 --- a/Document-Processing/Data-Extraction/NET/conversions/pdf-to-markdown.md +++ b/Document-Processing/Data-Extraction/NET/conversions/pdf-to-markdown.md @@ -18,13 +18,13 @@ Refer to the following links for assemblies and NuGet packages required based on * [PDF to Markdown Extraction assemblies](https://help.syncfusion.com/document-processing/data-extraction/smart-data-extractor/net/assemblies-required) * [PDF to Markdown Extraction NuGet packages](https://help.syncfusion.com/document-processing/data-extraction/smart-data-extractor/net/nuget-packages-required) -## Extract Data as Markdown from PDF Document +## Extract Data as Markdown from PDF or Image To extract form fields across a PDF document using the **ExtractDataAsMarkdown** method of the **DataExtractor** class, refer to the following code example: {% tabs %} -{% highlight c# tabtitle="C# [Cross-platform]" %} +{% highlight c# tabtitle="C# [Cross-platform]" playgroundButtonLink="https://raw.githubusercontent.com/SyncfusionExamples/PDF-Examples/refs/heads/master/Data-Extraction/Smart-Data-Extractor/Extract-data-as-MD-from-PDF/.NET/Extract-data-as-MD-from-PDF/Program.cs" %} using System.IO; using Syncfusion.SmartDataExtractor; @@ -66,6 +66,8 @@ using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess {% endtabs %} +N> If you want to extract data from an image instead of a PDF, replace the input stream with the image file (for example, Input.jpg or Input.png). The rest of the code remains unchanged. + You can download a complete working sample from [GitHub](https://github.com/SyncfusionExamples/PDF-Examples/tree/master/Data-Extraction/Smart-Data-Extractor/Extract-data-as-MD-from-PDF/.NET). 
## Extract a specific page to Markdown diff --git a/Document-Processing/Data-Extraction/NET/extract-structured-data.md b/Document-Processing/Data-Extraction/NET/extract-structured-data.md index ed80244e6e..91b416bbd3 100644 --- a/Document-Processing/Data-Extraction/NET/extract-structured-data.md +++ b/Document-Processing/Data-Extraction/NET/extract-structured-data.md @@ -10,123 +10,119 @@ keywords: Assemblies # Working with Data Extraction -## Extract Data from PDF Document +Refer to the following links for the assemblies and NuGet packages required on different platforms to perform data extraction with the Smart Data Extractor library. -To extract structured data such as text, form fields, tables and images from an entire PDF document using the **ExtractDataAsPdfDocument** method of the **DataExtractor** class, refer to the following code example: +* [Assemblies required to perform data extraction](https://help.syncfusion.com/document-processing/data-extraction/smart-data-extractor/net/assemblies-required) +* [NuGet packages required to perform data extraction](https://help.syncfusion.com/document-processing/data-extraction/smart-data-extractor/net/nuget-packages-required) -{% tabs %} +## Extract Data as JSON from PDF or Image -{% highlight c# tabtitle="C# [Cross-platform]" %} +The **Smart Data Extractor** enables you to process PDF documents or scanned images and export the structured content as JSON. +This section covers two scenarios: +- Extracting data as JSON from a **PDF document**. +- Extracting data as JSON from an **image**. + +### Extract Data as JSON from PDF + +To extract form fields across a PDF document using the **ExtractDataAsJson** method of the **DataExtractor** class, refer to the following code example: + +{% tabs %} + +{% highlight c# tabtitle="C# [Cross-platform]" playgroundButtonLink="https://raw.githubusercontent.com/SyncfusionExamples/PDF-Examples/refs/heads/master/Data-Extraction/Smart-Data-Extractor/Extract-data-as-JSON-from-PDF/.NET/Extract-data-as-JSON-from-PDF-document/Program.cs" %} -using System.IO; -using Syncfusion.Pdf.Parsing; using Syncfusion.SmartDataExtractor; +using System.Text; //Open the input PDF file as a stream. 
-using (FileStream inputStream = new FileStream("Input.pdf", FileMode.Open, FileAccess.Read, FileShare.Read)) +using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess.Read)) { //Initialize the Smart Data Extractor. DataExtractor extractor = new DataExtractor(); - //Extract data and return as a loaded PDF document. - PdfLoadedDocument document = extractor.ExtractDataAsPdfDocument(inputStream); - //Save the extracted output as a new PDF file. - document.Save("Output.pdf"); - //Close the document to release resources. - document.Close(true); + //Extract data as JSON. + string data = extractor.ExtractDataAsJson(stream); + //Save the extracted JSON data into an output file. + File.WriteAllText("Output.json", data, Encoding.UTF8); } {% endhighlight %} {% highlight c# tabtitle="C# [Windows-specific]" %} -using System.IO; -using Syncfusion.Pdf.Parsing; using Syncfusion.SmartDataExtractor; +using System.Text; //Open the input PDF file as a stream. -using (FileStream inputStream = new FileStream("Input.pdf", FileMode.Open, FileAccess.Read, FileShare.Read)) +using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess.Read)) { //Initialize the Smart Data Extractor. - DataExtractor extractor = new DataExtractor(); - //Extract data and return as a loaded PDF document. - PdfLoadedDocument document = extractor.ExtractDataAsPdfDocument(inputStream); - //Save the extracted output as a new PDF file. - document.Save("Output.pdf"); - //Close the document to release resources. - document.Close(true); + DataExtractor extractor = new DataExtractor(); + //Extract data as JSON. + string data = extractor.ExtractDataAsJson(stream); + //Save the extracted JSON data into an output file. 
+ File.WriteAllText("Output.json", data, Encoding.UTF8); } - + {% endhighlight %} -{% endtabs %} +{% endtabs %} -You can download a complete working sample from [GitHub](https://github.com/SyncfusionExamples/PDF-Examples/tree/master/Data-Extraction/Smart-Data-Extractor/Extract-data-from-PDF-document/.NET). +You can download a complete working sample from [GitHub](https://github.com/SyncfusionExamples/PDF-Examples/tree/master/Data-Extraction/Smart-Data-Extractor/Extract-data-as-JSON-from-PDF/.NET). -## Extract Data as Stream +### Extract Data as JSON from an Image -To extract structured data from a PDF document and return the output as a stream using the **ExtractDataAsPdfStream** method of the **DataExtractor** class, refer to the following example. +To extract structured data from an image document using the **ExtractDataAsJson** method of the **DataExtractor** class, refer to the following code examples. {% tabs %} {% highlight c# tabtitle="C# [Cross-platform]" %} -using System.IO; using Syncfusion.SmartDataExtractor; +using System.Text; -//Open the input PDF file as a stream. -using (FileStream inputStream = new FileStream("Input.pdf", FileMode.Open, FileAccess.Read, FileShare.Read)) +//Open the input image file as a stream. +using (FileStream stream = new FileStream("Image.png", FileMode.Open, FileAccess.Read)) { - //Initialize the Smart Data Extractor. + //Initialize the Data Extractor. DataExtractor extractor = new DataExtractor(); - //Extract data and return as a PDF stream. - Stream pdfStream = extractor.ExtractDataAsPdfStream(inputStream); - - //Save the extracted PDF stream into an output file. - using (FileStream outputStream = new FileStream("Output.pdf", FileMode.Create, FileAccess.Write)) - { - pdfStream.CopyTo(outputStream); - } + //Extract data as JSON from the image stream. + string data = extractor.ExtractDataAsJson(stream); + //Save the extracted JSON data into an output file. 
+ File.WriteAllText("Output.json", data, Encoding.UTF8); } - + {% endhighlight %} {% highlight c# tabtitle="C# [Windows-specific]" %} -using System.IO; using Syncfusion.SmartDataExtractor; +using System.Text; -//Open the input PDF file as a stream. -using (FileStream inputStream = new FileStream("Input.pdf", FileMode.Open, FileAccess.Read, FileShare.Read)) +//Open the input image file as a stream. +using (FileStream stream = new FileStream("Image.png", FileMode.Open, FileAccess.Read)) { - //Initialize the Smart Data Extractor. + //Initialize the Data Extractor. DataExtractor extractor = new DataExtractor(); - //Extract data and return as a PDF stream. - Stream pdfStream = extractor.ExtractDataAsPdfStream(inputStream); - - //Save the extracted PDF stream into an output file. - using (FileStream outputStream = new FileStream("Output.pdf", FileMode.Create, FileAccess.Write)) - { - pdfStream.CopyTo(outputStream); - } + //Extract data as JSON from the image stream. + string data = extractor.ExtractDataAsJson(stream); + //Save the extracted JSON data into an output file. + File.WriteAllText("Output.json", data, Encoding.UTF8); } - + {% endhighlight %} -{% endtabs %} +{% endtabs %} -You can download a complete working sample from [GitHub](https://github.com/SyncfusionExamples/PDF-Examples/tree/master/Data-Extraction/Smart-Data-Extractor/Extract-data-as-stream/.NET). +You can download a complete working sample from [GitHub](https://github.com/SyncfusionExamples/PDF-Examples/tree/master/Data-Extraction/Smart-Data-Extractor/Extract-data-as-JSON-from-an-Image/.NET). 
-## Extract Data as JSON from PDF Document +## Extract Data as Markdown from PDF or Image -To extract form fields across a PDF document using the **ExtractDataAsJson** method of the **DataExtractor** class, refer to the following code example: +To extract form fields across a PDF document using the **ExtractDataAsMarkdown** method of the **DataExtractor** class, refer to the following code example: {% tabs %} {% highlight c# tabtitle="C# [Cross-platform]" %} -using System.IO; using Syncfusion.SmartDataExtractor; -using Syncfusion.SmartFormRecognizer; using System.Text; //Open the input PDF file as a stream. @@ -134,148 +130,162 @@ using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess { //Initialize the Smart Data Extractor. DataExtractor extractor = new DataExtractor(); - //Extract data as JSON. - string data = extractor.ExtractDataAsJson(stream); - //Save the extracted JSON data into an output file. - File.WriteAllText("Output.json", data, Encoding.UTF8); + //Extract data as Markdown. + string data = extractor.ExtractDataAsMarkdown(stream); + //Save the extracted Markdown data into an output file. + File.WriteAllText("Output.md", data, Encoding.UTF8); } {% endhighlight %} {% highlight c# tabtitle="C# [Windows-specific]" %} -using System.IO; using Syncfusion.SmartDataExtractor; -using Syncfusion.SmartFormRecognizer; using System.Text; //Open the input PDF file as a stream. using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess.Read)) { //Initialize the Smart Data Extractor. - DataExtractor extractor = new DataExtractor(); - //Extract data as JSON. - string data = extractor.ExtractDataAsJson(stream); - //Save the extracted JSON data into an output file. - File.WriteAllText("Output.json", data, Encoding.UTF8); + DataExtractor extractor = new DataExtractor(); + //Extract data as Markdown. + string data = extractor.ExtractDataAsMarkdown(stream); + //Save the extracted Markdown data into an output file. 
+ File.WriteAllText("Output.md", data, Encoding.UTF8); } {% endhighlight %} {% endtabs %} -You can download a complete working sample from [GitHub](https://github.com/SyncfusionExamples/PDF-Examples/tree/master/Data-Extraction/Smart-Data-Extractor/Extract-data-as-JSON-from-PDF/.NET). +N> To extract data from an image instead of a PDF, replace the input stream with the image file (for example, *Input.jpg* or *Input.png*). The rest of the code remains unchanged. -## Extract Data as Markdown from PDF Document -To extract form fields across a PDF document using the **ExtractDataAsMarkdown** method of the **DataExtractor** class, refer to the following code example: +You can download a complete working sample from [GitHub](https://github.com/SyncfusionExamples/PDF-Examples/tree/master/Data-Extraction/Smart-Data-Extractor/Extract-data-as-MD-from-PDF/.NET). -{% tabs %} + +## Extract Data from PDF or Image and Save as Digital PDF + +The **Smart Data Extractor** allows you to process PDF documents or scanned images and generate a digital PDF output. + +In this section, you will learn how to: +- Extract structured content and save it directly as a **PDF document**. +- Work with the extracted content as a **PDF stream** for flexible storage or further processing. + + +### Extract Data from PDF Document + +To extract structured data such as text, form fields, tables and images from an entire PDF document using the **ExtractDataAsPdfDocument** method of the **DataExtractor** class, refer to the following code example: + +{% tabs %} + {% highlight c# tabtitle="C# [Cross-platform]" %} -using System.IO; +using Syncfusion.Pdf.Parsing; using Syncfusion.SmartDataExtractor; -using Syncfusion.SmartFormRecognizer; -using System.Text; //Open the input PDF file as a stream. 
-using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess.Read)) +using (FileStream inputStream = new FileStream("Input.pdf", FileMode.Open, FileAccess.Read, FileShare.Read)) { //Initialize the Smart Data Extractor. DataExtractor extractor = new DataExtractor(); - //Extract data as Markdown. - string data = extractor.ExtractDataAsMarkdown(stream); - //Save the extracted Markdown data into an output file. - File.WriteAllText("Output.md", data, Encoding.UTF8); + //Extract data and return as a loaded PDF document. + PdfLoadedDocument document = extractor.ExtractDataAsPdfDocument(inputStream); + //Save the extracted output as a new PDF file. + document.Save("Output.pdf"); + //Close the document to release resources. + document.Close(true); } {% endhighlight %} {% highlight c# tabtitle="C# [Windows-specific]" %} -using System.IO; +using Syncfusion.Pdf.Parsing; using Syncfusion.SmartDataExtractor; -using Syncfusion.SmartFormRecognizer; -using System.Text; //Open the input PDF file as a stream. -using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess.Read)) +using (FileStream inputStream = new FileStream("Input.pdf", FileMode.Open, FileAccess.Read, FileShare.Read)) { //Initialize the Smart Data Extractor. DataExtractor extractor = new DataExtractor(); - //Extract data as Markdown. - string data = extractor.ExtractDataAsMarkdown(stream); - //Save the extracted Markdown data into an output file. - File.WriteAllText("Output.md", data, Encoding.UTF8); + //Extract data and return as a loaded PDF document. + PdfLoadedDocument document = extractor.ExtractDataAsPdfDocument(inputStream); + //Save the extracted output as a new PDF file. + document.Save("Output.pdf"); + //Close the document to release resources. 
+ document.Close(true); } - + {% endhighlight %} -{% endtabs %} +{% endtabs %} -You can download a complete working sample from [GitHub](https://github.com/SyncfusionExamples/PDF-Examples/tree/master/Data-Extraction/Smart-Data-Extractor/Extract-data-as-MD-from-PDF/.NET). +You can download a complete working sample from [GitHub](https://github.com/SyncfusionExamples/PDF-Examples/tree/master/Data-Extraction/Smart-Data-Extractor/Extract-data-from-PDF-document/.NET). -## Extract Data as JSON from an Image +### Extract Data as Stream -To extract structured data from an image document using the **ExtractDataAsJson** method of the **DataExtractor** class, refer to the following code examples. +To extract structured data from a PDF document and return the output as a stream using the **ExtractDataAsPdfStream** method of the **DataExtractor** class, refer to the following example. {% tabs %} {% highlight c# tabtitle="C# [Cross-platform]" %} -using System.IO; using Syncfusion.SmartDataExtractor; -using System.Text; -//Open the input image file as a stream. -using (FileStream stream = new FileStream("Image.png", FileMode.Open, FileAccess.Read)) +//Open the input PDF file as a stream. +using (FileStream inputStream = new FileStream("Input.pdf", FileMode.Open, FileAccess.Read, FileShare.Read)) { - //Initialize the Data Extractor. + //Initialize the Smart Data Extractor. DataExtractor extractor = new DataExtractor(); - //Extract data as JSON from the image stream. - string data = extractor.ExtractDataAsJson(stream); - //Save the extracted JSON data into an output file. - File.WriteAllText("Output.json", data, Encoding.UTF8); -} + //Extract data and return as a PDF stream. + Stream pdfStream = extractor.ExtractDataAsPdfStream(inputStream); + //Save the extracted PDF stream into an output file. 
+ using (FileStream outputStream = new FileStream("Output.pdf", FileMode.Create, FileAccess.Write)) + { + pdfStream.CopyTo(outputStream); + } +} + {% endhighlight %} {% highlight c# tabtitle="C# [Windows-specific]" %} -using System.IO; using Syncfusion.SmartDataExtractor; -using System.Text; -//Open the input image file as a stream. -using (FileStream stream = new FileStream("Image.png", FileMode.Open, FileAccess.Read)) +//Open the input PDF file as a stream. +using (FileStream inputStream = new FileStream("Input.pdf", FileMode.Open, FileAccess.Read, FileShare.Read)) { - //Initialize the Data Extractor. + //Initialize the Smart Data Extractor. DataExtractor extractor = new DataExtractor(); - //Extract data as JSON from the image stream. - string data = extractor.ExtractDataAsJson(stream); - //Save the extracted JSON data into an output file. - File.WriteAllText("Output.json", data, Encoding.UTF8); -} + //Extract data and return as a PDF stream. + Stream pdfStream = extractor.ExtractDataAsPdfStream(inputStream); + //Save the extracted PDF stream into an output file. + using (FileStream outputStream = new FileStream("Output.pdf", FileMode.Create, FileAccess.Write)) + { + pdfStream.CopyTo(outputStream); + } +} + {% endhighlight %} -{% endtabs %} +{% endtabs %} -You can download a complete working sample from [GitHub](https://github.com/SyncfusionExamples/PDF-Examples/tree/master/Data-Extraction/Smart-Data-Extractor/Extract-data-as-JSON-from-an-Image/.NET). +You can download a complete working sample from [GitHub](https://github.com/SyncfusionExamples/PDF-Examples/tree/master/Data-Extraction/Smart-Data-Extractor/Extract-data-as-stream/.NET). 
-## Form Detection -To extract form fields across a PDF document and save them as a PDF output using the **ExtractDataAsPdfDocument** method of the **DataExtractor** class with form recognition options, refer to the following code example: +## Disable Form Detection during Data Extraction + +To disable form field detection while extracting structured data from a PDF document using the **ExtractDataAsPdfDocument** method of the **DataExtractor** class, refer to the following code example: {% tabs %} {% highlight c# tabtitle="C# [Cross-platform]" %} -using System.IO; using Syncfusion.Pdf.Parsing; using Syncfusion.SmartDataExtractor; -using Syncfusion.SmartFormRecognizer; //Open the input PDF file as a stream. using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess.Read)) @@ -283,7 +293,7 @@ using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess //Initialize the Smart Data Extractor. DataExtractor extractor = new DataExtractor(); - //Enable form detection in the document to identify form fields. + //Disable form detection so that form fields are not identified in the document. //By default - true extractor.EnableFormDetection = false; //Extract form data and return as a loaded PDF document. @@ -300,10 +310,8 @@ using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess {% highlight c# tabtitle="C# [Windows-specific]" %} -using System.IO; using Syncfusion.Pdf.Parsing; using Syncfusion.SmartDataExtractor; -using Syncfusion.SmartFormRecognizer; //Open the input PDF file as a stream. using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess.Read)) @@ -311,7 +319,7 @@ using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess //Initialize the Smart Data Extractor. DataExtractor extractor = new DataExtractor(); - //Enable form detection in the document to identify form fields. + //Disable form detection so that form fields are not identified in the document. 
//By default - true extractor.EnableFormDetection = false; //Extract form data and return as a loaded PDF document. @@ -329,18 +337,16 @@ using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess You can download a complete working sample from [GitHub](https://github.com/SyncfusionExamples/PDF-Examples/tree/master/Data-Extraction/Smart-Data-Extractor/Disable-Form-detection/.NET). -## Table Detection +## Disable Table Detection during Data Extraction -To extract tables across a PDF document and save them as a PDF output using the **ExtractDataAsPdfDocument** method of the **DataExtractor** class with table extraction options, refer to the following code example: +To disable table detection while extracting structured data from a PDF document using the **ExtractDataAsPdfDocument** method of the **DataExtractor** class, refer to the following code example: {% tabs %} {% highlight c# tabtitle="C# [Cross-platform]" %} -using System.IO; using Syncfusion.Pdf.Parsing; using Syncfusion.SmartDataExtractor; -using Syncfusion.SmartTableExtractor; // Load the input PDF file. using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess.Read)) @@ -348,7 +354,7 @@ using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess // Initialize the Smart Data Extractor. DataExtractor extractor = new DataExtractor(); - // Enable table detection and set confidence threshold. + // Disable table detection. //By default - true extractor.EnableTableDetection = false; // Extract data and return as a loaded PDF document. @@ -365,10 +371,8 @@ using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess {% highlight c# tabtitle="C# [Windows-specific]" %} -using System.IO; using Syncfusion.Pdf.Parsing; using Syncfusion.SmartDataExtractor; -using Syncfusion.SmartTableExtractor; // Load the input PDF file. 
using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess.Read)) @@ -376,7 +380,7 @@ using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess // Initialize the Smart Data Extractor. DataExtractor extractor = new DataExtractor(); - // Enable table detection and set confidence threshold. + // Disable table detection. //By default - true extractor.EnableTableDetection = false; // Extract data and return as a loaded PDF document. @@ -402,7 +406,6 @@ To extract structured data from a PDF document using different Form Recognizer o {% highlight c# tabtitle="C# [Cross-platform]" %} -using System.IO; using Syncfusion.Pdf.Parsing; using Syncfusion.SmartDataExtractor; using Syncfusion.SmartFormRecognizer; @@ -446,7 +449,6 @@ using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess {% highlight c# tabtitle="C# [Windows-specific]" %} -using System.IO; using Syncfusion.Pdf.Parsing; using Syncfusion.SmartDataExtractor; using Syncfusion.SmartFormRecognizer; @@ -499,7 +501,7 @@ To extract structured table data from a PDF document using advanced Table Extrac {% tabs %} {% highlight c# tabtitle="C# [Cross-platform]" %} -using System.IO; + using Syncfusion.Pdf.Parsing; using Syncfusion.SmartDataExtractor; using Syncfusion.SmartTableExtractor; @@ -536,7 +538,6 @@ using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess {% highlight c# tabtitle="C# [Windows-specific]" %} -using System.IO; using Syncfusion.Pdf.Parsing; using Syncfusion.SmartDataExtractor; using Syncfusion.SmartTableExtractor; @@ -583,7 +584,6 @@ To apply confidence thresholding when extracting data from a PDF document using {% highlight c# tabtitle="C# [Cross-platform]" %} -using System.IO; using Syncfusion.Pdf.Parsing; using Syncfusion.SmartDataExtractor; @@ -610,7 +610,6 @@ using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess {% highlight c# tabtitle="C# [Windows-specific]" %} -using System.IO; 
using Syncfusion.Pdf.Parsing; using Syncfusion.SmartDataExtractor; @@ -647,7 +646,6 @@ To extract data from a specific range of pages in a PDF document using the Extra {% highlight c# tabtitle="C# [Cross-platform]" %} -using System.IO; using Syncfusion.Pdf.Parsing; using Syncfusion.SmartDataExtractor; @@ -672,7 +670,6 @@ using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess {% highlight c# tabtitle="C# [Windows-specific]" %} -using System.IO; using Syncfusion.Pdf.Parsing; using Syncfusion.SmartDataExtractor; diff --git a/Document-Processing/Data-Extraction/NET/extract-table-data.md b/Document-Processing/Data-Extraction/NET/extract-table-data.md index 6fb8db4b7d..a527ff73cc 100644 --- a/Document-Processing/Data-Extraction/NET/extract-table-data.md +++ b/Document-Processing/Data-Extraction/NET/extract-table-data.md @@ -9,15 +9,14 @@ keywords: Table Extraction, PDF, Image, Smart Table Extractor, Assemblies, NuGet # Working with Table Extraction -## Extract Tables from PDF Document +## Extract Table Data as JSON from PDF or Image To extract structured table data from a PDF document using the **ExtractTableAsJson** method of the **TableExtractor** class, refer to the following code {% tabs %} -{% highlight c# tabtitle="C# [Cross-platform]" %} +{% highlight c# tabtitle="C# [Cross-platform]" playgroundButtonLink="https://raw.githubusercontent.com/SyncfusionExamples/PDF-Examples/refs/heads/master/Data-Extraction/Smart-Table-Extractor/Extract-tables-from-pdf-document/.NET/Extract-tables-from-pdf-document/Program.cs" %} -using System.IO; using System.Text; using Syncfusion.SmartTableExtractor; @@ -36,7 +35,6 @@ using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess {% highlight c# tabtitle="C# [Windows-specific]" %} -using System.IO; using System.Text; using Syncfusion.SmartTableExtractor; @@ -55,68 +53,65 @@ using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess {% endtabs %} +**Note:** To 
convert an image instead of a PDF, replace the input stream with the image file (for example, *Input.jpg* or *Input.png*). The rest of the code remains unchanged. + You can download a complete working sample from [GitHub](https://github.com/SyncfusionExamples/PDF-Examples/tree/master/Data-Extraction/Smart-Table-Extractor/Extract-tables-from-pdf-document/.NET). -## Extract Tables with Border-less Table Detection -To extract structured table data from a PDF document that contains tables without visible borders using the **ExtractTableAsJson** method of the **TableExtractor** class, refer to the following code examples. -{% tabs %} + + +## Extract Table Data as Markdown from PDF or Image + +To extract structured table data from a PDF document or image using the **ExtractTableAsMarkdown** method of the **TableExtractor** class, refer to the following code example: + +{% tabs %} {% highlight c# tabtitle="C# [Cross-platform]" %} -using System.IO; using System.Text; using Syncfusion.SmartTableExtractor; //Open the input PDF file as a stream. using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess.Read)) { - //Initialize the Smart Table Extractor. + // Initialize the Smart Table Extractor TableExtractor extractor = new TableExtractor(); - //Configure the table extraction option to detect border-less tables in the document. - TableExtractionOptions options = new TableExtractionOptions(); - options.DetectBorderlessTables = true; - - //Assign the configured options to the extractor. - extractor.TableExtractionOptions = options; - //Extract table data from the PDF document as a JSON string. - string data = extractor.ExtractTableAsJson(stream); - //Save the extracted JSON data into an output file. - File.WriteAllText("Output.json", data, Encoding.UTF8); + //Extract table data from the PDF document as markdown. + string data = extractor.ExtractTableAsMarkdown(stream); + //Save the extracted markdown data into an output file. 
+ File.WriteAllText("Output.md", data, Encoding.UTF8); } {% endhighlight %} {% highlight c# tabtitle="C# [Windows-specific]" %} -using System.IO; using System.Text; using Syncfusion.SmartTableExtractor; //Open the input PDF file as a stream. using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess.Read)) { - //Initialize the Smart Table Extractor. + // Initialize the Smart Table Extractor TableExtractor extractor = new TableExtractor(); - //Configure the table extraction option to detect border-less tables in the document. - TableExtractionOptions options = new TableExtractionOptions(); - options.DetectBorderlessTables = true; - //Assign the configured options to the extractor. - extractor.TableExtractionOptions = options; - //Extract table data from the PDF document as a JSON string. - string data = extractor.ExtractTableAsJson(stream); - //Save the extracted JSON data into an output file. - File.WriteAllText("Output.json", data, Encoding.UTF8); + //Extract table data from the PDF document as markdown. + string data = extractor.ExtractTableAsMarkdown(stream); + //Save the extracted markdown data into an output file. + File.WriteAllText("Output.md", data, Encoding.UTF8); } {% endhighlight %} -{% endtabs %} +{% endtabs %} -You can download a complete working sample from [GitHub](https://github.com/SyncfusionExamples/PDF-Examples/tree/master/Data-Extraction/Smart-Table-Extractor/Extract-border-less-table-detection/.NET). +You can download a complete working sample from [GitHub](https://github.com/SyncfusionExamples/PDF-Examples/tree/master/Data-Extraction/Smart-Table-Extractor/Extract-table-data-as-MD-from-PDF/.NET). -## Extract Tables within Specific Page Range +**Note:** To convert an image instead of a PDF, replace the input stream with the image file (for example, *Input.jpg* or *Input.png*). The rest of the code remains unchanged. 
+ +## Extract Table Data within a Specific Page Range + +### Extract as JSON To extract structured table data from a specific range of pages in a PDF document using the **ExtractTableAsJson** method of the **TableExtractor** class, refer to the following code example: @@ -124,7 +119,6 @@ To extract structured table data from a specific range of pages in a PDF documen {% highlight c# tabtitle="C# [Cross-platform]" %} -using System.IO; using System.Text; using Syncfusion.SmartTableExtractor; @@ -148,7 +142,6 @@ using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess {% highlight c# tabtitle="C# [Windows-specific]" %} -using System.IO; using System.Text; using Syncfusion.SmartTableExtractor; @@ -174,65 +167,65 @@ using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess You can download a complete working sample from [GitHub](https://github.com/SyncfusionExamples/PDF-Examples/tree/master/Data-Extraction/Smart-Table-Extractor/Extract-tables-by-page-range/.NET). -## Apply Confidence Threshold for Table Data Extraction +### Extract as Markdown -To apply confidence thresholding when extracting table data from a PDF document using the **ExtractTableAsJson** method of the **TableExtractor** class, refer to the following code example: +To extract structured table data from a specific range of pages in a PDF document or Image using the **ExtractTableAsMarkdown** method of the **TableExtractor** class, refer to the following code example: {% tabs %} {% highlight c# tabtitle="C# [Cross-platform]" %} -using System.IO; using System.Text; using Syncfusion.SmartTableExtractor; //Open the input PDF file as a stream. using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess.Read)) { - //Initialize the Smart Table Extractor. - TableExtractor extractor = new TableExtractor(); - //Configure table extraction options to set the confidence threshold for detection. 
- TableExtractionOptions options = new TableExtractionOptions(); - options.ConfidenceThreshold = 0.6; - //Assign the configured options to the extractor. - extractor.TableExtractionOptions = options; - //Extract table data from the PDF document as a JSON string. - string data = extractor.ExtractTableAsJson(stream); - //Save the extracted JSON data into an output file. - File.WriteAllText("Output.json", data, Encoding.UTF8); -} + //Initialize the Smart Table Extractor. + TableExtractor extractor = new TableExtractor(); + + //Set the page range for extraction (pages 1 to 3). + TableExtractionOptions options = new TableExtractionOptions(); + options.PageRange = new int[,] { { 1, 3 } }; + extractor.TableExtractionOptions = options; + + //Extract table data from the specified page range as a Markdown string. + string data = extractor.ExtractTableAsMarkdown(stream); + //Save the extracted output as a new Markdown file. + File.WriteAllText("Output.md", data, Encoding.UTF8); +} + {% endhighlight %} {% highlight c# tabtitle="C# [Windows-specific]" %} -using System.IO; using System.Text; using Syncfusion.SmartTableExtractor; //Open the input PDF file as a stream. using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess.Read)) { - //Initialize the Smart Table Extractor. - TableExtractor extractor = new TableExtractor(); - //Configure table extraction options to set the confidence threshold for detection. - TableExtractionOptions options = new TableExtractionOptions(); - options.ConfidenceThreshold = 0.6; - //Assign the configured options to the extractor. - extractor.TableExtractionOptions = options; - //Extract table data from the PDF document as a JSON string. - string data = extractor.ExtractTableAsJson(stream); - //Save the extracted JSON data into an output file. - File.WriteAllText("Output.json", data, Encoding.UTF8); + //Initialize the Smart Table Extractor. 
+ TableExtractor extractor = new TableExtractor(); + + //Set the page range for extraction (pages 1 to 3). + TableExtractionOptions options = new TableExtractionOptions(); + options.PageRange = new int[,] { { 1, 3 } }; + extractor.TableExtractionOptions = options; + + //Extract table data from the specified page range as a Markdown string. + string data = extractor.ExtractTableAsMarkdown(stream); + + //Save the extracted output as a new Markdown file. + File.WriteAllText("Output.md", data, Encoding.UTF8); } {% endhighlight %} -{% endtabs %} - -You can download a complete working sample from [GitHub](https://github.com/SyncfusionExamples/PDF-Examples/tree/master/Data-Extraction/Smart-Table-Extractor/Apply-confidence-threshold/.NET). +{% endtabs %} -## Extract Table Data Asynchronously from PDF Document +## Extract Table Data Asynchronously from PDF or Image To extract table data asynchronously with cancellation support using the **ExtractTableAsJsonAsync** method of the **TableExtractor** class, refer to the following code example: @@ -240,9 +233,7 @@ To extract table data asynchronously with cancellation support using the **Extra {% highlight c# tabtitle="C# [Cross-platform]" %} -using System.IO; using System.Text; -using System.Threading; using Syncfusion.SmartTableExtractor; //Open the input PDF file as a stream. @@ -262,9 +253,7 @@ using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess {% highlight c# tabtitle="C# [Windows-specific]" %} -using System.IO; using System.Text; -using System.Threading; using Syncfusion.SmartTableExtractor; //Open the input PDF file as a stream. @@ -286,53 +275,125 @@ using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess You can download a complete working sample from [GitHub](https://github.com/SyncfusionExamples/PDF-Examples/tree/master/Data-Extraction/Smart-Table-Extractor/Extract-table-data-async/.NET). 
-## Extract Table data as Markdown from PDF Document -To extract structured table data from a PDF document using the **ExtractTableAsMarkdown** method of the **TableExtractor** class, refer to the following code +## Table Extraction Options -{% tabs %} +### Disable Border-less Table Detection + +To disable detection of tables without visible borders in a PDF document or image using the **ExtractTableAsJson** method of the **TableExtractor** class, refer to the following code examples. + +{% tabs %} {% highlight c# tabtitle="C# [Cross-platform]" %} -using System.IO; using System.Text; using Syncfusion.SmartTableExtractor; //Open the input PDF file as a stream. using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess.Read)) { - // Initialize the Smart Table Extractor + //Initialize the Smart Table Extractor. TableExtractor extractor = new TableExtractor(); - //Extract table data from the PDF document as markdown. - string data = extractor.ExtractTableAsMarkdown(stream); - //Save the extracted markdown data into an output file. - File.WriteAllText("Output.md", data, Encoding.UTF8); + //Configure the table extraction option to disable border-less table detection in the document. + TableExtractionOptions options = new TableExtractionOptions(); + //By default, DetectBorderlessTables is true. + options.DetectBorderlessTables = false; + + //Assign the configured options to the extractor. + extractor.TableExtractionOptions = options; + //Extract table data from the PDF document as a JSON string. + string data = extractor.ExtractTableAsJson(stream); + //Save the extracted JSON data into an output file. + File.WriteAllText("Output.json", data, Encoding.UTF8); } {% endhighlight %} {% highlight c# tabtitle="C# [Windows-specific]" %} -using System.IO; using System.Text; using Syncfusion.SmartTableExtractor; //Open the input PDF file as a stream. 
using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess.Read)) { - // Initialize the Smart Table Extractor + //Initialize the Smart Table Extractor. TableExtractor extractor = new TableExtractor(); - //Extract table data from the PDF document as markdown. - string data = extractor.ExtractTableAsMarkdown(stream); - //Save the extracted markdown data into an output file. - File.WriteAllText("Output.md", data, Encoding.UTF8); + //Configure the table extraction option to disable border-less tables in the document. + TableExtractionOptions options = new TableExtractionOptions(); + options.DetectBorderlessTables = false; + //Assign the configured options to the extractor. + extractor.TableExtractionOptions = options; + //Extract table data from the PDF document as a JSON string. + string data = extractor.ExtractTableAsJson(stream); + //Save the extracted JSON data into an output file. + File.WriteAllText("Output.json", data, Encoding.UTF8); } {% endhighlight %} -{% endtabs %} +{% endtabs %} + +You can download a complete working sample from [GitHub](https://github.com/SyncfusionExamples/PDF-Examples/tree/master/Data-Extraction/Smart-Table-Extractor/Extract-border-less-table-detection/.NET). + +### Apply Confidence Threshold for Table Data Extraction + +To apply confidence thresholding when extracting table data from a PDF document using the **ExtractTableAsJson** method of the **TableExtractor** class, refer to the following code example: + +{% tabs %} + +{% highlight c# tabtitle="C# [Cross-platform]" %} + +using System.Text; +using Syncfusion.SmartTableExtractor; + +//Open the input PDF file as a stream. +using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess.Read)) +{ + //Initialize the Smart Table Extractor. + TableExtractor extractor = new TableExtractor(); + //Configure table extraction options to set the confidence threshold for detection. 
+ TableExtractionOptions options = new TableExtractionOptions(); + options.ConfidenceThreshold = 0.6; + //Assign the configured options to the extractor. + extractor.TableExtractionOptions = options; + //Extract table data from the PDF document as a JSON string. + string data = extractor.ExtractTableAsJson(stream); + //Save the extracted JSON data into an output file. + File.WriteAllText("Output.json", data, Encoding.UTF8); +} + +{% endhighlight %} + +{% highlight c# tabtitle="C# [Windows-specific]" %} + +using System.Text; +using Syncfusion.SmartTableExtractor; + +//Open the input PDF file as a stream. +using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess.Read)) +{ + //Initialize the Smart Table Extractor. + TableExtractor extractor = new TableExtractor(); + //Configure table extraction options to set the confidence threshold for detection. + TableExtractionOptions options = new TableExtractionOptions(); + options.ConfidenceThreshold = 0.6; + //Assign the configured options to the extractor. + extractor.TableExtractionOptions = options; + //Extract table data from the PDF document as a JSON string. + string data = extractor.ExtractTableAsJson(stream); + //Save the extracted JSON data into an output file. + File.WriteAllText("Output.json", data, Encoding.UTF8); +} + +{% endhighlight %} + +{% endtabs %} + +You can download a complete working sample from [GitHub](https://github.com/SyncfusionExamples/PDF-Examples/tree/master/Data-Extraction/Smart-Table-Extractor/Apply-confidence-threshold/.NET). + + -You can download a complete working sample from [GitHub](https://github.com/SyncfusionExamples/PDF-Examples/tree/master/Data-Extraction/Smart-Table-Extractor/Extract-table-data-as-MD-from-PDF/.NET). 
## PDF to Markdown Preservation Mapping diff --git a/Document-Processing/Data-Extraction/NET/how-to-download-and-install.md b/Document-Processing/Data-Extraction/NET/how-to-download-and-install.md new file mode 100644 index 0000000000..a120a03486 --- /dev/null +++ b/Document-Processing/Data-Extraction/NET/how-to-download-and-install.md @@ -0,0 +1,110 @@ +--- +layout: post +title: Installing Syncfusion Data Extraction - Syncfusion +description: Learn how to install the Syncfusion Smart Data Extractor library for extracting structured data from PDFs and images in .NET applications. +platform: document-processing +control: Installation and Deployment +documentation: ug + +--- + +# Download Syncfusion® Data Extraction Installer + +The Syncfusion® installer can be downloaded from the [Syncfusion](https://www.syncfusion.com/) website. You can either download the licensed installer or try our trial installer depending on your license. + + - Trial Installer + - Licensed Installer + +You can download the Syncfusion® installer from [Syncfusion.com](https://www.syncfusion.com/) website + +## Download the Trial Version + +Our 30-day trial can be downloaded in two ways. + +* Download Free Trial Setup +* Start Trials if using components through [NuGet.org](https://www.nuget.org/packages?q=syncfusion) + + +### Download Free Trial Setup + +1. You can evaluate our 30-day free trial by visiting the [Download Free Trial](https://www.syncfusion.com/downloads) page and select the product +2. After completing the required form or logging in with your registered Syncfusion® account, you can download the trial installer from the confirmation page. (as shown in the screenshot below.) +![Trial and downloads of Syncfusion SmartDataExtractor](images/trial-confirmation.png) + +3. With a trial license, only the latest version’s trial installer can be downloaded. +4. Unlock key is not required to install the Syncfusion® Data Extraction trial installer. +5. 
Before the trial expires, you can download the trial installer at any time from your registered account’s [Trials & Downloads](https://www.syncfusion.com/account/manage-trials/downloads) page (as shown in the screenshot below.) + + ![Trial and downloads of Syncfusion SmartDataExtractor](images/trial-download.png) + +6. Click the More Download Options (element 2 in the above screenshot) button to get the Data Extraction Product Offline trial installer which is available in ZIP format. + + ![License and downloads of Syncfusion SmartDataExtractor](images/start-trial-download-offline-installer.png) + +### Start Trials if using components through [NuGet.org](https://www.nuget.org/packages?q=syncfusion) + +You should initiate an evaluation if you have already obtained our components through [NuGet.org](https://www.nuget.org/packages?q=syncfusion) + +1. You can start your 30-day free trial from the [Start Trial](https://www.syncfusion.com/account/manage-trials/start-trials) page from your account. + + N> You can generate the license key for your active trial products from [Trials & Downloads](https://www.syncfusion.com/account/manage-trials/downloads) page. This license key will be mandatory to use our trial products in your application. To know more about License key, refer this [help topic](https://help.syncfusion.com/common/essential-studio/licensing/overview). + + ![Trial and downloads of Syncfusion SmartDataExtractor](images/start-trial-download.png) + +2. To access this page, you must sign up\log in with your Syncfusion® account. +3. Begin your trial by selecting the Syncfusion® product. + + N> If you've already used the trial products and they haven't expired, you won't be able to start the trial for the same product again. + +4. After you've started the trial, go to the [Trials & Downloads](https://www.syncfusion.com/account/manage-trials/downloads) page to get the latest version trial installer. 
You can generate the [unlock key](https://www.syncfusion.com/kb/8069/how-to-generate-unlock-key-for-essentials-studio-products) and [license key](https://help.syncfusion.com/common/essential-studio/licensing/how-to-generate) here at any time before the trial period expires (as shown in the screenshot below). + + ![License and downloads of Syncfusion SmartDataExtractor](images/start-trial-download-installer.png) + +5. You can find your current active trial products on the [Trials & Downloads](https://www.syncfusion.com/account/manage-trials/downloads) page. + + +## Download the License Version + +1. Syncfusion® licensed products will be available in the [License & Downloads](https://www.syncfusion.com/account/downloads) page under your registered Syncfusion® account. +2. You can view all the licenses (both active and expired) associated with your account. +3. You can download the SmartDataExtractor licensed installer by going to More Downloads Options (element 3 in the screenshot below). + + ![License and downloads of Syncfusion SmartDataExtractor](images/license-download.png) + +4. Unlock key is not required to install the Syncfusion® SmartDataExtractor licensed installer. +5. For each OS, ZIP format is available for download. + + ![License and downloads of Syncfusion SmartDataExtractor](images/start-trial-download-offline-installer.png) + +N> The Smart Data Extractor installer is provided in **ZIP format**. +This is common for **Windows, Linux, and Mac OS** platforms. +After downloading, you must **extract the ZIP file** before proceeding with installation. + + +## Installing Syncfusion® SmartDataExtractor installer + +The steps below show how to install the Syncfusion® Smart Data Extractor installer. + +1. Extract the Syncfusion® Smart Data Extractor installer (.zip) file. The files will be extracted on your machine. + + ![Welcome wizard](images/Installer1.png) + +2. The ZIP file contains the following folders. 
+ + ![License Agreement](images/Installer2.png) + + N> The Unlock key is not required to install the Smart Data Extractor installer. + +3. You can launch the demo source and use the NuGet packages included in the installer. + +4. Run the following command on your machine to deploy the ASP.NET Core samples: + + **dotnet restore project name -s \nuget** in order to restore. + + +## License key registration in samples + +After the installation, the license key is required to register the demo source that is included in the Linux installer. To learn about the steps for license registration for the ASP.NET Core - EJ2 samples in the Essential Studio® Smart Data Extractor Linux installer, refer to the following: + +* Register the license key in the [Program.cs](https://ej2.syncfusion.com/aspnetcore/documentation/licensing/how-to-register-in-an-application#for-aspnet-core-application-using-net-60) file if you created the ASP.NET Core web application with Visual Studio 2022 and .NET 6.0. +* Register the license key in the Configure method of [Startup.cs](https://ej2.syncfusion.com/aspnetcore/documentation/licensing/how-to-register-in-an-application#for-aspnet-core-application-using-net-50-or-net-31). \ No newline at end of file diff --git a/Document-Processing/Data-Extraction/NET/images/Installer1.png b/Document-Processing/Data-Extraction/NET/images/Installer1.png new file mode 100644 index 0000000000..2ca55e97f4 Binary files /dev/null and b/Document-Processing/Data-Extraction/NET/images/Installer1.png differ diff --git a/Document-Processing/Data-Extraction/NET/images/Installer2.png b/Document-Processing/Data-Extraction/NET/images/Installer2.png new file mode 100644 index 0000000000..5c6b8bcb5e Binary files /dev/null and b/Document-Processing/Data-Extraction/NET/images/Installer2.png differ diff --git a/Document-Processing/Data-Extraction/NET/images/license-download.png b/Document-Processing/Data-Extraction/NET/images/license-download.png new file mode 100644 index 
0000000000..4f307eac36 Binary files /dev/null and b/Document-Processing/Data-Extraction/NET/images/license-download.png differ diff --git a/Document-Processing/Data-Extraction/NET/images/start-trial-download-installer.png b/Document-Processing/Data-Extraction/NET/images/start-trial-download-installer.png new file mode 100644 index 0000000000..77684a92ba Binary files /dev/null and b/Document-Processing/Data-Extraction/NET/images/start-trial-download-installer.png differ diff --git a/Document-Processing/Data-Extraction/NET/images/start-trial-download-offline-installer.png b/Document-Processing/Data-Extraction/NET/images/start-trial-download-offline-installer.png new file mode 100644 index 0000000000..7534af8a9a Binary files /dev/null and b/Document-Processing/Data-Extraction/NET/images/start-trial-download-offline-installer.png differ diff --git a/Document-Processing/Data-Extraction/NET/images/start-trial-download.png b/Document-Processing/Data-Extraction/NET/images/start-trial-download.png new file mode 100644 index 0000000000..301ac6d14c Binary files /dev/null and b/Document-Processing/Data-Extraction/NET/images/start-trial-download.png differ diff --git a/Document-Processing/Data-Extraction/NET/images/trial-confirmation.png b/Document-Processing/Data-Extraction/NET/images/trial-confirmation.png new file mode 100644 index 0000000000..841e88c60e Binary files /dev/null and b/Document-Processing/Data-Extraction/NET/images/trial-confirmation.png differ diff --git a/Document-Processing/Data-Extraction/NET/images/trial-download.png b/Document-Processing/Data-Extraction/NET/images/trial-download.png new file mode 100644 index 0000000000..dc562b3f47 Binary files /dev/null and b/Document-Processing/Data-Extraction/NET/images/trial-download.png differ diff --git a/Document-Processing/Data-Extraction/NET/ocr-overview.md b/Document-Processing/Data-Extraction/NET/ocr-overview.md new file mode 100644 index 0000000000..733184a8b0 --- /dev/null +++ 
b/Document-Processing/Data-Extraction/NET/ocr-overview.md @@ -0,0 +1,14 @@ +--- +title: Intro to OCR Processor | Syncfusion +description: This page introduces the Syncfusion OCR Processor, describing its purpose, key capabilities, and how to get started with optical character recognition in .NET applications. +platform: document-processing +control: OCRProcessor +documentation: UG +keywords: OCR, Optical Character Recognition, Text Recognition +--- + +# Welcome to Syncfusion OCR Processor Library + +Syncfusion® OCR Processor is a high‑performance .NET library that enables accurate text recognition from scanned documents, images, and PDF files. Designed for modern .NET workflows, it processes raster images and document pages to recognize printed text, analyze page layouts, and extract textual content programmatically. + +The OCR Processor supports common document formats and provides a streamlined API for converting image‑based content into machine‑readable text, making it suitable for scenarios such as document digitization, text search, content indexing, and data processing in enterprise applications. \ No newline at end of file diff --git a/Document-Processing/Data-Extraction/NET/ocr-processor/Assemblies-Required.md b/Document-Processing/Data-Extraction/NET/ocr-processor/Assemblies-Required.md index 8f19c56d27..bf6285b477 100644 --- a/Document-Processing/Data-Extraction/NET/ocr-processor/Assemblies-Required.md +++ b/Document-Processing/Data-Extraction/NET/ocr-processor/Assemblies-Required.md @@ -13,53 +13,51 @@ Get the following required assemblies by downloading the OCR library installer. #### Syncfusion® assemblies - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + +
    Platform(s)Assemblies
    -Windows Forms, WPF, ASP.NET, and ASP.NET MVC - -
      -
    • Syncfusion.OCRProcessor.Base.dll
    • -
    • Syncfusion.Pdf.Base.dll
    • -
    • Syncfusion.Compression.Base.dll
    • -
    • Syncfusion.ImagePreProcessor.Base.dll
    • -
    -
    -.NET Standard 2.0 - -
      -
    • Syncfusion.OCRProcessor.Portable.dll
    • -
    • Syncfusion.PdfImaging.Portable.dll
    • -
    • Syncfusion.Pdf.Portable.dll
    • -
    • Syncfusion.Compression.Portable.dll
    • -
    • {{'[SkiaSharp](https://www.nuget.org/packages/SkiaSharp/3.119.1)'| markdownify }} package
    • -
    • Syncfusion.ImagePreProcessor.Portable.dll
    • -
    -
    -.NET 8/.NET 9/.NET 10 - -
      -
    • Syncfusion.OCRProcessor.NET.dll
    • -
    • Syncfusion.PdfImaging.NET.dll
    • -
    • Syncfusion.Pdf.NET.dll
    • -
    • Syncfusion.Compression.NET.dll
    • -
    • {{'[SkiaSharp](https://www.nuget.org/packages/SkiaSharp/3.119.1)'| markdownify }} package
    • -
    • Syncfusion.ImagePreProcessor.NET.dll
    • -
    -
    Platform(s)Assemblies
    + {{'WPF'| markdownify }}, + {{'Windows Forms'| markdownify }}, + {{'ASP.NET'| markdownify }} and {{'ASP.NET MVC'| markdownify }} + + Syncfusion.OCRProcessor.Base
    + Syncfusion.Pdf.Base
    + Syncfusion.Compression.Base
    + Syncfusion.ImagePreProcessor.Base
    +
    + {{'.NET Standard 2.0'| markdownify }} + + Syncfusion.OCRProcessor.Portable
    + Syncfusion.Pdf.Imaging.Portable
    + Syncfusion.Pdf.Portable
    + Syncfusion.Compression.Portable
    + {{'[SkiaSharp](https://www.nuget.org/packages/SkiaSharp/3.119.1)'| markdownify }} package
    + Syncfusion.ImagePreProcessor.Portable
    +
    + {{'.NET 8/.NET 9/.NET 10'| markdownify }} + + Syncfusion.OCRProcessor.NET
    + Syncfusion.Pdf.Imaging.NET
    + Syncfusion.Pdf.NET
    + Syncfusion.Compression.NET
    + {{'[SkiaSharp](https://www.nuget.org/packages/SkiaSharp/3.119.1)'| markdownify }} package
    + Syncfusion.ImagePreProcessor.NET
    +
    \ No newline at end of file diff --git a/Document-Processing/Data-Extraction/NET/ocr-processor/Features.md b/Document-Processing/Data-Extraction/NET/ocr-processor/Features.md index 57782a9569..33d897c5bb 100644 --- a/Document-Processing/Data-Extraction/NET/ocr-processor/Features.md +++ b/Document-Processing/Data-Extraction/NET/ocr-processor/Features.md @@ -150,13 +150,13 @@ End Using You can downloaded a complete working sample from [GitHub](https://github.com/SyncfusionExamples/PDF-Examples/tree/master/OCR/.NET/Perform-OCR-on-particular-region-of-PDF-document). -## Performing OCR with tesseract version 3.05 -The [TesseractVersion](https://help.syncfusion.com/cr/document-processing/Syncfusion.OCRProcessor.OCRSettings.html#Syncfusion_OCRProcessor_OCRSettings_TesseractVersion) property is used to switch the tesseract version between 3.02 and 3.05. By default, OCR works with tesseract version 5.0. -N> The starting supported version of tesseract in ASP.NET Core is 4.0. So the lower tesseract versions 3.02 and 3.05 are not supported and we don't have the property called ``TesseractVersion`` in ASP.NET Core platform. +## Performing OCR with Tesseract Version 5.0 -The following code sample demonstrates the OCR processor with Tesseract version 3.05 for PDF documents. +The [TesseractVersion](https://help.syncfusion.com/cr/document-processing/Syncfusion.OCRProcessor.OCRSettings.html#Syncfusion_OCRProcessor_OCRSettings_TesseractVersion) property is used to switch the tesseract version to 5.0. By default, OCR will be performed with tesseract version 5.0. + +The following code sample explains the OCR processor with Tesseract version 5.0 for PDF documents. {% tabs %} @@ -174,7 +174,7 @@ using (OCRProcessor processor = new OCRProcessor()) //Set OCR language. processor.Settings.Language = Languages.English; //Set tesseract OCR Engine. 
- processor.Settings.TesseractVersion = TesseractVersion.Version3_05; + processor.Settings.TesseractVersion = TesseractVersion.Version5_0; //Perform OCR with input document, tessdata (Language packs) and enabling isMemoryOptimized property. processor.PerformOCR(document); @@ -192,27 +192,26 @@ Imports Syncfusion.OCRProcessor Imports Syncfusion.Pdf.Parsing 'Initialize the OCR processor with tesseract binaries folder path. -Using processor As OCRProcessor = New OCRProcessor("TesseractBinaries/3.05/") +Using processor As OCRProcessor = New OCRProcessor("TesseractBinaries/5.0/") 'Load an existing PDF document. Dim document As PdfLoadedDocument = New PdfLoadedDocument("Input.pdf") 'Set OCR language. processor.Settings.Language = Languages.English 'Set tesseract OCR Engine. - processor.Settings.TesseractVersion = TesseractVersion.Version3_05 + processor.Settings.TesseractVersion = TesseractVersion.Version5_0 'Perform OCR with input document, tessdata (Language packs) and enabling isMemoryOptimized property. processor.PerformOCR(document) - 'Save the PDF document. document.Save("Output.pdf") 'Close the document. document.Close(True) End Using - + {% endhighlight %} -{% endtabs %} +{% endtabs %} ## Performing OCR with Tesseract Version 4.0 @@ -275,11 +274,13 @@ End Using {% endtabs %} -## Performing OCR with Tesseract Version 5.0 +## Performing OCR with tesseract version 3.05 -The [TesseractVersion](https://help.syncfusion.com/cr/document-processing/Syncfusion.OCRProcessor.OCRSettings.html#Syncfusion_OCRProcessor_OCRSettings_TesseractVersion) property is used to switch the tesseract version to 5.0. By default, OCR will be performed with tesseract version 5.0. +The [TesseractVersion](https://help.syncfusion.com/cr/document-processing/Syncfusion.OCRProcessor.OCRSettings.html#Syncfusion_OCRProcessor_OCRSettings_TesseractVersion) property is used to switch the tesseract version between 3.02 and 3.05. By default, OCR works with tesseract version 5.0. 
-The following code sample explains the OCR processor with Tesseract version 5.0 for PDF documents. +N> The starting supported version of tesseract in ASP.NET Core is 4.0. So the lower tesseract versions 3.02 and 3.05 are not supported and we don't have the property called ``TesseractVersion`` in ASP.NET Core platform. + +The following code sample demonstrates the OCR processor with Tesseract version 3.05 for PDF documents. {% tabs %} @@ -297,7 +298,7 @@ using (OCRProcessor processor = new OCRProcessor()) //Set OCR language. processor.Settings.Language = Languages.English; //Set tesseract OCR Engine. - processor.Settings.TesseractVersion = TesseractVersion.Version5_0; + processor.Settings.TesseractVersion = TesseractVersion.Version3_05; //Perform OCR with input document, tessdata (Language packs) and enabling isMemoryOptimized property. processor.PerformOCR(document); @@ -315,26 +316,27 @@ Imports Syncfusion.OCRProcessor Imports Syncfusion.Pdf.Parsing 'Initialize the OCR processor with tesseract binaries folder path. -Using processor As OCRProcessor = New OCRProcessor("TesseractBinaries/5.0/") +Using processor As OCRProcessor = New OCRProcessor("TesseractBinaries/3.05/") 'Load an existing PDF document. Dim document As PdfLoadedDocument = New PdfLoadedDocument("Input.pdf") 'Set OCR language. processor.Settings.Language = Languages.English 'Set tesseract OCR Engine. - processor.Settings.TesseractVersion = TesseractVersion.Version5_0 + processor.Settings.TesseractVersion = TesseractVersion.Version3_05 'Perform OCR with input document, tessdata (Language packs) and enabling isMemoryOptimized property. processor.PerformOCR(document) + 'Save the PDF document. document.Save("Output.pdf") 'Close the document. 
document.Close(True) End Using - + {% endhighlight %} -{% endtabs %} +{% endtabs %} ## Performing OCR on image diff --git a/Document-Processing/Data-Extraction/NET/ocr-processor/NuGet-Packages-Required.md b/Document-Processing/Data-Extraction/NET/ocr-processor/NuGet-Packages-Required.md index 84326ae80d..08292135b6 100644 --- a/Document-Processing/Data-Extraction/NET/ocr-processor/NuGet-Packages-Required.md +++ b/Document-Processing/Data-Extraction/NET/ocr-processor/NuGet-Packages-Required.md @@ -5,9 +5,9 @@ platform: document-processing control: PDF documentation: UG --- -# NuGet Packages Required for OCR processor +## NuGet Packages Required for OCR Processor -Directly install the NuGet package to your application from [nuget.org](https://www.nuget.org/). +To work with the OCR Processor, the following NuGet packages need to be installed in your application from [nuget.org](https://www.nuget.org/). @@ -48,7 +48,7 @@ Console Application (Targeting .NET Core)
    Blazor
    -{{'[Syncfusion.PDF.OCR.Net.Core](https://www.nuget.org/packages/Syncfusion.PDF.OCR.Net.Core)'| markdownify }} +{{'[Syncfusion.Pdf.OCR.Net.Core](https://www.nuget.org/packages/Syncfusion.Pdf.OCR.Net.Core)'| markdownify }}
    \ No newline at end of file diff --git a/Document-Processing/Data-Extraction/NET/overview.md b/Document-Processing/Data-Extraction/NET/overview.md index 88579121b1..0bac72cb9c 100644 --- a/Document-Processing/Data-Extraction/NET/overview.md +++ b/Document-Processing/Data-Extraction/NET/overview.md @@ -1,5 +1,5 @@ --- -title: Extract Structured Data from PDF and Image Files in .NET | Syncfusion +title: Extract Structured Data in .NET | Syncfusion description: Syncfusion Data Extraction is a .NET library that extracts tables, forms, text, and images from documents, and recognizes form data to produce outputs such as PDFs and JSON. platform: document-processing control: DataExtraction @@ -11,9 +11,10 @@ keywords: Assemblies Syncfusion® Smart Data Extractor is a high performance, deterministic C# library that extracts complete document structures from PDFs and images. Designed for .NET workflows, it analyzes visual layout lines, boxes, labels, and alignment to locate and extract elements such as table structure, text elements, images, headers, footers, and form fields with per field confidence scores for immediate review, export, or integration. -## Key Features of Essential® Smart Data Extractor -The following list shows the key features available in the Essential® SmartDataExtractor. +## Key Features of Syncfusion® Smart Data Extractor + +The following list shows the key features available in the Syncfusion® Smart Data Extractor. * **Document structure extraction:** Detects text elements, images, headers/footers, and complete table structure (regions, header rows, columns, cell boundaries, merged cells). * **File format support:** Works with PDF and common image formats (JPEG, PNG). @@ -23,46 +24,10 @@ The following list shows the key features available in the Essential®® Smart Table Extractor - -The following list shows the key features available in the Essential® SmartTableExtractor. 
- -* **Table structure extraction:** Identifies table regions, header rows, columns, row and column spans, and cell boundaries. -* **File format support:** Works with PDF and common image formats (JPEG, PNG). -* **Border type handling:** Extract both bordered and border-less tables. -* **Page-level control:** Extract tables from specific pages or defined page ranges. -* **Confidence thresholding:** Results are filtered based on a configurable confidence score (0.0–1.0). -* **Deterministic performance:** Designed for predictable, repeatable extraction across environments (Windows, Linux, Azure, Docker). - -# Overview of Smart Form Recognizer Library - -Smart Form Recognizer is a deterministic, on premise C# library for .NET designed to reliably detect form data from PDFs and scanned images. Unlike AI‑based approaches, this library uses visual layout heuristics including lines, boxes, and circular markers to identify form structures with high consistency and predictability.It supports to identify the common form controls such as text fields, checkboxes, radio buttons, and signature regions, producing clean, structured JSON that can be fed directly into review and workflow systems. - - -## Core Capabilities - -* **Form layout detection**: Locate form regions using graphical heuristics (lines, boxes, circles) for consistent field discovery. - -* **Fillable PDF export**: Create a PDF with detected form fields added so documents are immediately usable in form workflows. - -* **Page-level control**: Process specific pages or page ranges for targeted extraction. - -* **Multi-format support**: Works with PDF, JPEG, PNG and other common image formats. - -* **Confidence filtering**: Per-field confidence scores with configurable thresholds to control output quality and drive review logic. - -* **Ready for .NET integration**: Deterministic, on premise library that outputs JSON and integrates into existing .NET pipelines and review UIs. 
- - ## JSON Output Structure and Attributes The Syncfusion® Data Extraction libraries process PDFs and scanned images to extract structured document data—including tables, form fields, text elements, images, headers, and footers—by analyzing layout patterns, table regions, borders, alignment patterns, and cell structures. The extracted output is returned as structured JSON with per‑field and per‑cell confidence scores, along with complete document and table hierarchies, making it ready for immediate review, export, or integration into downstream workflows. - ### Root Structure Below is the root structure of the JSON result: @@ -89,7 +54,6 @@ Below is the root structure of the JSON result: N> In the Smart Table Extractor root structure, the **form** object will not be present. - ### JSON Attributes #### Page Object @@ -128,14 +92,13 @@ The Page Object represents the metadata of a page along with all the detected el
    FormObjects ArrayList of detected form fields (checkboxes, text boxes, radio button, signature etc..)List of detected form fields (checkboxes, text boxes, radio buttons, signatures etc.)
    N> The **FormObjects** array is not included in the Smart Table Extractor output structure, as it is specific to Smart Data Extractor and Smart Form Recognizer. - #### PageObjects PageObjects represent the metadata of a page along with the detected elements it contains—such as text, headers, footers, tables, images, and numbers—in Smart Data Extractor, while in Smart Table Extractor they represent the detected table elements on a page. @@ -321,4 +284,4 @@ Represents the text formatting attributes (font family, font style, font size) a Specifies the font size used for the text. - \ No newline at end of file + diff --git a/Document-Processing/Data-Extraction/NET/recognize-forms.md b/Document-Processing/Data-Extraction/NET/recognize-forms.md index 1989fd6054..45ed82fc53 100644 --- a/Document-Processing/Data-Extraction/NET/recognize-forms.md +++ b/Document-Processing/Data-Extraction/NET/recognize-forms.md @@ -1,6 +1,6 @@ --- title: Working with Form Recognition | Syncfusion® -description: Learn how to recognize forms and configure FormRecognizeOptions in the Syncfusion® SmartFormRecognizer library to process and detect form elements with ease. +description: Learn how to recognize forms and configure FormRecognizeOptions in Syncfusion® SmartFormRecognizer to detect and process form elements easily. platform: document-processing control: SmartFormRecognizer documentation: UG @@ -8,150 +8,181 @@ documentation: UG # Working with Form Recognition -## Recognize forms using PdfLoadedDocument -To recognize form data from a PDF or image and get the output as a `PdfLoadedDocument` using the **RecognizeFormAsPdfDocument** (synchronous) and **RecognizeFormAsPdfDocumentAsync** (asynchronous) methods of the **FormRecognizer** class, refer to the following code examples. 
+## Recognize Forms as JSON + +To recognize form data from a PDF or image and get the output as a JSON string using the **RecognizeFormAsJson** (synchronous) and **RecognizeFormAsJsonAsync** (asynchronous) methods of the **FormRecognizer** class, refer to the following code examples. Example (synchronous): {% tabs %} -{% highlight c# tabtitle="C#" %} -public void Button_Click(object sender, RoutedEventArgs e) - { - //Initialize the Form Recognizer - FormRecognizer smartFormRecognizer = new FormRecognizer(); - //Read the input PDF file as stream - FileStream inputStream = new FileStream("Input.pdf", FileMode.Open, FileAccess.ReadWrite); - //Recognize the form and get the output as PDF stream - PdfLoadedDocument pdfLoadedDocument = smartFormRecognizer.RecognizeFormAsPdfDocument(inputStream); - //Save the loadeddocument - pdfLoadedDocument.Save(Output.pdf); - } +{% highlight c# tabtitle="C#" playgroundButtonLink="https://raw.githubusercontent.com/SyncfusionExamples/PDF-Examples/refs/heads/master/Data-Extraction/Smart-Form-Recognizer/Recognize-forms-using-JSON/.NET/Recognize-forms-using-JSON/Program.cs" %} + + +using Syncfusion.SmartFormRecognizer; + +// Open the input PDF file as a stream. +using (FileStream inputStream = new FileStream("Input.pdf", FileMode.Open, FileAccess.ReadWrite)) +{ + // Initialize the Smart Form Recognizer. + FormRecognizer recognizer = new FormRecognizer(); + // Recognize the form and get the output as a JSON string. + string outputJson = recognizer.RecognizeFormAsJson(inputStream); + // Save the extracted JSON data into an output file. + File.WriteAllText("result.json", outputJson); +} + {% endhighlight %} {% endtabs %} -You can download a complete working sample from [GitHub](https://github.com/SyncfusionExamples/PDF-Examples/tree/master/Data-Extraction/Smart-Form-Recognizer/Recognize-forms-using-Pdf/.NET). 
+You can download a complete working sample from [GitHub](https://github.com/SyncfusionExamples/PDF-Examples/tree/master/Data-Extraction/Smart-Form-Recognizer/Recognize-forms-using-JSON/.NET). Example (asynchronous): {% tabs %} {% highlight c# tabtitle="C#" %} -public async void Button_Click(object sender, RoutedEventArgs e) - { - //Initialize the Form Recognizer - FormRecognizer smartFormRecognizer = new FormRecognizer(); - //Read the input PDF file as stream - FileStream inputStream = new FileStream("Input.pdf", FileMode.Open, FileAccess.ReadWrite); - //Recognize the form and get the output as PDF stream - PdfLoadedDocument pdfLoadedDocument = await smartFormRecognizer.RecognizeFormAsPdfDocumentAsync(inputStream); - //Save the loadeddocument - pdfLoadedDocument.Save(Output.pdf); + +using Syncfusion.SmartFormRecognizer; + +// Open the input PDF file as a stream. +using (FileStream inputStream = new FileStream("Input.pdf", FileMode.Open, FileAccess.ReadWrite)) +{ + // Initialize the Smart Form Recognizer. + FormRecognizer recognizer = new FormRecognizer(); + // Recognize the form asynchronously and get the output as a JSON string. + string outputJson = await recognizer.RecognizeFormAsJsonAsync(inputStream); + // Save the extracted JSON data into an output file. + File.WriteAllText("result.json", outputJson); } {% endhighlight %} {% endtabs %} -You can download a complete working sample from [GitHub](https://github.com/SyncfusionExamples/PDF-Examples/tree/master/Data-Extraction/Smart-Form-Recognizer/Recognize-forms-using-Pdf-async/.NET). +You can download a complete working sample from [GitHub](https://github.com/SyncfusionExamples/PDF-Examples/tree/master/Data-Extraction/Smart-Form-Recognizer/Recognize-forms-using-JSON-async/.NET). 
-## Recognize forms using Stream -To recognize form data from a PDF or image and get the output as a `Stream` using the **RecognizeFormAsPdfStream** (synchronous) and **RecognizeFormAsPdfStreamAsync** (asynchronous) methods of the **FormRecognizer** class, refer to the following code examples. + +## Recognize Forms from PDF or Image + +### Recognize forms as PDF + +To recognize form data from a PDF or image and get the output as a `PdfLoadedDocument` using the **RecognizeFormAsPdfDocument** (synchronous) and **RecognizeFormAsPdfDocumentAsync** (asynchronous) methods of the **FormRecognizer** class, refer to the following code examples. Example (synchronous): {% tabs %} {% highlight c# tabtitle="C#" %} -public void Button_Click(object sender, RoutedEventArgs e) + +using Syncfusion.SmartFormRecognizer; +using Syncfusion.Pdf.Parsing; + +// Open the input PDF file as a stream. +using (FileStream inputStream = new FileStream("Input.pdf", FileMode.Open, FileAccess.ReadWrite)) { - //Initialize the Form Recognizer - FormRecognizer smartFormRecognizer = new FormRecognizer(); - //Read the input PDF file as stream - FileStream inputStream = new FileStream("Input.pdf", FileMode.Open, FileAccess.ReadWrite); - //Recognize the form and get the output as PDF stream - Stream outputStream = smartFormRecognizer.RecognizeFormAsPdfStream(inputStream); - //Save the output PDF stream to file - using (FileStream fileStream = File.Create("Output.pdf")) - { - outputStream.Seek(0, SeekOrigin.Begin); - outputStream.CopyTo(fileStream); - } + // Initialize the Smart Form Recognizer. + FormRecognizer recognizer = new FormRecognizer(); + // Recognize the form and get the output as a PDF document. + PdfLoadedDocument document = recognizer.RecognizeFormAsPdfDocument(inputStream); + // Save the recognized document. + document.Save("Output.pdf"); + // Close the document to release resources. 
+ document.Close(); } + {% endhighlight %} {% endtabs %} -You can download a complete working sample from [GitHub](https://github.com/SyncfusionExamples/PDF-Examples/tree/master/Data-Extraction/Smart-Form-Recognizer/Recognize-forms-using-Stream/.NET). +You can download a complete working sample from [GitHub](https://github.com/SyncfusionExamples/PDF-Examples/tree/master/Data-Extraction/Smart-Form-Recognizer/Recognize-forms-using-Pdf/.NET). Example (asynchronous): {% tabs %} {% highlight c# tabtitle="C#" %} -public async void Button_Click(object sender, RoutedEventArgs e) + +using Syncfusion.Pdf.Parsing; +using Syncfusion.SmartFormRecognizer; + +// Open the input PDF file as a stream. +using (FileStream inputStream = new FileStream("Input.pdf", FileMode.Open, FileAccess.ReadWrite)) { - //Initialize the Form Recognizer - FormRecognizer smartFormRecognizer = new FormRecognizer(); - //Read the input PDF file as stream - FileStream inputStream = new FileStream("Input.pdf", FileMode.Open, FileAccess.ReadWrite); - //Recognize the form and get the output as PDF stream - Stream outputStream = await smartFormRecognizer.RecognizeFormAsPdfStreamAsync(inputStream); - //Save the output PDF stream to file - using (FileStream fileStream = File.Create("Output.pdf")) - { - outputStream.Seek(0, SeekOrigin.Begin); - outputStream.CopyTo(fileStream); - } + // Initialize the Smart Form Recognizer. + FormRecognizer recognizer = new FormRecognizer(); + // Recognize the form asynchronously and get the output as a PDF document. + PdfLoadedDocument document = await recognizer.RecognizeFormAsPdfDocumentAsync(inputStream); + // Save the recognized document. + document.Save("Output.pdf"); + // Close the document to release resources. + document.Close(); } + {% endhighlight %} {% endtabs %} -You can download a complete working sample from [GitHub](https://github.com/SyncfusionExamples/PDF-Examples/tree/master/Data-Extraction/Smart-Form-Recognizer/Recognize-forms-using-Stream-async/.NET). 
+You can download a complete working sample from [GitHub](https://github.com/SyncfusionExamples/PDF-Examples/tree/master/Data-Extraction/Smart-Form-Recognizer/Recognize-forms-using-Pdf-async/.NET). -## Recognize forms using JSON -To recognize form data from a PDF or image and get the output as a JSON string using the **RecognizeFormAsJson** (synchronous) and **RecognizeFormAsJsonAsync** (asynchronous) methods of the **FormRecognizer** class, refer to the following code examples. +### Recognize forms as Stream + +To recognize form data from a PDF or image and get the output as a `Stream` using the **RecognizeFormAsPdfStream** (synchronous) and **RecognizeFormAsPdfStreamAsync** (asynchronous) methods of the **FormRecognizer** class, refer to the following code examples. Example (synchronous): {% tabs %} {% highlight c# tabtitle="C#" %} -public void Button_Click(object sender, RoutedEventArgs e) + +using Syncfusion.SmartFormRecognizer; + +// Open the input PDF file as a stream. +using (FileStream inputStream = new FileStream("Input.pdf", FileMode.Open, FileAccess.ReadWrite)) { - //Initialize the Form Recognizer - FormRecognizer smartFormRecognizer = new FormRecognizer(); - //Read the input PDF file as stream - FileStream inputStream = new FileStream("Input.pdf", FileMode.Open, FileAccess.ReadWrite); - //Recognize the form and get the output as JSON string - String outputJson = smartFormRecognizer.RecognizeFormAsJson(inputStream); - //Save the outputJson - File.Create("D:\\result.json").Close(); - File.WriteAllText("D:\\result.json", outputJson); + // Initialize the Smart Form Recognizer. + FormRecognizer recognizer = new FormRecognizer(); + // Recognize the form and get the output as a PDF stream. + using (Stream outputStream = recognizer.RecognizeFormAsPdfStream(inputStream)) + { + // Save the output PDF stream to a file. 
+ using (FileStream fileStream = File.Create("Output.pdf")) + { + outputStream.Seek(0, SeekOrigin.Begin); + outputStream.CopyTo(fileStream); + } + } } + {% endhighlight %} {% endtabs %} -You can download a complete working sample from [GitHub](https://github.com/SyncfusionExamples/PDF-Examples/tree/master/Data-Extraction/Smart-Form-Recognizer/Recognize-forms-using-JSON/.NET). +You can download a complete working sample from [GitHub](https://github.com/SyncfusionExamples/PDF-Examples/tree/master/Data-Extraction/Smart-Form-Recognizer/Recognize-forms-using-Stream/.NET). Example (asynchronous): {% tabs %} {% highlight c# tabtitle="C#" %} -public async void Button_Click(object sender, RoutedEventArgs e) -{ - //Initialize the Form Recognizer - FormRecognizer smartFormRecognizer = new FormRecognizer(); - //Read the input PDF file as stream - FileStream inputStream = new FileStream("Input.pdf", FileMode.Open, FileAccess.ReadWrite); - //Recognize the form and get the output as JSON string - String outputJson = await smartFormRecognizer.RecognizeFormAsJsonAsync(inputStream); - //Save the outputJson - File.Create("D:\\result.json").Close(); - File.WriteAllText("D:\\result.json", outputJson); -} +using Syncfusion.SmartFormRecognizer; +// Open the input PDF file as a stream. +using (FileStream inputStream = new FileStream("Input.pdf", FileMode.Open, FileAccess.ReadWrite)) +{ + // Initialize the Smart Form Recognizer. + FormRecognizer recognizer = new FormRecognizer(); + // Recognize the form asynchronously and get the output as a PDF stream. + using (Stream outputStream = await recognizer.RecognizeFormAsPdfStreamAsync(inputStream)) + { + // Save the output PDF stream to a file. 
+ using (FileStream fileStream = File.Create("Output.pdf")) + { + outputStream.Seek(0, SeekOrigin.Begin); + outputStream.CopyTo(fileStream); + } + } +} {% endhighlight %} {% endtabs %} -You can download a complete working sample from [GitHub](https://github.com/SyncfusionExamples/PDF-Examples/tree/master/Data-Extraction/Smart-Form-Recognizer/Recognize-forms-using-JSON-async/.NET). +You can download a complete working sample from [GitHub](https://github.com/SyncfusionExamples/PDF-Examples/tree/master/Data-Extraction/Smart-Form-Recognizer/Recognize-forms-using-Stream-async/.NET). + ## Async variants with CancellationToken + To recognize form data asynchronously with cancellation support using the **RecognizeFormAsPdfStreamAsync** method of the **FormRecognizer** class, refer to the following code example. Example with cancellation token (PDF stream): @@ -159,34 +190,34 @@ Example with cancellation token (PDF stream): {% tabs %} {% highlight c# tabtitle="C#" %} -public async Task RecognizeWithCancellationAsync() -{ - FormRecognizer recognizer = new FormRecognizer(); - - using FileStream inputStream = new FileStream("Input.pdf", FileMode.Open, FileAccess.Read); - CancellationTokenSource cts = new CancellationTokenSource(); - cts.CancelAfter(TimeSpan.FromSeconds(5)); // cancel in 5 seconds - CancellationToken token = cts.Token; - using Stream resultStream = await recognizer.RecognizeFormAsPdfStreamAsync(inputStream, token); +using Syncfusion.SmartFormRecognizer; - using FileStream fileStream = File.Create("Output.pdf"); - await resultStream.CopyToAsync(fileStream, token); -} +// Open the input PDF file as a stream. +using FileStream inputStream = new FileStream(Path.GetFullPath("Input.pdf"), FileMode.Open, FileAccess.Read); +// Initialize the Smart Form Recognizer. +FormRecognizer recognizer = new FormRecognizer(); +// Create a cancellation token that cancels after 5 seconds. 
+using CancellationTokenSource cts = new CancellationTokenSource(TimeSpan.FromSeconds(5)); +CancellationToken token = cts.Token; +// Recognize the form asynchronously and get the output as a PDF stream. +using Stream resultStream = await recognizer.RecognizeFormAsPdfStreamAsync(inputStream, token); +// Save the output PDF stream to a file. +using FileStream fileStream = File.Create(Path.GetFullPath("Output.pdf")); +await resultStream.CopyToAsync(fileStream, token); {% endhighlight %} {% endtabs %} You can download a complete working sample from [GitHub](https://github.com/SyncfusionExamples/PDF-Examples/tree/master/Data-Extraction/Smart-Form-Recognizer/Asyncvariants-with-CancellationToken/.NET). -# Working with FormRecognizeOptions +## Working with Form Recognize Options `FormRecognizeOptions` provides configurable settings that control how the SmartFormRecognizer detects elements from a document. It allows you to enable or disable the detection of specific form controls such as checkboxes, radio buttons, textboxes, and signatures—while also letting you fine-tune the recognition results using a confidence threshold. Additionally, it supports restricting processing to specific pages through an optional 1‑based inclusive PageRange. By adjusting these options, developers can optimize performance, reduce noise in results, and tailor form extraction precisely to the needs of their application. -## Properties +### Disable Textbox Detection -### DetectTextboxes To disable textbox field detection in the **FormRecognizeOptions** of the **FormRecognizer** class, refer to the following code example. {% tabs %} @@ -199,7 +230,7 @@ recognizer.FormRecognizeOptions.DetectTextboxes = false; {% endhighlight %} {% endtabs %} -### DetectCheckboxes +### Disable Checkbox Detection To disable checkbox detection in the **FormRecognizeOptions** of the **FormRecognizer** class, refer to the following code example. 
{% tabs %} @@ -212,7 +243,7 @@ recognizer.FormRecognizeOptions.DetectCheckboxes = false; {% endhighlight %} {% endtabs %} -### DetectRadioButtons +### Disable Radio Button Detection To disable radio button detection in the **FormRecognizeOptions** of the **FormRecognizer** class, refer to the following code example. {% tabs %} @@ -225,7 +256,7 @@ recognizer.FormRecognizeOptions.DetectRadioButtons = false; {% endhighlight %} {% endtabs %} -### DetectSignatures +### Disable Signature Detection To disable signature field detection in the **FormRecognizeOptions** of the **FormRecognizer** class, refer to the following code example. {% tabs %} @@ -238,7 +269,8 @@ recognizer.FormRecognizeOptions.DetectSignatures = false; {% endhighlight %} {% endtabs %} -### ConfidenceThreshold +### Set Confidence Threshold + To set a minimum confidence score for detected form elements using the **ConfidenceThreshold** in the **FormRecognizeOptions** of the **FormRecognizer** class, refer to the following code example. {% tabs %} @@ -251,7 +283,8 @@ recognizer.FormRecognizeOptions.ConfidenceThreshold = 0.9; {% endhighlight %} {% endtabs %} -### PageRange +### Set Page Range + To specify which pages to process using the **PageRange** in the **FormRecognizeOptions** of the **FormRecognizer** class, refer to the following code example. {% tabs %} diff --git a/Document-Processing/Data-Extraction/NET/tabcontent-support/Extract-Data-in-ASP-NET-Core-VS-Code.md b/Document-Processing/Data-Extraction/NET/tabcontent-support/Extract-Data-in-ASP-NET-Core-VS-Code.md deleted file mode 100644 index 01d8dd6baa..0000000000 --- a/Document-Processing/Data-Extraction/NET/tabcontent-support/Extract-Data-in-ASP-NET-Core-VS-Code.md +++ /dev/null @@ -1,88 +0,0 @@ -**Prerequisites**: - -* Install .NET SDK: Ensure that you have the .NET SDK installed on your system. You can download it from the [.NET Downloads page](https://dotnet.microsoft.com/en-us/download). 
-* Install Visual Studio Code: Download and install Visual Studio Code from the [official website](https://code.visualstudio.com/download). -* Install C# Extension for VS Code: Open Visual Studio Code, go to the Extensions view (Ctrl+Shift+X), and search for 'C#'. Install the official [C# extension provided by Microsoft](https://marketplace.visualstudio.com/items?itemName=ms-dotnettools.csharp). - -Step 1: Open the terminal (Ctrl+` ) and run the following command to create a C# ASP.NET Core Web Application project. - -``` -dotnet new mvc -n ExtractDataASPNETCoreAPP -``` -Step 2: Replace **ExtractDataASPNETCoreAPP** with your desired project name. - -Step 3: Navigate to the project directory using the following command - -``` -cd ExtractDataASPNETCoreAPP -``` -Step 4: Use the following command in the terminal to add the [Syncfusion.SmartDataExtractor.Net.Core ](https://www.nuget.org/packages/Syncfusion.SmartDataExtractor.Net.Core) package to your project. - -``` -dotnet add package Syncfusion.SmartDataExtractor.Net.Core -``` - -Step 5: A default controller with name HomeController.cs gets added on creation of ASP.NET Core project. Include the following namespaces in that HomeController.cs file. - -{% highlight c# tabtitle="C#" %} - - using Syncfusion.SmartDataExtractor; - using System.Diagnostics; - using System.Text; - -{% endhighlight %} - -Step 6: A default action method named Index will be present in HomeController.cs. Right click on Index method and select Go To View where you will be directed to its associated view page Index.cshtml. Add a new button in the Index.cshtml as shown below. - -{% highlight c# tabtitle="C#" %} - -@{ - Html.BeginForm("ExtractData", "Home", FormMethod.Get); - { -
    - -
    - } - Html.EndForm(); -} - -{% endhighlight %} - -Step 7: Add a new action method named ``ExportToJson`` in HomeController.cs and include the below code example to extract data as JSON using the DataExtractor (help.syncfusion.com in Bing) class. Then use the Extract (help.syncfusion.com in Bing) method of the DataExtractor object to process the input and export the results in JSON format. - -{% highlight c# tabtitle="C#" %} - -// Open the input PDF file as a stream. -using (FileStream stream = new FileStream(Path.GetFullPath("Input.pdf"), FileMode.Open, FileAccess.Read)) -{ - // Initialize the Smart Data Extractor. - DataExtractor extractor = new DataExtractor(); - // Extract form data as JSON. - string data = extractor.ExtractDataAsJson(stream); - // Convert JSON string into a MemoryStream for download. - MemoryStream outputStream = new MemoryStream(Encoding.UTF8.GetBytes(data)); - // Reset stream position. - outputStream.Position = 0; - // Return JSON file as download in browser. - FileStreamResult fileStreamResult = new FileStreamResult(outputStream, "application/json"); - fileStreamResult.FileDownloadName = "Output.json"; - return fileStreamResult; -} - -{% endhighlight %} - -Step 8: Build the project. - -Run the following command in terminal to build the project. - -``` -dotnet build -``` - -Step 9: Run the project. - -Run the following command in terminal to build the project. - -``` -dotnet run -``` \ No newline at end of file diff --git a/Document-Processing/Data-Extraction/NET/tabcontent-support/Extract-Data-in-ASP-NET-Core-Visual-Studio.md b/Document-Processing/Data-Extraction/NET/tabcontent-support/Extract-Data-in-ASP-NET-Core-Visual-Studio.md deleted file mode 100644 index adee1125df..0000000000 --- a/Document-Processing/Data-Extraction/NET/tabcontent-support/Extract-Data-in-ASP-NET-Core-Visual-Studio.md +++ /dev/null @@ -1,71 +0,0 @@ -**Prerequisites**: - -* Install .NET SDK: Ensure that you have the .NET SDK installed on your system. 
You can download it from the [.NET Downloads page](https://dotnet.microsoft.com/en-us/download). -* Install Visual Studio: Download and install Visual Studio from the [official website](https://visualstudio.microsoft.com/downloads/). - - - -Step 1: Create a new C# ASP.NET Core Web Application project. - ![Create ASP.NET Core Web application in Visual Studio](Images/ASPNETCoreProjectCreation.png) - -Step 2: In configuration windows, name your project and click Next. - ![Select Web Application pattern](Images/ASPNETCoreConfiguration.png) - ![Select version](Images/ASPNETCoreAdditionalInfo.png) - - -Step 3: Install the [Syncfusion.SmartDataExtractor.Net.Core](https://www.nuget.org/packages/Syncfusion.SmartDataExtractor.Net.Core/) package as reference to your ASP.NET Core applications from [NuGet.org](https://www.nuget.org/). - ![Install SmartDataExtractor .NET Core NuGet package](Images/ASPNETCore_Nuget.png) - -Step 4: A default controller with name HomeController.cs gets added on creation of ASP.NET Core project. Include the following namespaces in that HomeController.cs file. - -{% highlight c# tabtitle="C#" %} - - using Syncfusion.SmartDataExtractor; - using System.Diagnostics; - using System.Text; - -{% endhighlight %} - -Step 5: A default action method named Index will be present in HomeController.cs. Right click on Index method and select Go To View where you will be directed to its associated view page Index.cshtml. Add a new button in the Index.cshtml as shown below. -{% highlight c# tabtitle="C#" %} - -@{ - Html.BeginForm("ExtractData", "Home", FormMethod.Get); - { -
    - -
    - } - Html.EndForm(); -} - -{% endhighlight %} - -Step 6: Add a new action method named ``ExportToJson`` in HomeController.cs and include the below code example to extract data as JSON using the DataExtractor (help.syncfusion.com in Bing) class. Then use the Extract (help.syncfusion.com in Bing) method of the DataExtractor object to process the input and export the results in JSON format. - -{% highlight c# tabtitle="C#" %} - -// Open the input PDF file as a stream. -using (FileStream stream = new FileStream(Path.GetFullPath("Input.pdf"), FileMode.Open, FileAccess.Read)) -{ - // Initialize the Smart Data Extractor. - DataExtractor extractor = new DataExtractor(); - // Extract form data as JSON. - string data = extractor.ExtractDataAsJson(stream); - // Convert JSON string into a MemoryStream for download. - MemoryStream outputStream = new MemoryStream(Encoding.UTF8.GetBytes(data)); - // Reset stream position. - outputStream.Position = 0; - // Return JSON file as download in browser. - FileStreamResult fileStreamResult = new FileStreamResult(outputStream, "application/json"); - fileStreamResult.FileDownloadName = "Output.json"; - return fileStreamResult; -} - -{% endhighlight %} - -Step 7: Build the project. - Click on Build > Build Solution or press Ctrl + Shift + B to build the project. - -Step 8: Run the project. - Click the Start button (green arrow) or press F5 to run the app. diff --git a/Document-Processing/Data-Extraction/NET/tabcontent-support/Extract-Data-in-Console-VS-Code.md b/Document-Processing/Data-Extraction/NET/tabcontent-support/Extract-Data-in-Console-VS-Code.md deleted file mode 100644 index 1080e83596..0000000000 --- a/Document-Processing/Data-Extraction/NET/tabcontent-support/Extract-Data-in-Console-VS-Code.md +++ /dev/null @@ -1,68 +0,0 @@ -**Prerequisites**: - -* Install .NET SDK: Ensure that you have the .NET SDK installed on your system. You can download it from the [.NET Downloads page](https://dotnet.microsoft.com/en-us/download). 
-* Install Visual Studio Code: Download and install Visual Studio Code from the [official website](https://code.visualstudio.com/download). -* Install C# Extension for VS Code: Open Visual Studio Code, go to the Extensions view (Ctrl+Shift+X), and search for 'C#'. Install the official [C# extension provided by Microsoft](https://marketplace.visualstudio.com/items?itemName=ms-dotnettools.csharp). - - - -Step 1: Open the terminal (Ctrl+` ) and run the following command to create a new .NET Core console application project. - -``` -dotnet new console -n ExtractDataConsoleApp -``` -Step 2: Replace ****ExtractDataConsoleApp** with your desired project name. - -Step 3: Navigate to the project directory using the following command - -``` -cd ExtractDataConsoleApp -``` -Step 4: Use the following command in the terminal to add the [Syncfusion.SmartDataExtractor.Net.Core](https://www.nuget.org/packages/Syncfusion.SmartDataExtractor.Net.Core) package to your project. - -``` -dotnet add package Syncfusion.SmartDataExtractor.Net.Core -``` - -Step 5: Include the following namespaces in the *Program.cs* file. - -{% highlight c# tabtitle="C#" %} - -using System.IO; -using Syncfusion.Pdf.Parsing; -using Syncfusion.SmartDataExtractor; - -{% endhighlight %} - -Step 6: Include the below code snippet in *Program.cs* to Extract data from an PDF file. - -{% highlight c# tabtitle="C# [Cross-platform]" %} - -//Open the input PDF file as a stream. -using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess.Read)) -{ - //Initialize the Smart Data Extractor. - DataExtractor extractor = new DataExtractor(); - //Extract data as JSON. - string data = extractor.ExtractDataAsJson(stream); - //Save the extracted JSON data into an output file. - File.WriteAllText("Output.json", data, Encoding.UTF8); -} - -{% endhighlight %} - -Step 7: Build the project. - -Run the following command in terminal to build the project. - -``` -dotnet build -``` - -Step 8: Run the project. 
- -Run the following command in terminal to build the project. - -``` -dotnet run -``` \ No newline at end of file diff --git a/Document-Processing/Data-Extraction/NET/tabcontent-support/Extract-Data-in-Console-Visual-Studio.md b/Document-Processing/Data-Extraction/NET/tabcontent-support/Extract-Data-in-Console-Visual-Studio.md deleted file mode 100644 index f541fcf0ed..0000000000 --- a/Document-Processing/Data-Extraction/NET/tabcontent-support/Extract-Data-in-Console-Visual-Studio.md +++ /dev/null @@ -1,52 +0,0 @@ -**Prerequisites**: - -* Install .NET SDK: Ensure that you have the .NET SDK installed on your system. You can download it from the [.NET Downloads page](https://dotnet.microsoft.com/en-us/download). -* Install Visual Studio: Download and install Visual Studio Code from the [official website](https://code.visualstudio.com/download). - -Step 1: Create a new C# Console Application project. -![Console sample creation](Images/ConsoleCreation.png) - -Step 2: Name the project. -![Name the application](Images/ConsoleName.png) - -Step 3: Install the [Syncfusion.SmartDataExtractor.Net.Core](https://www.nuget.org/packages/Syncfusion.SmartDataExtractor.Net.Core) NuGet package as reference to your .NET Standard applications from [NuGet.org](https://www.nuget.org). -![NET Core NuGet package](Images/ConsoleCoreNuget.png) - -Step 4: Include the following namespaces in the *Program.cs* file. - -{% highlight c# tabtitle="C#" %} - -using System.IO; -using Syncfusion.Pdf.Parsing; -using Syncfusion.SmartDataExtractor; - -{% endhighlight %} - -Step 5: Include the below code snippet in *Program.cs* to Extract data from an PDF file. - -{% tabs %} - -{% highlight c# tabtitle="C# [Cross-platform]" %} - -//Open the input PDF file as a stream. -using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess.Read)) -{ - //Initialize the Smart Data Extractor. - DataExtractor extractor = new DataExtractor(); - //Extract data as JSON. 
- string data = extractor.ExtractDataAsJson(stream); - //Save the extracted JSON data into an output file. - File.WriteAllText("Output.json", data, Encoding.UTF8); -} - -{% endhighlight %} - -{% endtabs %} - -Step 6: Build the project. - -Click on Build > Build Solution or press Ctrl + Shift + B to build the project. - -Step 7: Run the project. - -Click the Start button (green arrow) or press F5 to run the app. \ No newline at end of file