(NET)
NET (2019 год)
Tesseract OCR Project template
This is my template for OCR projects. First, you need to add the Tesseract package from NuGet. You also need to download the tessdata files; by the way, you can download them with the Unix utility curl.
After that, you need to place the data in the BIN folder and include it in the project.
And this is a simple test project from the documentation for a quick start.
1: Imports System.Reflection
2: Imports Tesseract
3:
4:
''' <summary>
''' Quick-start console sample: runs Tesseract OCR over a test image and logs the
''' recognized text, the mean confidence and a word-by-word dump of the result.
''' </summary>
Module Module1
    ''' <summary>
    ''' Entry point. Loads the "eng" language data from the "tessdata" folder next to
    ''' the executing assembly, processes the test image and writes the results
    ''' through a <see cref="FormattedConsoleLogger"/>. Any exception is traced and
    ''' printed to the console instead of crashing the process.
    ''' </summary>
    ''' <param name="args">Command-line arguments (not used).</param>
    Public Sub Main(ByVal args As String())
        Dim testImagePath = "upwork-sample-2-1.png"

        Try
            Dim logger = New FormattedConsoleLogger()

            ' CodeBase is a file:// URI. Converting it through Uri.LocalPath yields a
            ' real file-system path (URL escapes such as %20 are decoded and the
            ' separators are correct), instead of stripping "file:\" by hand.
            Dim assemblyPath = New Uri(Assembly.GetExecutingAssembly().CodeBase).LocalPath
            Dim tessdataPath = IO.Path.Combine(IO.Path.GetDirectoryName(assemblyPath), "tessdata")

            Using engine = New TesseractEngine(tessdataPath, "eng", EngineMode.[Default])

                ' Optional engine tuning, left disabled in this template:
                'engine.SetVariable("tessedit_char_whitelist", "1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,")
                'engine.SetVariable("tessedit_unrej_any_wd", True)

                Using img = Pix.LoadFromFile(testImagePath)

                    ' The path is passed as a format argument, not as part of the
                    ' format string, so braces in it cannot break String.Format.
                    Using logger.Begin("Process image {0}", testImagePath)
                        Dim i = 1

                        Using page = engine.Process(img)
                            Dim text = page.GetText()
                            logger.Log("Text: {0}", text)
                            logger.Log("Mean confidence: {0}", page.GetMeanConfidence())

                            Using iter = page.GetIterator()
                                iter.Begin()

                                ' Outer loop: one pass per Next(Para, TextLine) step
                                ' (presumably one text line of the current paragraph
                                ' per pass - confirm against the Tesseract wrapper docs).
                                Do

                                    ' Only even-numbered passes are dumped word by word.
                                    If i Mod 2 = 0 Then

                                        Using logger.Begin("Line {0}", i)

                                            Do

                                                Using logger.Begin("Word Iteration")

                                                    If iter.IsAtBeginningOf(PageIteratorLevel.Block) Then
                                                        logger.Log("New block")
                                                    End If

                                                    If iter.IsAtBeginningOf(PageIteratorLevel.Para) Then
                                                        logger.Log("New paragraph")
                                                    End If

                                                    If iter.IsAtBeginningOf(PageIteratorLevel.TextLine) Then
                                                        logger.Log("New line")
                                                    End If

                                                    ' Recognized text is arbitrary and may contain "{" or "}";
                                                    ' pass it as an argument so it is never parsed as a format string.
                                                    logger.Log("word: {0}", iter.GetText(PageIteratorLevel.Word))
                                                End Using
                                            Loop While iter.[Next](PageIteratorLevel.TextLine, PageIteratorLevel.Word)
                                        End Using
                                    End If

                                    i += 1
                                Loop While iter.[Next](PageIteratorLevel.Para, PageIteratorLevel.TextLine)
                            End Using
                        End Using
                    End Using
                End Using
            End Using

        Catch ex As Exception
            ' Top-level handler of a sample program: report everything and keep the
            ' console window open so the error can be read.
            Trace.TraceError(ex.ToString())
            Console.WriteLine("Unexpected Error: " & ex.Message)
            Console.WriteLine("Details: ")
            Console.WriteLine(ex.ToString())
        End Try

        Console.Write("Press any key to continue . . . ")
        Console.ReadKey(True)
    End Sub

End Module
This project uses the iterator function and a sophisticated formatted console logger.
1: Imports System
2: Imports System.Collections.Generic
3: Imports System.Text
4: Imports Tesseract
5:
6:
''' <summary>
''' Console logger that indents every message according to the depth of the
''' scopes opened with <see cref="Begin"/>. Scopes are kept on a stack and must be
''' disposed in the reverse order of their creation.
''' </summary>
Public Class FormattedConsoleLogger
    ' Indentation unit; it is appended once per nesting level.
    Const Tab As String = " "

    ''' <summary>
    ''' One nesting level of the logger. It pre-computes its indentation prefix and
    ''' pops itself from the owning logger's stack when disposed.
    ''' DisposableBase is not defined in this listing (presumably it comes from the
    ''' imported Tesseract namespace - confirm).
    ''' </summary>
    Private Class Scope
        Inherits DisposableBase

        Private indentLevel As Integer
        Private indent As String
        Private container As FormattedConsoleLogger

        ''' <summary>Creates a scope owned by <paramref name="container"/>.</summary>
        ''' <param name="container">Logger whose scope stack this scope belongs to.</param>
        ''' <param name="indentLevel">Number of Tab units to put in front of each line.</param>
        Public Sub New(ByVal container As FormattedConsoleLogger, ByVal indentLevel As Integer)
            Me.container = container
            Me.indentLevel = indentLevel
            Dim prefix As StringBuilder = New StringBuilder()

            For i As Integer = 0 To indentLevel - 1
                prefix.Append(Tab)
            Next

            Me.indent = prefix.ToString()
        End Sub

        ''' <summary>
        ''' Formats the message and writes it to the console, prefixing every
        ''' non-empty line with this scope's indentation. CRLF, a lone CR and a
        ''' lone LF are each treated as one line break.
        ''' </summary>
        ''' <param name="format">Composite format string.</param>
        ''' <param name="args">Arguments for <paramref name="format"/>.</param>
        Public Sub Log(ByVal format As String, ByVal args As Object())
            Dim message = String.Format(format, args)
            Dim indentedMessage As StringBuilder = New StringBuilder(message.Length + indent.Length * 10)
            Dim i As Integer = 0
            Dim isNewLine As Boolean = True

            While i < message.Length
                Dim current As Char = message(i)

                ' The look-ahead must be bounds-checked: a message that ends with a
                ' lone CR has no character at i + 1.
                If current = ControlChars.Cr AndAlso i + 1 < message.Length AndAlso message(i + 1) = ControlChars.Lf Then
                    indentedMessage.AppendLine()
                    isNewLine = True
                    i += 2
                ElseIf current = ControlChars.Cr OrElse current = ControlChars.Lf Then
                    indentedMessage.AppendLine()
                    isNewLine = True
                    i += 1
                Else

                    ' The indentation is emitted lazily, right before the first
                    ' character of a line, so empty lines stay empty.
                    If isNewLine Then
                        indentedMessage.Append(indent)
                        isNewLine = False
                    End If

                    indentedMessage.Append(current)
                    i += 1
                End If
            End While

            Console.WriteLine(indentedMessage.ToString())
        End Sub

        ''' <summary>Creates a child scope one level deeper (the caller pushes it on the stack).</summary>
        Public Function Begin() As Scope
            Return New Scope(container, indentLevel + 1)
        End Function

        ''' <summary>
        ''' Pops the top scope from the owning logger and verifies that it is this one.
        ''' </summary>
        ''' <exception cref="InvalidOperationException">
        ''' The scope on top of the stack is not this scope (or the stack is empty).
        ''' </exception>
        Protected Overrides Sub Dispose(ByVal disposing As Boolean)
            If disposing Then
                Dim scope = container.scopes.Pop()

                If scope IsNot Me Then
                    Throw New InvalidOperationException("Format scope removed out of order.")
                End If
            End If
        End Sub
    End Class

    ' Currently open scopes, innermost on top.
    Private scopes As Stack(Of Scope) = New Stack(Of Scope)()

    ''' <summary>
    ''' Logs a title at the current level and opens a new scope one level deeper.
    ''' </summary>
    ''' <param name="title">Composite format string of the title.</param>
    ''' <param name="args">Arguments for <paramref name="title"/>.</param>
    ''' <returns>The new scope; dispose it to close the level.</returns>
    Public Function Begin(ByVal title As String, ParamArray args As Object()) As IDisposable
        Log(title, args)
        Dim scope As Scope

        If scopes.Count = 0 Then
            scope = New Scope(Me, 1)
        Else
            scope = ActiveScope.Begin()
        End If

        scopes.Push(scope)
        Return scope
    End Function

    ''' <summary>
    ''' Writes a formatted message, indented by the active scope if there is one.
    ''' </summary>
    ''' <param name="format">Composite format string.</param>
    ''' <param name="args">Arguments for <paramref name="format"/>.</param>
    Public Sub Log(ByVal format As String, ParamArray args As Object())
        If scopes.Count > 0 Then
            ActiveScope.Log(format, args)
        Else
            Console.WriteLine(String.Format(format, args))
        End If
    End Sub

    ''' <summary>The innermost open scope.</summary>
    ''' <exception cref="InvalidOperationException">No scope is open.</exception>
    Private ReadOnly Property ActiveScope As Scope
        Get
            ' Stack.Peek throws on an empty stack rather than returning Nothing,
            ' so the emptiness check has to be made on Count.
            If scopes.Count = 0 Then Throw New InvalidOperationException("No current scope")
            Return scopes.Peek()
        End Get
    End Property
End Class
107:
1: Imports Tesseract
2:
''' <summary>
''' Dumps the state of a result iterator through a <see cref="FormattedConsoleLogger"/>:
''' first whether it is at the beginning of each page level, then the text at each level.
''' </summary>
Public Class ResultPrinter
    ReadOnly logger As FormattedConsoleLogger

    ''' <summary>Creates a printer that writes to <paramref name="logger"/>.</summary>
    ''' <param name="logger">Logger that receives every line.</param>
    Public Sub New(ByVal logger As FormattedConsoleLogger)
        Me.logger = logger
    End Sub

    ''' <summary>
    ''' Logs ten lines for the iterator's current position: five "is beginning of"
    ''' flags followed by five texts, each in block / para / text line / word /
    ''' symbol order.
    ''' </summary>
    ''' <param name="iter">Iterator whose current position is reported.</param>
    Public Sub Print(ByVal iter As ResultIterator)
        ' The three tables are parallel: entry n of each format table belongs to level n.
        Dim levels As PageIteratorLevel() = {
            PageIteratorLevel.Block,
            PageIteratorLevel.Para,
            PageIteratorLevel.TextLine,
            PageIteratorLevel.Word,
            PageIteratorLevel.Symbol
        }
        Dim beginningFormats As String() = {
            "Is beginning of block: {0}",
            "Is beginning of para: {0}",
            "Is beginning of text line: {0}",
            "Is beginning of word: {0}",
            "Is beginning of symbol: {0}"
        }
        Dim textFormats As String() = {
            "Block text: ""{0}""",
            "Para text: ""{0}""",
            "TextLine text: ""{0}""",
            "Word text: ""{0}""",
            "Symbol text: ""{0}"""
        }

        ' All position flags are logged before any text, as two separate passes.
        For n As Integer = 0 To levels.Length - 1
            logger.Log(beginningFormats(n), iter.IsAtBeginningOf(levels(n)))
        Next

        For n As Integer = 0 To levels.Length - 1
            logger.Log(textFormats(n), iter.GetText(levels(n)))
        Next
    End Sub
End Class
The next biggest problem is the quality of the image. For example, in this high-quality image all the text is recognized (except the text on the yellow background!).
But if you have a low-quality image (in the screenshot below you can see the same image in low quality), we get an unexpected result.