% Conference paper proposing a printed Arabic OCR dataset (CSIT 2018, pp. 150--154).
% Changes from the exported record: author names use "Last, First" form with the
% escaped literal braces removed; case-sensitive title words are brace-protected;
% the `series` field, which duplicated `booktitle` verbatim, was dropped.
% NOTE(review): `address` holds a country, not a publisher city -- confirm and replace.
% NOTE(review): surname spelling "Aikhateeb" kept as exported -- verify against the publisher record.
% NOTE(review): `note` carries export boilerplate (copyright line, conference dates)
% that most styles will print; consider trimming once the target style is known.
@inproceedings{d2ac04d8ac744856bedb19bde55e5590,
  author    = {Doush, Iyad Abu and Aikhateeb, Faisal and Gharibeh, Anwaar Hamdi},
  title     = {{Yarmouk} {Arabic} {OCR} Dataset},
  booktitle = {2018 8th International Conference on Computer Science and Information Technology, CSIT 2018},
  pages     = {150--154},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  address   = {United States},
  year      = {2018},
  month     = oct,
  day       = {8},
  doi       = {10.1109/CSIT.2018.8486162},
  language  = {English},
  keywords  = {Arabic OCR Dataset, Image Dataset, Optical Character Recognition},
  abstract  = {Optical Character Recognition (OCR) is the process of recognizing characters automatically from scanned or image documents. OCR software uses machine learning to recognize characters in the document. Such software needs to pass a training phase to learn how to recognize the letters in the text. In order to implement the training phase the OCR needs to use a standard dataset. The dataset can be used to evaluate the obtained results. In this research, we propose an Arabic printed OCR dataset. To the best of our knowledge, there is no Arabic OCR dataset that is available to be used by the research community with its ground truth with a size that is suitable to build a robust Arabic OCR. The proposed dataset is extracted randomly from Wikipedia to have different topics. It consists of 4,587 Arabic articles with a total of 8,994 images.},
  note      = {Publisher Copyright: {\textcopyright} 2018 IEEE.; 8th International Conference on Computer Science and Information Technology, CSIT 2018 ; Conference date: 11-07-2018 Through 12-07-2018},
}