forked from johnwhitington/camlpdf
-
Notifications
You must be signed in to change notification settings - Fork 0
/
pdftext.mli
156 lines (118 loc) · 4.33 KB
/
pdftext.mli
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
(** Parsing fonts and extracting text from content streams and PDF strings *)
(** {2 Data Types } *)
(** Data specific to a Type 3 font; this record is the payload of the [Type3]
    constructor of [simple_fonttype].

    - [fontbbox]: four floats. Presumably the font bounding box (/FontBBox);
      the ordering of the four values is not stated here -- TODO confirm.
    - [fontmatrix]: a [Pdftransform.transform_matrix]. Presumably the
      /FontMatrix entry -- TODO confirm.
    - [charprocs]: association list from a string to a PDF object. Presumably
      glyph name to glyph description -- TODO confirm.
    - [type3_resources]: a PDF object. Presumably the /Resources entry of the
      font -- TODO confirm.

    NOTE(review): the type name is spelled [glpyhs] (sic). It is part of the
    public interface, so renaming it would break callers. *)
type type3_glpyhs =
{fontbbox : float * float * float * float;
fontmatrix : Pdftransform.transform_matrix;
charprocs : (string * Pdf.pdfobject) list;
type3_resources : Pdf.pdfobject}
(** The kind of a simple font, as stored in the [fonttype] field of
    [simple_font]. Only [Type3] carries data (a [type3_glpyhs] record); the
    other three constructors are plain tags. *)
type simple_fonttype =
| Type1
| MMType1
| Type3 of type3_glpyhs
| Truetype
(** Font metrics, represented as an array of floats. Used as the optional
    [fontmetrics] field of [simple_font].
    NOTE(review): presumably one width per character code; the indexing scheme
    and units are not visible in this interface -- confirm. *)
type fontmetrics = float array
(** A font file reference, tagged by kind. Each constructor carries an [int].
    NOTE(review): presumably the int is the object number of the font stream,
    and the constructors mirror the /FontFile, /FontFile2 and /FontFile3
    descriptor keys -- confirm. *)
type fontfile =
| FontFile of int
| FontFile2 of int
| FontFile3 of int
(** A font descriptor: five float metrics ([ascent], [descent], [leading],
    [avgwidth], [maxwidth]) plus an optional [fontfile]. The [fontfile] field
    is [None] when no font file is recorded.
    NOTE(review): the units of the metrics are not stated in this interface --
    confirm against the implementation. *)
type fontdescriptor =
{ascent : float;
descent : float;
leading : float;
avgwidth : float;
maxwidth : float;
fontfile : fontfile option}
(** An association list of (string, int) pairs, used as the second component
    of [CustomEncoding].
    NOTE(review): presumably (glyph name, character code) pairs in the manner
    of a /Differences array -- confirm. *)
type differences = (string * int) list
(** A font encoding. The type is recursive: [CustomEncoding (base, diffs)]
    pairs another encoding with a [differences] list, and
    [FillUndefinedWithStandard e] wraps another encoding. The remaining
    constructors are plain tags.
    NOTE(review): presumably [diffs] overrides entries of [base], and
    [FillUndefinedWithStandard e] falls back to the standard encoding for codes
    that [e] leaves undefined -- confirm against the implementation. *)
type encoding =
| ImplicitInFontFile
| StandardEncoding
| MacRomanEncoding
| WinAnsiEncoding
| MacExpertEncoding
| CustomEncoding of encoding * differences
| FillUndefinedWithStandard of encoding
(** A simple font; the payload of the [SimpleFont] constructor of [font].

    - [fonttype]: which kind of simple font this is.
    - [basefont]: the base font name, as a string.
    - [fontmetrics]: optional metrics array; [None] when absent.
    - [fontdescriptor]: optional font descriptor; [None] when absent.
    - [encoding]: the encoding of the font. *)
type simple_font =
{fonttype : simple_fonttype;
basefont : string;
fontmetrics : fontmetrics option;
fontdescriptor : fontdescriptor option;
encoding : encoding}
(** The fourteen standard fonts: four Times variants, four Helvetica variants,
    four Courier variants, [Symbol] and [ZapfDingbats]. Converted to and from
    names by [string_of_standard_font] and [standard_font_of_name]. *)
type standard_font =
| TimesRoman
| TimesBold
| TimesItalic
| TimesBoldItalic
| Helvetica
| HelveticaBold
| HelveticaOblique
| HelveticaBoldOblique
| Courier
| CourierBold
| CourierOblique
| CourierBoldOblique
| Symbol
| ZapfDingbats
(** Character collection information for a CID font: two strings ([registry]
    and [ordering]) and an integer [supplement]. Stored in the
    [cid_system_info] field of [composite_CIDfont].
    NOTE(review): presumably mirrors a /CIDSystemInfo dictionary -- confirm. *)
type cid_system_info =
{registry : string;
ordering : string;
supplement : int}
(** A CID font, as carried by the [CIDKeyedFont] constructor of [font].

    - [cid_system_info]: the character collection information.
    - [cid_basefont]: the base font name, as a string.
    - [cid_fontdescriptor]: the font descriptor. Unlike the [fontdescriptor]
      field of [simple_font], this one is not optional.
    - [cid_widths]: association list of (int, float) pairs. Presumably CID to
      width -- TODO confirm.
    - [cid_default_width]: an [int]. Presumably the width used for entries
      missing from [cid_widths] -- TODO confirm. Note that it is an [int]
      whereas the widths in [cid_widths] are floats. *)
type composite_CIDfont =
{cid_system_info : cid_system_info;
cid_basefont : string;
cid_fontdescriptor : fontdescriptor;
cid_widths : (int * float) list;
cid_default_width : int}
(** The encoding of a CID-keyed font: either [Predefined] with a string, or
    [CMap] with an integer referring to a CMap stream.
    NOTE(review): presumably the string is the name of a predefined CMap --
    confirm. *)
type cmap_encoding =
| Predefined of string
| CMap of int (* indirect reference to CMap stream *)
(** The main font datatype, with three cases:

    - [StandardFont (f, enc)]: one of the standard fonts with an encoding.
    - [SimpleFont f]: a simple font described by a [simple_font] record.
    - [CIDKeyedFont (s, cidfont, enc)]: a CID font with its CMap encoding.
      NOTE(review): the meaning of the string [s] is not stated here;
      presumably the base font name -- confirm. *)
type font =
| StandardFont of standard_font * encoding
| SimpleFont of simple_font
| CIDKeyedFont of string * composite_CIDfont * cmap_encoding
(** {2 String representations of fonts } *)
(** [string_of_standard_font f] returns the name of the standard font [f] as a
    string, for example "Times-Bold" for [TimesBold].
    NOTE(review): judging by that example the result carries no leading
    slash -- confirm. *)
val string_of_standard_font : standard_font -> string
(** [standard_font_of_name name] parses a font name such as "/Times-Bold" or
    "/TimesNewRoman,Bold" into the corresponding [standard_font], for example
    [TimesBold]. The result is an [option].
    NOTE(review): presumably [None] is returned when [name] is not recognised
    as a standard font -- confirm. *)
val standard_font_of_name : string -> standard_font option
(** [string_of_font f] returns a string describing the whole font value [f],
    intended for debugging. The exact format is not specified by this
    interface. *)
val string_of_font : font -> string
(** {2 Reading a Font} *)
(** [read_font pdf obj] reads a font from the font object [obj] of the
    document [pdf] and returns it as a [font].
    NOTE(review): whether [obj] may be an indirect reference, and what is
    raised for a malformed font, is not stated in this interface -- confirm
    against the implementation. *)
val read_font : Pdf.t -> Pdf.pdfobject -> font
(** {2 Writing a Font} *)
(** [write_font pdf font] writes [font] to the document [pdf] and returns the
    object number of the main font dictionary.
    NOTE(review): presumably this adds objects to [pdf] in place -- confirm. *)
val write_font : Pdf.t -> font -> int
(** {2 Utility functions} *)
(** [codepoints_of_utf8 s] returns the list of unicode codepoints encoded by
    the UTF8 string [s].
    NOTE(review): behaviour on input that is not valid UTF8 is not stated in
    this interface -- confirm. *)
val codepoints_of_utf8 : string -> int list
(** [utf8_of_codepoints cs] returns the UTF8 string encoding the list of
    unicode codepoints [cs].
    NOTE(review): behaviour for integers that are not valid codepoints is not
    stated in this interface -- confirm. *)
val utf8_of_codepoints : int list -> string
(** {2 Text from strings outside page content} *)
(** [utf8_of_pdfdocstring s] takes a pdf string [s], which will be in either
    pdfdocencoding or UTF16BE, and returns a UTF8 string representing the same
    unicode codepoints. *)
val utf8_of_pdfdocstring : string -> string
(** [pdfdocstring_of_utf8 s] takes a UTF8 string [s] and converts it to a pdf
    string: pdfdocencoding if no unicode-only characters are used, or UTF16BE
    if they are. *)
val pdfdocstring_of_utf8 : string -> string
(** [pdfdocstring_of_codepoints cs] builds a pdf string from the list of
    unicode codepoints [cs]: pdfdocencoding if no unicode-only characters are
    used, or UTF16BE if they are. *)
val pdfdocstring_of_codepoints : int list -> string
(** [codepoints_of_pdfdocstring s] produces the list of unicode codepoints
    represented by [s], a pdf document string in either pdfdocencoding or
    UTF16BE. *)
val codepoints_of_pdfdocstring : string -> int list
(** {2 Text from strings inside page content} *)
(** The abstract type of text extractors. Values are built with
    [text_extractor_of_font] and consumed by [codepoints_of_text] and
    [glyphnames_of_text]; the representation is hidden. *)
type text_extractor
(** [text_extractor_of_font pdf obj] builds a text extractor from the document
    [pdf] and the font object [obj].
    NOTE(review): whether [obj] may be an indirect reference, and what is
    raised for a malformed font, is not stated in this interface -- confirm. *)
val text_extractor_of_font : Pdf.t -> Pdf.pdfobject -> text_extractor
(** [codepoints_of_text extractor s] returns the list of unicode codepoints
    for the string [s] under the given [extractor]. The string is, for
    example, the operand of a [Pdfpages.Op_Tj] or [Op_TJ] operator. *)
val codepoints_of_text : text_extractor -> string -> int list
(** [glyphnames_of_text extractor s] returns the list of glyph names for the
    string [s] under the given [extractor].
    NOTE(review): how character codes without a known glyph name are handled
    is not stated in this interface -- confirm. *)
val glyphnames_of_text : text_extractor -> string -> string list
(** {2 Building text for strings inside page content} *)
(** [charcode_extractor_of_encoding enc] returns a function from a unicode
    codepoint to an optional character code: [Some code] if the codepoint
    exists in the encoding [enc], and [None] if it does not. This is only
    really suitable for simple stuff like the standard 14 fonts, or editing
    text in existing fonts. *)
val charcode_extractor_of_encoding : encoding -> (int -> int option)