1LIBEXTRACTOR(3)            Library Functions Manual            LIBEXTRACTOR(3)
2
3
4

NAME

6       libextractor - meta-information extraction library 0.6.0
7

SYNOPSIS

9       #include <extractor.h>
10
11       const char *EXTRACTOR_metatype_to_string(enum EXTRACTOR_MetaType type);
12
13       const  char  *EXTRACTOR_metatype_to_description(enum EXTRACTOR_MetaType
14       type);
15
16       enum EXTRACTOR_MetaType EXTRACTOR_metatype_get_max(void);
17
18       struct EXTRACTOR_PluginList *EXTRACTOR_plugin_add_defaults(enum EXTRAC‐
19       TOR_Options flags);
20
21       struct   EXTRACTOR_PluginList   *EXTRACTOR_plugin_add  (struct  EXTRAC‐
22       TOR_PluginList * prev, const char * library, const char * options, enum
23       EXTRACTOR_Options flags);
24
25
26       struct  EXTRACTOR_PluginList  *EXTRACTOR_plugin_add_last(struct EXTRAC‐
27       TOR_PluginList *prev, const char *library, const  char  *options,  enum
28       EXTRACTOR_Options flags);
29
30       struct    EXTRACTOR_PluginList   *EXTRACTOR_plugin_add_config   (struct
31       EXTRACTOR_PluginList * prev, const char *config, enum EXTRACTOR_Options
32       flags);
33       struct EXTRACTOR_PluginList *EXTRACTOR_plugin_remove(struct EXTRACTOR_PluginList * prev, const char * library);
34
35       void EXTRACTOR_plugin_remove_all(struct EXTRACTOR_PluginList *plugins);
36
37       void EXTRACTOR_extract(struct EXTRACTOR_PluginList *plugins, const char
38       *filename,  const  void *data, size_t size, EXTRACTOR_MetaDataProcessor
39       proc, void *proc_cls);
40
41       int EXTRACTOR_meta_data_print(void * handle, const  char  *plugin_name,
42       enum  EXTRACTOR_MetaType  type, enum EXTRACTOR_MetaFormat format, const
43       char *data_mime_type, const char *data, size_t data_len);
44
45       EXTRACTOR_VERSION
46
47

DESCRIPTION

49       GNU libextractor is a simple library for  keyword  extraction.   libex‐
50       tractor  does  not  support  all  formats but supports a simple plugin
51       mechanism such that you can quickly add extractors for additional  for‐
52       mats,  even  without  recompiling libextractor.  libextractor typically
53       ships with dozens of plugins that can be used to obtain meta data  from
54       common file-types.  If you want to write your own plugin for some file‐
55       type, all you need to do is write a little library  that  implements  a
56       single method with this signature:
57
58        int  EXTRACTOR_name_extract(const char *data, size_t datasize, EXTRAC‐
59       TOR_MetaDataProcessor proc, void *proc_cls, const char *options);
60
61
62       Data is a pointer to the contents of the file and datasize is the  size
63       of  data.   The  extract  method  must  call proc for meta data that it
64       finds.  The interpretation of options is up to the plugin.   The  func‐
65       tion  should  return 0 if 'proc' always returned 0, otherwise 1.  After
66       'proc' returned a non-zero value, proc should not be called  again.  An
67       example  implementation  can  be  found  in  html_extractor.c.  Plugins
68       should be automatically found and used once they are installed  in  the
69       respective directory (typically something like /usr/lib/libextractor/).
70
71       The application extract gives an example of how to use libextractor.
72
73       The  basic use of libextractor is to load the plugins (for example with
74       EXTRACTOR_plugin_add_defaults), then to extract the keyword list  using
75       EXTRACTOR_extract,  and  finally  to unload the plugins (with EXTRAC‐
76       TOR_plugin_remove_all).
77
78       Textual meta data obtained from libextractor is supposed  to  be  UTF-8
79       encoded if the text encoding is known.  Plugins are supposed to convert
80       meta-data to UTF-8 if necessary.    The EXTRACTOR_meta_data_print func‐
81       tion  converts the UTF-8 keywords to the character set from the current
82       locale before printing them.
83

SEE ALSO

85       extract(1)
86
87

LEGAL NOTICE

89       libextractor  is  released  under  the  GPL  and  is  a  GNU  package
90       (http://www.gnu.org/).
91
92

BUGS

94       A couple of file-formats (on the order of 10^3) are not recognized...
95
96

AUTHORS

98       extract   was   originally   written   by  Christian  Grothoff  <chris‐
99       tian@grothoff.org> and Vidyut Samanta <vids@cs.ucla.edu>.  Use  <libex‐
100       tractor@gnu.org> to contact the current maintainer(s).
101
102

AVAILABILITY

104       You   can   obtain   the   original   author's   latest   version  from
105       http://www.gnu.org/software/libextractor/.
106
107
108
109                                 Dec 14, 2009                  LIBEXTRACTOR(3)
Impressum