Main Page | Namespace List | Class Hierarchy | Alphabetical List | Class List | Directories | File List | Namespace Members | Class Members | File Members

HTMLparser.h

Go to the documentation of this file.
00001 /*
00002  * HTMLparser.h : interface for an HTML 4.0 non-verifying parser
00003  *
00004  * See Copyright for the status of this software.
00005  *
00006  * daniel@veillard.com
00007  */
00008 
00009 #ifndef __HTML_PARSER_H__
00010 #define __HTML_PARSER_H__
00011 #include <libxml/xmlversion.h>
00012 #include <libxml/parser.h>
00013 
00014 #ifdef __cplusplus
00015 extern "C" {
00016 #endif
00017 
00018 /*
00019  * Most of the back-end structures from XML and HTML are shared.
00020  */
00021 typedef xmlParserCtxt htmlParserCtxt;
00022 typedef xmlParserCtxtPtr htmlParserCtxtPtr;
00023 typedef xmlParserNodeInfo htmlParserNodeInfo;
00024 typedef xmlSAXHandler htmlSAXHandler;
00025 typedef xmlSAXHandlerPtr htmlSAXHandlerPtr;
00026 typedef xmlParserInput htmlParserInput;
00027 typedef xmlParserInputPtr htmlParserInputPtr;
00028 typedef xmlDocPtr htmlDocPtr;
00029 typedef xmlNodePtr htmlNodePtr;
00030 
00031 /*
00032  * Internal description of one HTML element.  The same layout serves both
00033  * HTML 4.01 and XHTML 1.0, which share the same structure.
00034  */
00035 typedef struct _htmlElemDesc htmlElemDesc;
00036 typedef htmlElemDesc *htmlElemDescPtr;
00037 struct _htmlElemDesc {
00038     const char *name;   /* the tag name */
00039     char startTag;      /* whether the start tag can be implied */
00040     char endTag;        /* whether the end tag can be implied */
00041     char saveEndTag;    /* whether the end tag should be saved */
00042     char empty;         /* whether this is an empty element */
00043     char depr;          /* whether this is a deprecated element */
00044     char dtd;           /* 1: only in the Loose DTD, 2: only in the Frameset DTD */
00045     char isinline;      /* 0: block element, 1: inline element */
00046     const char *desc;   /* the description */
00047 
00048 /* NRK Jan.2003
00049  * New fields encapsulating HTML structure.
00050  *
00051  * Known limitations:
00052  *      This is a very limited representation.  It does not tell us when
00053  *      an element *requires* sub-elements (only whether they are allowed),
00054  *      nor where CDATA and PCDATA are allowed.  Element relationships
00055  *      that are not fully represented are flagged with the word MODIFIER.
00056  *      NOTE(review): the const char** lists are presumably NULL-terminated - confirm.
00057  */
00058     const char** subelts;               /* sub-elements allowed in this element */
00059     const char* defaultsubelt;  /* sub-element suggested for auto-repair
00060                                            when one is needed, or NULL */
00061     const char** attrs_opt;             /* optional attributes */
00062     const char** attrs_depr;            /* additional deprecated attributes */
00063     const char** attrs_req;             /* required attributes */
00064 };
00065 
00066 /*
00067  * Internal description of one HTML entity: its value, name and description.
00068  */
00069 typedef struct _htmlEntityDesc htmlEntityDesc;
00070 typedef htmlEntityDesc *htmlEntityDescPtr;
00071 struct _htmlEntityDesc {
00072     unsigned int value; /* the Unicode value of the character */
00073     const char *name;   /* the entity name */
00074     const char *desc;   /* the description */
00075 };
00076 
00077 /*
00078  * There are only a few public functions.
00079  */
00080 XMLPUBFUN const htmlElemDesc * XMLCALL  
00081                         htmlTagLookup   (const xmlChar *tag);
00082 XMLPUBFUN const htmlEntityDesc * XMLCALL        
00083                         htmlEntityLookup(const xmlChar *name);
00084 XMLPUBFUN const htmlEntityDesc * XMLCALL        
00085                         htmlEntityValueLookup(unsigned int value);
00086 
00087 XMLPUBFUN int XMLCALL                   
00088                         htmlIsAutoClosed(htmlDocPtr doc,
00089                                          htmlNodePtr elem);
00090 XMLPUBFUN int XMLCALL                   
00091                         htmlAutoCloseTag(htmlDocPtr doc,
00092                                          const xmlChar *name,
00093                                          htmlNodePtr elem);
00094 XMLPUBFUN const htmlEntityDesc * XMLCALL        
00095                         htmlParseEntityRef(htmlParserCtxtPtr ctxt,
00096                                          const xmlChar **str);
00097 XMLPUBFUN int XMLCALL                   
00098                         htmlParseCharRef(htmlParserCtxtPtr ctxt);
00099 XMLPUBFUN void XMLCALL                  
00100                         htmlParseElement(htmlParserCtxtPtr ctxt);
00101 
00102 XMLPUBFUN htmlParserCtxtPtr XMLCALL     
00103                         htmlCreateMemoryParserCtxt(const char *buffer,
00104                                                    int size);
00105 
00106 XMLPUBFUN int XMLCALL                   
00107                         htmlParseDocument(htmlParserCtxtPtr ctxt);
00108 XMLPUBFUN htmlDocPtr XMLCALL            
00109                         htmlSAXParseDoc (xmlChar *cur,
00110                                          const char *encoding,
00111                                          htmlSAXHandlerPtr sax,
00112                                          void *userData);
00113 XMLPUBFUN htmlDocPtr XMLCALL            
00114                         htmlParseDoc    (xmlChar *cur,
00115                                          const char *encoding);
00116 XMLPUBFUN htmlDocPtr XMLCALL            
00117                         htmlSAXParseFile(const char *filename,
00118                                          const char *encoding,
00119                                          htmlSAXHandlerPtr sax,
00120                                          void *userData);
00121 XMLPUBFUN htmlDocPtr XMLCALL            
00122                         htmlParseFile   (const char *filename,
00123                                          const char *encoding);
00124 XMLPUBFUN int XMLCALL                   
00125                         UTF8ToHtml      (unsigned char *out,
00126                                          int *outlen,
00127                                          const unsigned char *in,
00128                                          int *inlen);
00129 XMLPUBFUN int XMLCALL                   
00130                         htmlEncodeEntities(unsigned char *out,
00131                                          int *outlen,
00132                                          const unsigned char *in,
00133                                          int *inlen, int quoteChar);
00134 XMLPUBFUN int XMLCALL                   
00135                         htmlIsScriptAttribute(const xmlChar *name);
00136 XMLPUBFUN int XMLCALL                   
00137                         htmlHandleOmittedElem(int val);
00138 
00139 #ifdef LIBXML_PUSH_ENABLED
00140 
00143 XMLPUBFUN htmlParserCtxtPtr XMLCALL     
00144                         htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax,
00145                                                  void *user_data,
00146                                                  const char *chunk,
00147                                                  int size,
00148                                                  const char *filename,
00149                                                  xmlCharEncoding enc);
00150 XMLPUBFUN int XMLCALL                   
00151                         htmlParseChunk          (htmlParserCtxtPtr ctxt,
00152                                                  const char *chunk,
00153                                                  int size,
00154                                                  int terminate);
00155 #endif /* LIBXML_PUSH_ENABLED */
00156 
00157 XMLPUBFUN void XMLCALL                  
00158                         htmlFreeParserCtxt      (htmlParserCtxtPtr ctxt);
00159 
00160 /*
00161  * New set of simpler/more flexible APIs
00162  */
00169 typedef enum { /* one bit per option; presumably OR-ed together into an 'options' int - confirm */
00170     HTML_PARSE_NOERROR  = 1<<5, /* 0x020: suppress error reports */
00171     HTML_PARSE_NOWARNING= 1<<6, /* 0x040: suppress warning reports */
00172     HTML_PARSE_PEDANTIC = 1<<7, /* 0x080: pedantic error reporting */
00173     HTML_PARSE_NOBLANKS = 1<<8, /* 0x100: remove blank nodes */
00174     HTML_PARSE_NONET    = 1<<11 /* 0x800: forbid network access */
00175 } htmlParserOption;
00176 
00177 XMLPUBFUN void XMLCALL
00178                 htmlCtxtReset           (htmlParserCtxtPtr ctxt);
00179 XMLPUBFUN int XMLCALL
00180                 htmlCtxtUseOptions      (htmlParserCtxtPtr ctxt,
00181                                          int options);
00182 XMLPUBFUN htmlDocPtr XMLCALL
00183                 htmlReadDoc             (const xmlChar *cur,
00184                                          const char *URL,
00185                                          const char *encoding,
00186                                          int options);
00187 XMLPUBFUN htmlDocPtr XMLCALL
00188                 htmlReadFile            (const char *URL,
00189                                          const char *encoding,
00190                                          int options);
00191 XMLPUBFUN htmlDocPtr XMLCALL
00192                 htmlReadMemory          (const char *buffer,
00193                                          int size,
00194                                          const char *URL,
00195                                          const char *encoding,
00196                                          int options);
00197 XMLPUBFUN htmlDocPtr XMLCALL
00198                 htmlReadFd              (int fd,
00199                                          const char *URL,
00200                                          const char *encoding,
00201                                          int options);
00202 XMLPUBFUN htmlDocPtr XMLCALL
00203                 htmlReadIO              (xmlInputReadCallback ioread,
00204                                          xmlInputCloseCallback ioclose,
00205                                          void *ioctx,
00206                                          const char *URL,
00207                                          const char *encoding,
00208                                          int options);
00209 XMLPUBFUN htmlDocPtr XMLCALL
00210                 htmlCtxtReadDoc         (xmlParserCtxtPtr ctxt,
00211                                          const xmlChar *cur,
00212                                          const char *URL,
00213                                          const char *encoding,
00214                                          int options);
00215 XMLPUBFUN htmlDocPtr XMLCALL
00216                 htmlCtxtReadFile                (xmlParserCtxtPtr ctxt,
00217                                          const char *filename,
00218                                          const char *encoding,
00219                                          int options);
00220 XMLPUBFUN htmlDocPtr XMLCALL
00221                 htmlCtxtReadMemory              (xmlParserCtxtPtr ctxt,
00222                                          const char *buffer,
00223                                          int size,
00224                                          const char *URL,
00225                                          const char *encoding,
00226                                          int options);
00227 XMLPUBFUN htmlDocPtr XMLCALL
00228                 htmlCtxtReadFd          (xmlParserCtxtPtr ctxt,
00229                                          int fd,
00230                                          const char *URL,
00231                                          const char *encoding,
00232                                          int options);
00233 XMLPUBFUN htmlDocPtr XMLCALL
00234                 htmlCtxtReadIO          (xmlParserCtxtPtr ctxt,
00235                                          xmlInputReadCallback ioread,
00236                                          xmlInputCloseCallback ioclose,
00237                                          void *ioctx,
00238                                          const char *URL,
00239                                          const char *encoding,
00240                                          int options);
00241 
00242 /* NRK/Jan2003: further knowledge of HTML structure.
00243  * Status codes; return type of htmlAttrAllowed, htmlElementStatusHere and htmlNodeStatus. */
00244 typedef enum {
00245   HTML_NA = 0 ,         /* something we don't check at all */
00246   HTML_INVALID = 0x1 ,  /* INVALID, DEPRECATED and VALID each use their own bit */
00247   HTML_DEPRECATED = 0x2 ,
00248   HTML_VALID = 0x4 ,
00249   HTML_REQUIRED = 0xc /* 0x8 | HTML_VALID: VALID bit set so ( & HTML_VALID ) is TRUE */
00250 } htmlStatus ;
00251 
00252 /* These take an htmlElemDesc rather than a tag name so that a caller who
00253    already holds the descriptor does not pay for another lookup.
00254    NOTE(review): parameter names are documentation only - confirm against the definitions. */
00255 XMLPUBFUN htmlStatus XMLCALL htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) ;
00256 XMLPUBFUN int XMLCALL htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) ;
00257 XMLPUBFUN htmlStatus XMLCALL htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) ;
00258 XMLPUBFUN htmlStatus XMLCALL htmlNodeStatus(const htmlNodePtr node, int legacy) ;
00265 #define htmlDefaultSubelement(elt) (elt)->defaultsubelt /* elt: pointer expression; yields its defaultsubelt member */
00266 /* htmlElementAllowedHereDesc: calls htmlElementAllowedHere() with parent and elt's name member. */
00276 #define htmlElementAllowedHereDesc(parent,elt) \
00277         htmlElementAllowedHere((parent), (elt)->name)
00278 /* htmlRequiredAttrs: yields the attrs_req member of the descriptor elt points to. */
00284 #define htmlRequiredAttrs(elt) (elt)->attrs_req
00285 
00286 
00287 #ifdef __cplusplus
00288 }
00289 #endif
00290 
00291 #endif /* __HTML_PARSER_H__ */
00292 

Generated on Wed Mar 16 00:10:27 2005 for Dibbler - a portable DHCPv6 by  doxygen 1.3.9.1