/* Copyright (c) 2003, WebThing Ltd Author: Nick Kew This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by he Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #ifndef PARSERFACTORY #define PARSERFACTORY //#define UCS4321 (bom_t) { 4, "ucs-4" } //#define UCS3412 (bom_t) { 4, "ucs-4" } //#define UCS2143 (bom_t) { 4, "ucs-4" } //#define UCS1234 (bom_t) { 4, "ucs-4" } //#define UTF16L (bom_t) { 2, "utf-16" } //#define UTF16B (bom_t) { 2, "utf-16" } //#define UTF8 (bom_t) { 3, "utf-8" } //#define NONE (bom_t) { 0, NULL } #define NONE 0 class ParserFactory { request_rec* r ; Transcoder& trans ; BasicWriter& out ; parsetype sniffed_type ; char* lct ; typedef struct bom_t { size_t bytes ; char* name ; /* const bool operator==(struct bom_t& other) const { return ( name == other.name) ; } const bool operator!=(struct bom_t& other) const { return ( name != other.name) ; } */ } bom_t ; static bom_t UCS4321 ; static bom_t UCS3412 ; static bom_t UCS2143 ; static bom_t UCS1234 ; static bom_t UTF16L ; static bom_t UTF16B ; static bom_t UTF8 ; // static bom_t NONE ; bom_t* bom ; size_t offs ; //typedef enum { NONE, UTF8, UTF16L, UTF16B, UCS1234, UCS4321, UCS2143, UCS3412 } bom_type ; bom_t* read_bom(char* cbuf, size_t buflen) { unsigned char* buf = (unsigned char*) cbuf ; if ( (buf[0] == 0) && (buf[1] == 0) ) if ( (buf[2] == 0xfe) && (buf[3] == 0xff) ) bom = &UCS1234 ; else if ( (buf[2] == 0xff) && (buf[3] == 0xfe) ) bom = &UCS2143 ; else bom = NONE ; else if ( (buf[0] == 0xfe) && (buf[1] == 0xff) ) if ( (buf[2] == 0) && (buf[3] == 0) ) bom = &UCS3412 ; else bom = &UTF16B ; else if ( (buf[0] == 0xff) && (buf[1] == 0xfe) ) if ( (buf[2] == 0) && (buf[3] == 0) ) bom = &UCS4321 ; else bom = &UTF16L ; else if ( (buf[0] == 0xef) && (buf[1] == 0xbb) && (buf[2] == 0xbf) ) bom = &UTF8 ; else bom = NONE ; if ( bom ) { offs = bom->bytes ; trans.set_encoding(bom->name) ; } return bom ; } const void sniff_meta_encoding(char* buf, size_t buflen) const { regmatch_t match[2] ; regex_t* seek_meta= ap_pregcomp(r->pool, "(]*http-equiv[ \t\r\n='\"]*content-type[^>]*>)", REG_EXTENDED|REG_ICASE) ; if ( ap_regexec(seek_meta, buf, 1, match, 0) == 0 ) { char* meta = apr_pstrndup(r->pool, buf + match[0].rm_so, match[0].rm_eo - match[0].rm_so) ; regex_t* seek_charset=ap_pregcomp(r->pool, "charset=([A-Za-z0-9_-]+)", REG_EXTENDED|REG_ICASE) ; if ( ap_regexec(seek_charset, meta, 2, match, 0) == 0 ) trans.set_encoding(apr_pstrndup(r->pool, meta+match[1].rm_so, match[1].rm_eo - match[1].rm_so)) ; ap_pregfree(r->pool, seek_charset) ; } ap_pregfree(r->pool, seek_meta) ; } bool has_xmldecl( char* buf, size_t buflen) { regmatch_t match[1] ; bool ret = false ; const char* str = "<\\?xml[ \t\r\n]*version.*\\?>" ; // "<\?xml[ \t\r\n]*version[ \t\r\n]*=[ \t\r\n]*[\"']1.0[\"'][^\?]*\?>[ \t\r\n]*" regex_t* seek_xmldecl = ap_pregcomp(r->pool, str, 0) ; //"<\?xml[ \t\r\n]*version[ \t\r\n]*=[ \t\r\n]*[\"']1.0[\"'][^\?]*\?>[ \t\r\n]*" , 0) ; if ( ap_regexec(seek_xmldecl, buf + offs, 1, match, 0) == 0 ) { if ( match[0].rm_so == 0 ) ret = true ; // ascii-compatible xmldecl with no BOM // look for encoding char tbr = buf[match[0].rm_eo-2] ; buf[match[0].rm_eo-2] = 0 ; char* encp = strstr(buf, "encoding") ; if ( encp ) { encp += 8 ; while ( *encp && ! isalnum(*encp) ) ++encp ; if ( encp ) { char* endp = strchr(encp, encp[-1]) ; if ( endp ) trans.set_encoding(apr_pstrndup(r->pool, encp, endp-encp) ) ; } } buf[match[0].rm_eo-2] = tbr ; offs += match[0].rm_eo ; } ap_pregfree(r->pool, seek_xmldecl) ; if ( ret && ! trans.encoding() ) trans.set_encoding(UTF8.name) ; return ret ; } const bool is_appendixc(char* buf, size_t buflen) const { regmatch_t match[1] ; char* fpi = 0 ; regex_t* seek_fpi = ap_pregcomp(r->pool, "]+>", REG_EXTENDED|REG_ICASE) ; if ( ap_regexec(seek_fpi, buf, 1, match, 0) == 0 ) fpi = apr_pstrndup(r->pool, buf + match[0].rm_so, match[0].rm_eo - match[0].rm_so) ; ap_pregfree(r->pool, seek_fpi) ; if ( fpi ) { const char* appendixc[] = { "-//W3C//DTD XHTML 1.0 Strict//EN" , "-//W3C//DTD XHTML 1.0 Transitional//EN" , "-//W3C//DTD XHTML 1.0 Frameset//EN" , NULL } ; for ( const char** decl = appendixc; *decl; ++decl ) if ( strstr(fpi, *decl) != NULL ) return true ; } return false ; } const bool is_xml_ctype() const { const char* types[] = { "application/smil" , "text/vnd.iptc.newsml" , "text/vnd.in3d.3dml" , NULL } ; if ( ! lct ) return false ; if ( strstr(lct, "xml") != 0 ) return true ; if ( strstr(lct, "vnd.wap") != 0 ) return true ; for (int i = 0; types[i]; ++i) if ( ! strcmp(lct, types[i]) ) return true ; return false ; } const bool is_sgml_ctype() const { const char* types[] = { NULL } ; if ( ! lct ) return false ; if ( strstr(lct, "sgml") != 0 ) return true ; for (int i = 0; types[i]; ++i) if ( ! strcmp(lct, types[i]) ) return true ; return false ; } public: ParserFactory(request_rec* rec, Transcoder& t, BasicWriter& w) : r(rec) , trans(t), out(w), sniffed_type(UNSET), lct(0), bom(NONE), offs(0) { } ParserFactory(request_rec* rec, Transcoder& t, BasicWriter& w, ApacheHTTP& http) : r(rec) , trans(t), out(w), sniffed_type(UNSET), bom(NONE), offs(0) { lct = apr_pstrdup(r->pool, http.content_type()) ; for ( char* c = lct; *c; ++c) if ( isupper(*c) ) *c = tolower(*c) ; } const char* xmlDecl() const { // HSivonen bug report - need to include other clauses in this return "" ; } #ifdef HAVE_UPLOAD private: bool sniff_doctype(char* buf, size_t buflen) { regex_t* seek_doctype = ap_pregcomp(r->pool, "pool, seek_doctype) ; return ret ; } parsetype sniff_quiet_(char* buf, size_t buflen) { if ( ( read_bom(buf, buflen) != NONE ) || has_xmldecl(buf, buflen) ) return XML ; sniff_meta_encoding(buf, buflen) ; if ( is_appendixc(buf, buflen) ) { if ( ! trans.encoding() ) trans.set_encoding("utf-8") ; return XHTML ; } if ( trans.encoding() ) return HTML ; trans.set_encoding("ascii") ; if ( sniff_doctype(buf, buflen) ) return SGML ; else { out.puts("Document doesn't look like any recognised markup type.") ; return UNSET ; } } public: parsetype sniff_quiet(char* buf, size_t buflen) { char last = buf[buflen-1] ; if ( buflen < BUFLEN ) buf[buflen] = 0 ; else buf[buflen-1] = 0 ; sniffed_type = sniff_quiet_(buf, buflen) ; buf[buflen-1] = last ; return sniffed_type ; } #endif parsetype sniff_doc(char* buf, size_t buflen) { char last = buf[buflen-1] ; if ( buflen < BUFLEN ) buf[buflen] = 0 ; else buf[buflen-1] = 0 ; if ( !strcmp( lct, "text/html" ) ) { sniffed_type = HTML ; if ( ! trans.encoding() ) sniff_meta_encoding(buf, buflen) ; if ( ! trans.encoding() ) { out.puts("No charset specified, but W3C rules for text/html require either an HTTP header or a META hack. I'll try provisional validation with the HTTP default iso-8859-1") ; trans.set_encoding("iso-8859-1") ; // HTTP default } if ( is_appendixc(buf, buflen) ) { out.puts("XHTML document served as text/html - parsing as XHTML under Appendix C rules") ; sniffed_type = XHTML ; } else if ( ( read_bom(buf, buflen) != NONE ) || has_xmldecl(buf, buflen) ) { out.puts("Start of document looks like XML but is not recognised as XHTML 1.0. This should not be served as text/html") ; sniffed_type = XML ; } } else if ( is_xml_ctype() ) { sniffed_type = XML ; if ( ( read_bom(buf, buflen) == NONE ) && !has_xmldecl(buf, buflen)) out.puts("Content type suggests XML, but the document doesn't look like XML") ; } else if ( is_sgml_ctype() ) { sniffed_type = SGML ; if ( ( read_bom(buf, buflen) != NONE ) && has_xmldecl(buf, buflen)) out.puts("Content type suggests SGML, but the document looks like XML") ; } else { if ( ( read_bom(buf, buflen) == NONE ) && !has_xmldecl(buf, buflen)) { out.puts("Content type ").escape(lct) .puts(" is not recognised as markup, nor does the document look like it. I won't try to validate this.") ; sniffed_type = UNSET ; } else { out.puts("Content type ").escape(lct) .puts(" is not recognised as markup. However, the document looks like XML, so I'll try validating as that.") ; sniffed_type = XML ; } } if ( ! trans.encoding() ) { if (sniffed_type == XML) // http.set_encoding("xml") ; trans.set_encoding("utf-8") ; else if (sniffed_type == XHTML) { out.puts("Since the document is served as text/html, HTML rules take precedence over XML rules. But I cannot determine charset under HTML rules. You should fix your server to set the charset explicitly, or serve the document as an XML type.") ; trans.set_encoding("utf-8") ; } else { out.puts("No charset specified in HTTP, and neither XML nor HTML rules apply. I'll use the HTTP default, iso-8859-1.") ; trans.set_encoding("iso-8859-1") ; // HTTP default } } buf[buflen-1] = last ; return sniffed_type ; } ApacheValidator* selectParser(int resultsMode, apr_table_t* args) const { validator_conf* conf = (validator_conf*) ap_get_module_config(r->per_dir_config, &validator_module) ; parser p = NULL_PARSER ; //conf->defaultparser ; parser_rec* pr = 0 ; if ( lct ) for ( pr = conf->plist; pr; pr = pr->next ) if ( ! strcmp ( lct, pr->ctype )) break ; parser dflt = pr ? pr->preferred : conf->defaultparser ; parsers ok = pr ? pr->allowed : conf->defaultallowed ; const char* pselected = args ? apr_table_get(args, "parser") : getArg(r->pool, r->args, "parser") ; if ( pselected ) if ( !strcmp(pselected, "OpenSP") ) p = OpenSP_ ; else if ( !strcmp(pselected, "Xerces") ) p = Xerces_ ; if ( ! (p & ok) ) { if ( ( p != NULL_PARSER ) ) out.puts("Parser ").puts(pselected) .puts(" is not allowed for content type ") .escape(lct).puts(" by server configuration") ; p = dflt ; } if ( ( p == Xerces_ ) && ( (sniffed_type == HTML) || (sniffed_type == SGML) ) ) { out.puts("Xerces cannot parse HTML or SGML documents; using OpenSP") ; p = OpenSP_ ; } if ( sniffed_type == UNSET ) { p = NULL_PARSER ; } switch ( p ) { case OpenSP_ : return new OpenSPValidator(r, out, resultsMode, sniffed_type, args) ; case Xerces_ : return new XercesValidator(r, out, resultsMode, args) ; default: out.puts("Document type is not supported by this service (no parser available by configuration).") ; return 0 ; } } const size_t xml_bytes() const { return offs ; } } ; ParserFactory::bom_t ParserFactory::UCS4321 = { 4, "ucs4" } ; ParserFactory::bom_t ParserFactory::UCS3412 = { 4, "ucs4" } ; ParserFactory::bom_t ParserFactory::UCS2143 = { 4, "ucs4" } ; ParserFactory::bom_t ParserFactory::UCS1234 = { 4, "ucs-4be" } ; ParserFactory::bom_t ParserFactory::UTF16L = { 2, "utf16" } ; ParserFactory::bom_t ParserFactory::UTF16B = { 2, "utf16BE" } ; ParserFactory::bom_t ParserFactory::UTF8 = { 3, "utf8" } ; //ParserFactory::bom_t ParserFactory::NONE = { 0, NULL } ; #endif