libdap++  Updated for version 3.8.2
escaping.cc
Go to the documentation of this file.
00001 
00002 // -*- mode: c++; c-basic-offset:4 -*-
00003 
00004 // This file is part of libdap, A C++ implementation of the OPeNDAP Data
00005 // Access Protocol.
00006 
00007 // Copyright (c) 2002,2003 OPeNDAP, Inc.
00008 // Author: James Gallagher <jgallagher@opendap.org>
00009 //
00010 // This library is free software; you can redistribute it and/or
00011 // modify it under the terms of the GNU Lesser General Public
00012 // License as published by the Free Software Foundation; either
00013 // version 2.1 of the License, or (at your option) any later version.
00014 //
00015 // This library is distributed in the hope that it will be useful,
00016 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00017 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00018 // Lesser General Public License for more details.
00019 //
00020 // You should have received a copy of the GNU Lesser General Public
00021 // License along with this library; if not, write to the Free Software
00022 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
00023 //
00024 // You can contact OPeNDAP, Inc. at PO Box 112, Saunderstown, RI. 02874-0112.
00025 
00026 // Copyright (c) 1996, California Institute of Technology.
00027 // ALL RIGHTS RESERVED.   U.S. Government Sponsorship acknowledged.
00028 //
00029 // Please read the full copyright notice in the file COPYRIGHT_URI
00030 // in this directory.
00031 //
00032 // Author: Todd Karakashian, NASA/Jet Propulsion Laboratory
00033 //         Todd.K.Karakashian@jpl.nasa.gov
00034 //
00035 // $RCSfile: escaping.cc,v $ - Miscellaneous routines for OPeNDAP HDF server
00036 //
00037 // These two routines are for escaping/unescaping strings that are identifiers
00038 // in DAP2
00039 // id2www() -- escape (using WWW hex codes) non-allowable characters in a
00040 // DAP2 identifier
00041 // www2id() -- given a WWW hexcode escaped identifier, restore it
00042 //
00043 // These two routines are for escaping/unescaping strings storing attribute
00044 // values.  They use traditional octal escapes (\nnn) because they are
00045 // intended to be viewed by a user
00046 // escattr() -- escape (using traditional octal backslash) non-allowable
00047 // characters in the value of a DAP2 attribute
00048 // unescattr() -- given an octally escaped string, restore it
00049 //
00050 // These are routines used by the above, not intended to be called directly:
00051 //
00052 // hexstring()
00053 // unhexstring()
00054 // octstring()
00055 // unoctstring()
00056 //
00057 // -Todd
00058 
00059 #include <ctype.h>
00060 
00061 #include <iomanip>
00062 #include <string>
00063 #include <sstream>
00064 
00065 #include "GNURegex.h"
00066 #include "Error.h"
00067 #include "InternalErr.h"
00068 //#define DODS_DEBUG
00069 #include "debug.h"
00070 
00071 using namespace std;
00072 
00073 namespace libdap {
00074 
00075 // The next four functions were originally defined static, but I removed that
00076 // to make testing them (see generalUtilTest.cc) easier to write. 5/7/2001
00077 // jhrg
00078 
/**
 * Format a single byte as hexadecimal text.
 *
 * @param val The byte to format.
 * @return Exactly two lowercase hex digits, zero padded on the left
 * (for example, 10 yields "0a" and 255 yields "ff").
 */
std::string
hexstring(unsigned char val)
{
    std::ostringstream digits;

    // Configure the stream directly instead of chaining manipulators.
    digits.setf(std::ios_base::hex, std::ios_base::basefield);
    digits.fill('0');
    digits.width(2);

    // Widen first so the byte is printed as a number, not as a character.
    digits << static_cast<unsigned int>(val);

    return digits.str();
}
00087 
/**
 * Decode hexadecimal text into the single character it encodes.
 *
 * @param s Text holding a hexadecimal number (normally two hex digits).
 * @return A one-character string holding the decoded value. The result is
 * built from a NUL-terminated buffer, so a decoded value of zero (which is
 * also what unparsable input yields) produces an empty string.
 */
std::string
unhexstring(std::string s)
{
    // Start from a known value: older (pre-C++11) stream implementations leave
    // the target untouched when extraction fails, which would otherwise make
    // the result depend on an uninitialized variable.
    int val = 0;

    std::istringstream ss(s);
    ss >> std::hex >> val;

    char tmp_str[2];
    tmp_str[0] = static_cast<char>(val);
    tmp_str[1] = '\0';
    return std::string(tmp_str);
}
00099 
/**
 * Format a single byte as octal text.
 *
 * @param val The byte to format.
 * @return Exactly three octal digits, zero padded on the left
 * (for example, 10 yields "012" and 255 yields "377").
 */
std::string
octstring(unsigned char val)
{
    std::ostringstream digits;

    // Configure the stream directly instead of chaining manipulators.
    digits.setf(std::ios_base::oct, std::ios_base::basefield);
    digits.fill('0');
    digits.width(3);

    // Widen first so the byte is printed as a number, not as a character.
    digits << static_cast<unsigned int>(val);

    return digits.str();
}
00109 
00110 string
00111 unoctstring(string s)
00112 {
00113     int val;
00114 
00115     istringstream ss(s);
00116     ss >> oct >> val;
00117 
00118     DBG(cerr << "unoctstring: " << val << endl);
00119 
00120     char tmp_str[2];
00121     tmp_str[0] = static_cast<char>(val);
00122     tmp_str[1] = '\0';
00123     return string(tmp_str);
00124 }
00125 
00150 string
00151 id2www(string in, const string &allowable)
00152 {
00153     string::size_type i = 0;
00154     DBG(cerr<<"Input string: [" << in << "]" << endl);
00155     while ((i = in.find_first_not_of(allowable, i)) != string::npos) {
00156         DBG(cerr<<"Found escapee: [" << in[i] << "]");
00157         in.replace(i, 1, "%" + hexstring(in[i]));
00158         DBGN(cerr<<" now the string is: " << in << endl);
00159         i += 3;//i++;
00160     }
00161 
00162     return in;
00163 }
00164 
00175 string
00176 id2www_ce(string in, const string &allowable)
00177 {
00178     return id2www(in, allowable);
00179 
00180 
00181 }
00182 
00217 string
00218 www2id(const string &in, const string &escape, const string &except)
00219 {
00220     string::size_type i = 0;
00221     string res = in;
00222     while ((i = res.find_first_of(escape, i)) != string::npos) {
00223         if (except.find(res.substr(i, 3)) != string::npos) {
00224             i += 3;
00225             continue;
00226         }
00227         res.replace(i, 3, unhexstring(res.substr(i + 1, 2)));
00228         ++i;
00229     }
00230 
00231     return res;
00232 }
00233 
00234 static string
00235 entity(char c)
00236 {
00237     switch (c) {
00238     case '>': return "&gt;";
00239     case '<': return "&lt;";
00240     case '&': return "&amp;";
00241     case '\'': return "&apos;";
00242     case '\"': return "&quot;";
00243     default:
00244         throw InternalErr(__FILE__, __LINE__, "Unrecognized character.");
00245     }
00246 }
00247 
/**
 * Convert a number written in octal digits to the same number written in
 * hexadecimal digits.
 *
 * The original design assumption is that the input is short enough for the
 * value to fit in two hex digits; larger values simply produce more digits.
 *
 * @param octal_digits Text holding an octal number.
 * @return The value in lowercase hex, zero padded to at least two digits.
 * Unparsable input yields "00".
 */
std::string
octal_to_hex(const std::string &octal_digits)
{
    // Start from a known value: older (pre-C++11) stream implementations leave
    // the target untouched when extraction fails, which would otherwise make
    // the result depend on an uninitialized variable.
    int val = 0;

    std::istringstream ss(octal_digits);
    ss >> std::oct >> val;

    std::ostringstream ds;
    ds << std::hex << std::setw(2) << std::setfill('0') << val;
    return ds.str();
}
00262 
00269 string
00270 id2xml(string in, const string &not_allowed)
00271 {
00272     string::size_type i = 0;
00273 
00274     while ((i = in.find_first_of(not_allowed, i)) != string::npos) {
00275         in.replace(i, 1, entity(in[i]));
00276         ++i;
00277     }
00278 #if 0
00279     // Removed the encoding of octal escapes. This function is used by
00280     // AttrTable to encode the stuff that is the value of the <value>
00281     // element in the DDX. The problem is that some of the values are not
00282     // valid UTF-8 and that makes a XML parser gag.; ticket 1512.
00283     // jhrg 3/19/10
00284 
00285     // OK, now scan for octal escape sequences like \\012 (where the '\'
00286     // is itself escaped). This type of attribute value comes from the netCDF
00287     // handler and maybe others. Assumption: The '\' will always appear as
00288     // in its escaped form: '\\'. NB: Both backslashes must be escaped in the
00289     // C++ string.
00290     string octal_escape = "\\\\";
00291     i = 0;
00292     string::size_type length = in.length();
00293     while ((i = in.find(octal_escape, i)) != string::npos) {
00294         // Get the three octal digits following the '\\0'
00295         string::size_type j = i + 2;
00296         if (j + 1 >= length)  // Check that we're not past the end
00297             break;
00298         string octal_digits = in.substr(j, 3);
00299         // convert to a &#xdd; XML escape
00300         string hex_escape = string("&#x");
00301         hex_escape.append(octal_to_hex(octal_digits));
00302         hex_escape.append(string(";"));
00303 
00304         // replace the octal escape with an XML/hex escape
00305         in.replace(i, 5, hex_escape);
00306 
00307         // increment i
00308         i += 6;
00309     }
00310 #endif
00311     return in;
00312 }
00313 
/**
 * Replace the five predefined XML entities (&gt; &lt; &amp; &apos; &quot;)
 * with the characters they stand for.
 *
 * The text is decoded in a single left-to-right pass and decoded output is
 * never examined again. This matters for input such as "&amp;quot;", which
 * must decode to the literal text "&quot;" and not to a double quote.
 *
 * @param in The text to decode (taken by value and edited in place).
 * @return The decoded text. Any '&' that does not start one of the five
 * entities is left as is.
 */
std::string
xml2id(std::string in)
{
    struct Entity {
        const char *text;               // the entity as it appears in XML
        std::string::size_type length;  // number of characters in 'text'
        char value;                     // the character it decodes to
    };
    static const Entity entities[] = {
        { "&gt;",   4, '>'  },
        { "&lt;",   4, '<'  },
        { "&amp;",  5, '&'  },
        { "&apos;", 6, '\'' },
        { "&quot;", 6, '"'  }
    };
    static const std::string::size_type entity_count
        = sizeof(entities) / sizeof(entities[0]);

    std::string::size_type pos = 0;
    while ((pos = in.find('&', pos)) != std::string::npos) {
        for (std::string::size_type k = 0; k < entity_count; ++k) {
            if (in.compare(pos, entities[k].length, entities[k].text) == 0) {
                in.replace(pos, entities[k].length, 1, entities[k].value);
                break;
            }
        }

        // Move past either the decoded character or an unrelated '&'.
        ++pos;
    }

    return in;
}
00345 
/**
 * Replace each '%' and the (up to) two characters that follow it with a
 * single underscore.
 *
 * @param s The text to rewrite (taken by value and edited in place).
 * @return The rewritten text; it contains no '%' characters.
 */
std::string
esc2underscore(std::string s)
{
    std::string::size_type at = s.find('%');

    while (at != std::string::npos) {
        // replace() clamps the count, so a '%' near the end is handled too.
        s.replace(at, 3, "_");

        // Everything up to and including 'at' is now free of '%'.
        at = s.find('%', at + 1);
    }

    return s;
}
00360 
00361 
00365 string
00366 escattr(string s)
00367 {
00368     const string printable = " ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789~`!@#$%^&*()_-+={[}]|\\:;<,>.?/'\"";
00369     const string ESC = "\\";
00370     const string DOUBLE_ESC = ESC + ESC;
00371     const string QUOTE = "\"";
00372     const string ESCQUOTE = ESC + QUOTE;
00373 
00374     // escape non-printing characters with octal escape
00375     string::size_type ind = 0;
00376     while ((ind = s.find_first_not_of(printable, ind)) != s.npos)
00377         s.replace(ind, 1, ESC + octstring(s[ind]));
00378 
00379     // escape \ with a second backslash
00380     ind = 0;
00381     while ((ind = s.find(ESC, ind)) != s.npos) {
00382         s.replace(ind, 1, DOUBLE_ESC);
00383         ind += DOUBLE_ESC.length();
00384     }
00385 
00386     // escape " with backslash
00387     ind = 0;
00388     while ((ind = s.find(QUOTE, ind)) != s.npos) {
00389         s.replace(ind, 1, ESCQUOTE);
00390         ind += ESCQUOTE.length();
00391     }
00392 
00393     return s;
00394 }
00395 
00404 string
00405 unescattr(string s)
00406 {
00407     Regex octal("\\\\[0-3][0-7][0-7]");  // matches 4 characters
00408     Regex esc_quote("\\\\\"");  // matches 3 characters
00409     Regex esc_esc("\\\\\\\\");      // matches 2 characters
00410     const string ESC = "\\";
00411     const string QUOTE = "\"";
00412     int matchlen;
00413     unsigned int index;
00414 
00415     DBG(cerr << "0XX" << s << "XXX" << endl);
00416     // unescape any escaped backslashes
00417     index = esc_esc.search(s.c_str(), s.length(), matchlen, 0);
00418     while (index < s.length()) {
00419         DBG(cerr << "1aXX" << s << "XXX index: " << index << endl);
00420         s.replace(index, 2, ESC);
00421         DBG(cerr << "1bXX" << s << "XXX index: " << index << endl);
00422         index = esc_esc.search(s.c_str(), s.length(), matchlen, 0);
00423     }
00424 
00425     // unescape any escaped double quote characters
00426     index = esc_quote.search(s.c_str(), s.length(), matchlen, 0);
00427     while (index < s.length()) {
00428         s.replace(index, 2, QUOTE);
00429         DBG(cerr << "2XX" << s << "XXX index: " << index << endl);
00430         index = esc_quote.search(s.c_str(), s.length(), matchlen, 0);
00431     }
00432 
00433     // unescape octal characters
00434     index = octal.search(s.c_str(), s.length(), matchlen, 0);
00435     while (index < s.length()) {
00436         s.replace(index, 4, unoctstring(s.substr(index + 1, 3)));
00437         DBG(cerr << "3XX" << s << "XXX index: " << index << endl);
00438         index = octal.search(s.c_str(), s.length(), matchlen, 0);
00439     }
00440 
00441     DBG(cerr << "4XX" << s << "XXX" << endl);
00442     return s;
00443 }
00444 
/**
 * Make an error message safe to embed as a double-quoted string.
 *
 * The message is wrapped in double quotes unless it already starts/ends with
 * one, and every interior double quote that is not already preceded by a
 * backslash gets one. A double quote that is the last character of the
 * incoming message is taken to be the closing quote.
 *
 * Empty messages and a message consisting of a single double quote are
 * handled explicitly; both yield an empty quoted string ("").
 *
 * @param msg The message text.
 * @return The quoted and escaped message; always at least two characters.
 */
std::string
munge_error_message(std::string msg)
{
    // First, add enclosing quotes if needed. The emptiness/size checks keep
    // us from inspecting characters that do not exist and guarantee that the
    // opening and closing quotes are two distinct characters.
    if (msg.empty() || msg[0] != '"')
        msg.insert(msg.begin(), '"');
    if (msg.length() < 2 || msg[msg.length() - 1] != '"')
        msg += '"';

    // Now escape any internal double quotes that aren't escaped. Indices are
    // used instead of iterators because insert() may reallocate the string.
    for (std::string::size_type i = 1; i + 1 < msg.length(); ++i) {
        if (msg[i] == '"' && msg[i - 1] != '\\') {
            msg.insert(i, 1, '\\');
            ++i;    // step onto the quote that was just escaped
        }
    }

    return msg;
}
00462 
/**
 * Put a backslash in front of every double quote in the text. Quotes that
 * are already preceded by a backslash are escaped again all the same.
 *
 * @param source The text to escape (taken by value and edited in place).
 * @return The text with each '"' replaced by backslash + '"'.
 */
std::string
escape_double_quotes(std::string source)
{
    const std::string escaped_quote = "\\\"";   // a backslash and a double quote

    for (std::string::size_type at = source.find('\"');
         at != std::string::npos;
         at = source.find('\"', at + escaped_quote.length())) {
        // The next search starts after the two characters written here, so
        // the quote that was just escaped is not found again.
        source.replace(at, 1, escaped_quote);
    }

    return source;
}
00478 
/**
 * Remove the backslash from every backslash + double quote pair in the text.
 *
 * @param source The text to unescape (taken by value and edited in place).
 * @return The text with each backslash + '"' pair reduced to '"'.
 */
std::string
unescape_double_quotes(std::string source)
{
    const std::string escaped_quote = "\\\"";   // a backslash and a double quote

    std::string::size_type at = source.find(escaped_quote);
    while (at != std::string::npos) {
        // Dropping the backslash leaves the quote itself at position 'at'.
        source.erase(at, 1);

        // Continue after that quote.
        at = source.find(escaped_quote, at + 1);
    }

    return source;
}
00495 
00496 } // namespace libdap
00497