org.jruby/jruby-core/9.2.9.0 : org/jruby/util/RegexpSupport.java

RegexpSupport
https://github.com/jruby/jruby/jruby-core: JRuby is the effort to recreate the Ruby (https://www.ruby-lang.org) interpreter in Java. (JRuby)
headius
enebo
wmeissner
BanzaiMan
mkristian
BEGIN LICENSE BLOCK *****
Version: EPL 2.0/GPL 2.0/LGPL 2.1
The contents of this file are subject to the Eclipse Public
License Version 2.0 (the "License"); you may not use this file
except in compliance with the License. You may obtain a copy of
the License at http://www.eclipse.org/legal/epl-v20.html
Software distributed under the License is distributed on an "AS
IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
implied. See the License for the specific language governing
rights and limitations under the License.
Alternatively, the contents of this file may be used under the terms of
either of the GNU General Public License Version 2 or later (the "GPL"),
or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
in which case the provisions of the GPL or the LGPL are applicable instead
of those above. If you wish to allow use of your version of this file only
under the terms of either the GPL or the LGPL, and not to allow others to
use your version of this file under the terms of the EPL, indicate your
decision by deleting the provisions above and replace them with the notice
and other provisions required by the GPL or the LGPL. If you do not delete
the provisions above, a recipient may use your version of this file under
the terms of any one of the EPL, the GPL or the LGPL.
END LICENSE BLOCK /***** BEGIN LICENSE BLOCK *****
 * Version: EPL 2.0/GPL 2.0/LGPL 2.1
 *
 * The contents of this file are subject to the Eclipse Public
 * License Version 2.0 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of
 * the License at http://www.eclipse.org/legal/epl-v20.html
 *
 * Software distributed under the License is distributed on an "AS
 * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * rights and limitations under the License.
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either of the GNU General Public License Version 2 or later (the "GPL"),
 * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the EPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the EPL, the GPL or the LGPL.
 ***** END LICENSE BLOCK *****/

package org.jruby.util;

import org.jcodings.Encoding;
import org.jcodings.specific.ASCIIEncoding;
import org.jcodings.specific.USASCIIEncoding;
import org.jcodings.specific.UTF8Encoding;
import org.jruby.Ruby;

public class RegexpSupport {

    public enum ErrorMode {RAISE, PREPROCESS, DESC}

    Preprocess the given string for use in regexp, raising errors for encoding
incompatibilities that arise.
This version produces a new unescaped version of the string based on
fixes performed while walking.
Params: runtime – current runtime
str – string to preprocess
enc – string's encoding
fixedEnc – new encoding after fixing
mode – mode of errors
Returns: a new unescaped string/**
     * Preprocess the given string for use in regexp, raising errors for encoding
     * incompatibilities that arise.
     *
     * This version produces a new unescaped version of the string based on
     * fixes performed while walking.
     *
     * @param runtime current runtime
     * @param str string to preprocess
     * @param enc string's encoding
     * @param fixedEnc new encoding after fixing
     * @param mode mode of errors
     * @return a new unescaped string
     */
    public static ByteList preprocess(Ruby runtime, ByteList str, Encoding enc, Encoding[] fixedEnc, ErrorMode mode) {
        ByteList to = new ByteList(str.getRealSize());

        if (enc.isAsciiCompatible()) {
            fixedEnc[0] = null;
        } else {
            fixedEnc[0] = enc;
            to.setEncoding(enc);
        }

        boolean hasProperty = unescapeNonAscii(runtime, to, str.getUnsafeBytes(), str.getBegin(), str.getBegin() + str.getRealSize(), enc, fixedEnc, str, mode);
        if (hasProperty && fixedEnc[0] == null) fixedEnc[0] = enc;
        if (fixedEnc[0] != null) to.setEncoding(fixedEnc[0]);
        return to;
    }

    Unescape non-ascii elements in the given string, appending the results
to the given bytelist if provided.
Params: runtime – current runtime
to – output bytelist; if null, no appending will be done
bytes – the bytes to unescape
p – starting position
end – ending position
enc – bytes' encoding
encp – out param for fixed encoding
str – original wrapper for the bytes
mode – error mode
Returns: whether any propery elements were encountered while walking/**
     * Unescape non-ascii elements in the given string, appending the results
     * to the given bytelist if provided.
     *
     * @param runtime current runtime
     * @param to output bytelist; if null, no appending will be done
     * @param bytes the bytes to unescape
     * @param p starting position
     * @param end ending position
     * @param enc bytes' encoding
     * @param encp out param for fixed encoding
     * @param str original wrapper for the bytes
     * @param mode error mode
     * @return whether any propery elements were encountered while walking
     */
    public static boolean unescapeNonAscii(Ruby runtime, ByteList to, byte[] bytes, int p, int end, Encoding enc, Encoding[] encp, ByteList str, ErrorMode mode) {
        boolean hasProperty = false;
        byte[] buf = null;

        while (p < end) {
            int cl = StringSupport.preciseLength(enc, bytes, p, end);
            if (cl <= 0) raisePreprocessError(runtime, str, "invalid multibyte character", mode);
            if (cl > 1 || (bytes[p] & 0x80) != 0) {
                if (to != null) to.append(bytes, p, cl);
                p += cl;
                if (encp[0] == null) {
                    encp[0] = enc;
                } else if (encp[0] != enc) {
                    raisePreprocessError(runtime, str, "non ASCII character in UTF-8 regexp", mode);
                }
                continue;
            }
            int c;
            switch (c = bytes[p++] & 0xff) {
                case '\\':
                    if (p == end) raisePreprocessError(runtime, str, "too short escape sequence", mode);

                    switch (c = bytes[p++] & 0xff) {
                        case '1': case '2': case '3':
                        case '4': case '5': case '6': case '7': /* \O, \OO, \OOO or backref */
                            if (StringSupport.scanOct(bytes, p - 1, end - (p - 1)) <= 0177) {
                                if (to != null) to.append('\\').append(c);
                                break;
                            }

                        case '0': /* \0, \0O, \0OO */
                        case 'x': /* \xHH */
                        case 'c': /* \cX, \c\M-X */
                        case 'C': /* \C-X, \C-\M-X */
                        case 'M': /* \M-X, \M-\C-X, \M-\cX */
                            p -= 2;
                            if (enc == USASCIIEncoding.INSTANCE) {
                                if (buf == null) buf = new byte[1];
                                int pbeg = p;
                                p = readEscapedByte(runtime, buf, 0, bytes, p, end, str, mode);
                                c = buf[0];
                                if (c == (char)-1) return false;
                                if (to != null) {
                                    to.append(bytes, pbeg, p - pbeg);
                                }
                            }
                            else {
                                p = unescapeEscapedNonAscii(runtime, to, bytes, p, end, enc, encp, str, mode);
                            }
                            break;

                        case 'u':
                            if (p == end) raisePreprocessError(runtime, str, "too short escape sequence", mode);
                            if (bytes[p] == (byte)'{') { /* \\u{H HH HHH HHHH HHHHH HHHHHH ...} */
                                p++;
                                p = unescapeUnicodeList(runtime, to, bytes, p, end, encp, str, mode);
                                if (p == end || bytes[p++] != (byte)'}') raisePreprocessError(runtime, str, "invalid Unicode list", mode);
                            } else { /* \\uHHHH */
                                p = unescapeUnicodeBmp(runtime, to, bytes, p, end, encp, str, mode);
                            }
                            break;
                        case 'p': /* \p{Hiragana} */
                            if (encp[0] == null) hasProperty = true;
                            if (to != null) to.append('\\').append(c);
                            break;

                        default:
                            if (to != null) to.append('\\').append(c);
                            break;
                    } // inner switch
                    break;

                default:
                    if (to != null) to.append(c);
            } // switch
        } // while
        return hasProperty;
    }

    public static int raisePreprocessError(Ruby runtime, ByteList str, String err, ErrorMode mode) {
        switch (mode) {
            case RAISE:
                raiseRegexpError19(runtime, str, str.getEncoding(), RegexpOptions.NULL_OPTIONS, err);
            case PREPROCESS:
                throw runtime.newArgumentError("regexp preprocess failed: " + err);
            case DESC:
                // silent ?
        }
        return 0;
    }

    // rb_enc_reg_raise
    public static void raiseRegexpError19(Ruby runtime, ByteList bytes, Encoding enc, RegexpOptions options, String err) {
        // TODO: we loose encoding information here, fix it
        throw runtime.newRegexpError(err + ": " + regexpDescription19(runtime, bytes, options, enc));
    }

    // rb_enc_reg_error_desc
    public static ByteList regexpDescription19(Ruby runtime, ByteList bytes, RegexpOptions options, Encoding enc) {
        return regexpDescription19(runtime, bytes.getUnsafeBytes(), bytes.getBegin(), bytes.getRealSize(), options, enc);
    }

    private static ByteList regexpDescription19(Ruby runtime, byte[] s, int start, int len, RegexpOptions options, Encoding enc) {
        ByteList description = new ByteList();
        description.setEncoding(enc);
        description.append((byte)'/');
        Encoding resultEnc = runtime.getDefaultInternalEncoding();
        if (resultEnc == null) resultEnc = runtime.getDefaultExternalEncoding();

        appendRegexpString19(runtime, description, s, start, len, enc, resultEnc);
        description.append((byte)'/');
        appendOptions(description, options);
        if (options.isEncodingNone()) description.append((byte) 'n');
        return description;
    }

    public static void appendRegexpString19(Ruby runtime, ByteList to, byte[] bytes, int start, int len, Encoding enc, Encoding resEnc) {
        int p = start;
        int end = p + len;
        boolean needEscape = false;
        while (p < end) {
            final int c;
            final int cl;
            if (enc.isAsciiCompatible()) {
                cl = 1;
                c = bytes[p] & 0xff;
            } else {
                cl = StringSupport.preciseLength(enc, bytes, p, end);
                c = enc.mbcToCode(bytes, p, end);
            }

            if (!Encoding.isAscii(c)) {
                p += StringSupport.length(enc, bytes, p, end);
            } else if (c != '/' && enc.isPrint(c)) {
                p += cl;
            } else {
                needEscape = true;
                break;
            }
        }
        if (!needEscape) {
            to.append(bytes, start, len);
        } else {
            boolean isUnicode = enc.isUnicode();
            p = start;
            while (p < end) {
                final int c;
                final int cl;
                if (enc.isAsciiCompatible()) {
                    cl = 1;
                    c = bytes[p] & 0xff;
                } else {
                    cl = StringSupport.preciseLength(enc, bytes, p, end);
                    c = enc.mbcToCode(bytes, p, end);
                }

                if (c == '\\' && p + cl < end) {
                    int n = cl + StringSupport.length(enc, bytes, p + cl, end);
                    to.append(bytes, p, n);
                    p += n;
                    continue;
                } else if (c == '/') {
                    to.append((byte) '\\');
                    to.append(bytes, p, cl);
                } else if (!Encoding.isAscii(c)) {
                    int l = StringSupport.preciseLength(enc, bytes, p, end);
                    if (l <= 0) {
                        l = 1;
                        Sprintf.sprintf(runtime, to, "\\x%02X", c);
                    } else if (resEnc != null) {
                        int code = enc.mbcToCode(bytes, p, end);
                        Sprintf.sprintf(runtime, to , StringSupport.escapedCharFormat(code, isUnicode), code);
                    } else {
                        to.append(bytes, p, l);
                    }
                    p += l;

                    continue;
                } else if (enc.isPrint(c)) {
                    to.append(bytes, p, cl);
                } else if (!enc.isSpace(c)) {
                    Sprintf.sprintf(runtime, to, "\\x%02X", c);
                } else {
                    to.append(bytes, p, cl);
                }
                p += cl;
            }
        }
    }

    // option_to_str
    public static void appendOptions(ByteList to, RegexpOptions options) {
        if (options.isMultiline()) to.append((byte)'m');
        if (options.isIgnorecase()) to.append((byte)'i');
        if (options.isExtended()) to.append((byte)'x');
    }

    public static int readEscapedByte(Ruby runtime, byte[] to, int toP, byte[] bytes, int p, int end, ByteList str, ErrorMode mode) {
        if (p == end || bytes[p++] != (byte)'\\') raisePreprocessError(runtime, str, "too short escaped multibyte character", mode);

        boolean metaPrefix = false, ctrlPrefix = false;
        int code = 0;
        while (true) {
            if (p == end) raisePreprocessError(runtime, str, "too short escape sequence", mode);

            switch (bytes[p++]) {
                case '\\': code = '\\'; break;
                case 'n': code = '\n'; break;
                case 't': code = '\t'; break;
                case 'r': code = '\r'; break;
                case 'f': code = '\f'; break;
                case 'v': code = '\013'; break;
                case 'a': code = '\007'; break;
                case 'e': code = '\033'; break;

            /* \OOO */
                case '0': case '1': case '2': case '3':
                case '4': case '5': case '6': case '7':
                    p--;
                    int olen = end < p + 3 ? end - p : 3;
                    code = StringSupport.scanOct(bytes, p, olen);
                    p += StringSupport.octLength(bytes, p, olen);
                    break;

                case 'x': /* \xHH */
                    int hlen = end < p + 2 ? end - p : 2;
                    code = StringSupport.scanHex(bytes, p, hlen);
                    int len = StringSupport.hexLength(bytes, p, hlen);
                    if (len < 1) raisePreprocessError(runtime, str, "invalid hex escape", mode);
                    p += len;
                    break;

                case 'M': /* \M-X, \M-\C-X, \M-\cX */
                    if (metaPrefix) raisePreprocessError(runtime, str, "duplicate meta escape", mode);
                    metaPrefix = true;
                    if (p + 1 < end && bytes[p++] == (byte)'-' && (bytes[p] & 0x80) == 0) {
                        if (bytes[p] == (byte)'\\') {
                            p++;
                            continue;
                        } else {
                            code = bytes[p++] & 0xff;
                            break;
                        }
                    }
                    raisePreprocessError(runtime, str, "too short meta escape", mode);

                case 'C': /* \C-X, \C-\M-X */
                    if (p == end || bytes[p++] != (byte)'-') raisePreprocessError(runtime, str, "too short control escape", mode);

                case 'c': /* \cX, \c\M-X */
                    if (ctrlPrefix) raisePreprocessError(runtime, str, "duplicate control escape", mode);
                    ctrlPrefix = true;
                    if (p < end && (bytes[p] & 0x80) == 0) {
                        if (bytes[p] == (byte)'\\') {
                            p++;
                            continue;
                        } else {
                            code = bytes[p++] & 0xff;
                            break;
                        }
                    }
                    raisePreprocessError(runtime, str, "too short control escape", mode);
                default:
                    raisePreprocessError(runtime, str, "unexpected escape sequence", mode);
            } // switch

            if (code < 0 || code > 0xff) raisePreprocessError(runtime, str, "invalid escape code", mode);

            if (ctrlPrefix) code &= 0x1f;
            if (metaPrefix) code |= 0x80;

            to[toP] = (byte)code;
            return p;
        } // while
    }

    Unescape escaped non-ascii character at start position, appending all
to the given bytelist if provided.
Params: runtime – current runtime
to – output bytelist; if null, no appending will be done
bytes – incoming bytes
p – start position
end – end position
enc – bytes' encoding
encp – out param for fixed encoding
str – original bytes wrapper
mode – error mode
Returns: new position after performing unescaping/**
     * Unescape escaped non-ascii character at start position, appending all
     * to the given bytelist if provided.
     *
     * @param runtime current runtime
     * @param to output bytelist; if null, no appending will be done
     * @param bytes incoming bytes
     * @param p start position
     * @param end end position
     * @param enc bytes' encoding
     * @param encp out param for fixed encoding
     * @param str original bytes wrapper
     * @param mode error mode
     * @return new position after performing unescaping
     */
    // MRI: unescape_escapted_nonascii
    private static int unescapeEscapedNonAscii(Ruby runtime, ByteList to, byte[]bytes, int p, int end, Encoding enc, Encoding[]encp, ByteList str, ErrorMode mode) {
        byte[]chBuf = new byte[enc.maxLength()];
        int chLen = 0;

        p = readEscapedByte(runtime, chBuf, chLen++, bytes, p, end, str, mode);
        while (chLen < enc.maxLength() && StringSupport.MBCLEN_NEEDMORE_P(StringSupport.preciseLength(enc, chBuf, 0, chLen))) {
            p = readEscapedByte(runtime, chBuf, chLen++, bytes, p, end, str, mode);
        }

        int cl = StringSupport.preciseLength(enc, chBuf, 0, chLen);
        if (cl == -1) {
            raisePreprocessError(runtime, str, "invalid multibyte escape", mode); // MBCLEN_INVALID_P
        }

        if (chLen > 1 || (chBuf[0] & 0x80) != 0) {
            if (to != null) to.append(chBuf, 0, chLen);

            if (encp[0] == null) {
                encp[0] = enc;
            } else if (encp[0] != enc) {
                raisePreprocessError(runtime, str, "escaped non ASCII character in UTF-8 regexp", mode);
            }
        } else {
            if (to != null) Sprintf.sprintf(runtime, to, "\\x%02X", chBuf[0] & 0xff);
        }
        return p;
    }

    Unescape unicode characters at given offset, appending to the given
out buffer if provided.
Params: runtime – current runtime
to – output buffer; if null, no appending will be done
bytes – input bytes
p – start position
end – end position
encp – out param for fixed encoding
str – original bytes wrapper
mode – error mode
Returns: new position after unescaping/**
     * Unescape unicode characters at given offset, appending to the given
     * out buffer if provided.
     *
     * @param runtime current runtime
     * @param to output buffer; if null, no appending will be done
     * @param bytes input bytes
     * @param p start position
     * @param end end position
     * @param encp out param for fixed encoding
     * @param str original bytes wrapper
     * @param mode error mode
     * @return new position after unescaping
     */
    private static int unescapeUnicodeList(Ruby runtime, ByteList to, byte[]bytes, int p, int end, Encoding[]encp, ByteList str, ErrorMode mode) {
        while (p < end && ASCIIEncoding.INSTANCE.isSpace(bytes[p] & 0xff)) p++;

        boolean hasUnicode = false;
        while (true) {
            int code = StringSupport.scanHex(bytes, p, end - p);
            int len = StringSupport.hexLength(bytes, p, end - p);
            if (len == 0) break;
            if (len > 6) raisePreprocessError(runtime, str, "invalid Unicode range", mode);
            p += len;
            if (to != null) appendUtf8(runtime, to, code, encp, str, mode);
            hasUnicode = true;
            while (p < end && ASCIIEncoding.INSTANCE.isSpace(bytes[p] & 0xff)) p++;
        }

        if (!hasUnicode) raisePreprocessError(runtime, str, "invalid Unicode list", mode);
        return p;
    }

    Unescape unicode BMP char at given offset, appending to the specified
buffer if non-null.
Params: runtime – current runtime
to – output buffer; if null, no appending will be done
bytes – input bytes
p – start position
end – end position
encp – out param for fixed encoding
str – original bytes wrapper
mode – error mode
Returns: new position after unescaping/**
     * Unescape unicode BMP char at given offset, appending to the specified
     * buffer if non-null.
     *
     * @param runtime current runtime
     * @param to output buffer; if null, no appending will be done
     * @param bytes input bytes
     * @param p start position
     * @param end end position
     * @param encp out param for fixed encoding
     * @param str original bytes wrapper
     * @param mode error mode
     * @return new position after unescaping
     */
    private static int unescapeUnicodeBmp(Ruby runtime, ByteList to, byte[] bytes, int p, int end, Encoding[] encp, ByteList str, ErrorMode mode) {
        if (p + 4 > end) raisePreprocessError(runtime, str, "invalid Unicode escape", mode);
        int code = StringSupport.scanHex(bytes, p, 4);
        int len = StringSupport.hexLength(bytes, p, 4);
        if (len != 4) raisePreprocessError(runtime, str, "invalid Unicode escape", mode);
        appendUtf8(runtime, to, code, encp, str, mode);
        return p + 4;
    }

    Append the given utf8 characters to the buffer, if given, checking for
errors along the way.
Params: runtime – current runtime
to – output buffer; if null, no appending will be done
code – utf8 character code
enc – output param for new encoding
str – original wrapper of source bytes
mode – error mode/**
     * Append the given utf8 characters to the buffer, if given, checking for
     * errors along the way.
     *
     * @param runtime current runtime
     * @param to output buffer; if null, no appending will be done
     * @param code utf8 character code
     * @param enc output param for new encoding
     * @param str original wrapper of source bytes
     * @param mode error mode
     */
    private static void appendUtf8(Ruby runtime, ByteList to, int code, Encoding[] enc, ByteList str, ErrorMode mode) {
        checkUnicodeRange(runtime, code, str, ErrorMode.PREPROCESS);

        if (code < 0x80) {
            if (to != null) Sprintf.sprintf(runtime, to, "\\x%02X", code);
        } else {
            if (to != null) {
                to.ensure(to.getRealSize() + 6);
                to.setRealSize(to.getRealSize() + Pack.utf8Decode(runtime, to.getUnsafeBytes(), to.getBegin() + to.getRealSize(), code));
            }
            if (enc[0] == null) {
                enc[0] = UTF8Encoding.INSTANCE;
            } else if (!(enc[0].isUTF8())) {
                raisePreprocessError(runtime, str, "UTF-8 character in non UTF-8 regexp", mode);
            }
        }
    }

    private static void checkUnicodeRange(Ruby runtime, int code, ByteList str, ErrorMode mode) {
        // Unicode is can be only 21 bits long, int is enough
        if ((0xd800 <= code && code <= 0xdfff) /* Surrogates */ || 0x10ffff < code) {
            raisePreprocessError(runtime, str, "invalid Unicode range", mode);
        }
    }
}
Params:	runtime – current runtime str – string to preprocess enc – string's encoding fixedEnc – new encoding after fixing mode – mode of errors
Returns:	a new unescaped string
Params:	runtime – current runtime to – output bytelist; if null, no appending will be done bytes – the bytes to unescape p – starting position end – ending position enc – bytes' encoding encp – out param for fixed encoding str – original wrapper for the bytes mode – error mode
Returns:	whether any propery elements were encountered while walking
Params:	runtime – current runtime to – output buffer; if null, no appending will be done bytes – input bytes p – start position end – end position encp – out param for fixed encoding str – original bytes wrapper mode – error mode
Returns:	new position after unescaping
/

org.jruby/ jruby-core/ 9.2.9.0/ org/jruby/util/RegexpSupport.java