Mega Code Archive

URL utilities class that makes it easy to create new URLs based off of old URLs without having to assemble or parse them yourself.

/*
 * Copyright (c) 2002-2009 Gargoyle Software Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import java.net.MalformedURLException;
import java.net.URL;
import java.util.List;

/**
 * URL utilities class that makes it easy to create new URLs based off of old URLs
 * without having to assemble or parse them yourself.
 *
 * @version $Revision: 4387 $
 * @author Daniel Gredler
 * @author Martin Tamme
 * @author Sudhan Moghe
 */
public final class UrlUtils {

    /**
     * Disallow instantiation of this class.
     */
    private UrlUtils() {
        // Empty.
    }

    /**
     * Creates and returns a new URL identical to the specified URL, except using the specified protocol.
     * @param u the URL on which to base the returned URL
     * @param newProtocol the new protocol to use in the returned URL
     * @return a new URL identical to the specified URL, except using the specified protocol
     * @throws MalformedURLException if there is a problem creating the new URL
     */
    public static URL getUrlWithNewProtocol(final URL u, final String newProtocol) throws MalformedURLException {
        return createNewUrl(newProtocol, u.getHost(), u.getPort(), u.getPath(), u.getRef(), u.getQuery());
    }

    /**
     * Creates and returns a new URL identical to the specified URL, except using the specified host.
     * @param u the URL on which to base the returned URL
     * @param newHost the new host to use in the returned URL
     * @return a new URL identical to the specified URL, except using the specified host
     * @throws MalformedURLException if there is a problem creating the new URL
     */
    public static URL getUrlWithNewHost(final URL u, final String newHost) throws MalformedURLException {
        return createNewUrl(u.getProtocol(), newHost, u.getPort(), u.getPath(), u.getRef(), u.getQuery());
    }

    /**
     * Creates and returns a new URL identical to the specified URL, except using the specified port.
     * @param u the URL on which to base the returned URL
     * @param newPort the new port to use in the returned URL
     * @return a new URL identical to the specified URL, except using the specified port
     * @throws MalformedURLException if there is a problem creating the new URL
     */
    public static URL getUrlWithNewPort(final URL u, final int newPort) throws MalformedURLException {
        return createNewUrl(u.getProtocol(), u.getHost(), newPort, u.getPath(), u.getRef(), u.getQuery());
    }

    /**
     * Creates and returns a new URL identical to the specified URL, except using the specified path.
     * @param u the URL on which to base the returned URL
     * @param newPath the new path to use in the returned URL
     * @return a new URL identical to the specified URL, except using the specified path
     * @throws MalformedURLException if there is a problem creating the new URL
     */
    public static URL getUrlWithNewPath(final URL u, final String newPath) throws MalformedURLException {
        return createNewUrl(u.getProtocol(), u.getHost(), u.getPort(), newPath, u.getRef(), u.getQuery());
    }

    /**
     * Creates and returns a new URL identical to the specified URL, except using the specified reference.
     * @param u the URL on which to base the returned URL
     * @param newRef the new reference to use in the returned URL
     * @return a new URL identical to the specified URL, except using the specified reference
     * @throws MalformedURLException if there is a problem creating the new URL
     */
    public static URL getUrlWithNewRef(final URL u, final String newRef) throws MalformedURLException {
        return createNewUrl(u.getProtocol(), u.getHost(), u.getPort(), u.getPath(), newRef, u.getQuery());
    }

    /**
     * Creates and returns a new URL identical to the specified URL, except using the specified query string.
     * @param u the URL on which to base the returned URL
     * @param newQuery the new query string to use in the returned URL
     * @return a new URL identical to the specified URL, except using the specified query string
     * @throws MalformedURLException if there is a problem creating the new URL
     */
    public static URL getUrlWithNewQuery(final URL u, final String newQuery) throws MalformedURLException {
        return createNewUrl(u.getProtocol(), u.getHost(), u.getPort(), u.getPath(), u.getRef(), newQuery);
    }

    /**
     * Creates a new URL based on the specified fragments.
     * @param protocol the protocol to use (may not be {@code null})
     * @param host the host to use (may not be {@code null})
     * @param port the port to use (may be {@code -1} if no port is specified)
     * @param path the path to use (may be {@code null} and may omit the initial {@code '/'})
     * @param ref the reference to use (may be {@code null}; a leading {@code '#'} is tolerated
     *        and not duplicated)
     * @param query the query to use (may be {@code null} and must not include the {@code '?'})
     * @return a new URL based on the specified fragments
     * @throws MalformedURLException if there is a problem creating the new URL
     */
    private static URL createNewUrl(final String protocol, final String host, final int port,
            final String path, final String ref, final String query) throws MalformedURLException {
        final StringBuilder s = new StringBuilder();
        s.append(protocol);
        s.append("://");
        s.append(host);
        if (port != -1) {
            s.append(":").append(port);
        }
        if (path != null && path.length() > 0) {
            if (!path.startsWith("/")) {
                s.append("/");
            }
            s.append(path);
        }
        if (query != null) {
            s.append("?").append(query);
        }
        if (ref != null) {
            if (!ref.startsWith("#")) {
                s.append("#");
            }
            s.append(ref);
        }
        return new URL(s.toString());
    }

    /**
     * Resolves a given relative URL against a base URL. See
     * <a href="http://www.faqs.org/rfcs/rfc1808.html">RFC1808</a>
     * Section 4 for more details. Both arguments are trimmed before processing.
     *
     * @param baseUrl The base URL in which to resolve the specification.
     * @param relativeUrl The relative URL to resolve against the base URL.
     * @return the resolved specification.
     * @throws IllegalArgumentException if either argument is {@code null}
     */
    public static String resolveUrl(final String baseUrl, final String relativeUrl) {
        if (baseUrl == null) {
            throw new IllegalArgumentException("Base URL must not be null");
        }
        if (relativeUrl == null) {
            throw new IllegalArgumentException("Relative URL must not be null");
        }
        final Url url = resolveUrl(parseUrl(baseUrl.trim()), relativeUrl.trim());
        return url.toString();
    }

    /**
     * Resolves a given relative URL against a base URL. See
     * <a href="http://www.faqs.org/rfcs/rfc1808.html">RFC1808</a>
     * Section 4 for more details.
     *
     * @param baseUrl The base URL in which to resolve the specification.
     * @param relativeUrl The relative URL to resolve against the base URL.
     * @return the resolved specification.
     * @throws IllegalArgumentException if either argument is {@code null}
     */
    public static String resolveUrl(final URL baseUrl, final String relativeUrl) {
        if (baseUrl == null) {
            throw new IllegalArgumentException("Base URL must not be null");
        }
        return resolveUrl(baseUrl.toExternalForm(), relativeUrl);
    }

    /**
     * Parses a given specification using the algorithm depicted in
     * <a href="http://www.faqs.org/rfcs/rfc1808.html">RFC1808</a>:
     *
     * Section 2.4: Parsing a URL
     *
     * An accepted method for parsing URLs is useful to clarify the
     * generic-RL syntax of Section 2.2 and to describe the algorithm for
     * resolving relative URLs presented in Section 4. This section
     * describes the parsing rules for breaking down a URL (relative or
     * absolute) into the component parts described in Section 2.1. The
     * rules assume that the URL has already been separated from any
     * surrounding text and copied to a "parse string". The rules are
     * listed in the order in which they would be applied by the parser.
     *
     * Rather than physically shortening a "parse string", this implementation
     * narrows a [startIndex, endIndex) window over {@code spec}.
     *
     * @param spec The specification to parse.
     * @return the parsed specification.
     */
    private static Url parseUrl(final String spec) {
        final Url url = new Url();
        int startIndex = 0;
        int endIndex = spec.length();

        // Section 2.4.1: Parsing the Fragment Identifier
        //
        //   If the parse string contains a crosshatch "#" character, then the
        //   substring after the first (left-most) crosshatch "#" and up to the
        //   end of the parse string is the <fragment> identifier. If the
        //   crosshatch is the last character, or no crosshatch is present, then
        //   the fragment identifier is empty. The matched substring, including
        //   the crosshatch character, is removed from the parse string before
        //   continuing.
        //
        //   Note that the fragment identifier is not considered part of the URL.
        //   However, since it is often attached to the URL, parsers must be able
        //   to recognize and set aside fragment identifiers as part of the
        //   process.
        final int crosshatchIndex = StringUtils.indexOf(spec, '#', startIndex, endIndex);
        if (crosshatchIndex >= 0) {
            url.fragment_ = spec.substring(crosshatchIndex + 1, endIndex);
            endIndex = crosshatchIndex;
        }

        // Section 2.4.2: Parsing the Scheme
        //
        //   If the parse string contains a colon ":" after the first character
        //   and before any characters not allowed as part of a scheme name (i.e.,
        //   any not an alphanumeric, plus "+", period ".", or hyphen "-"), the
        //   <scheme> of the URL is the substring of characters up to but not
        //   including the first colon. These characters and the colon are then
        //   removed from the parse string before continuing.
        final int colonIndex = StringUtils.indexOf(spec, ':', startIndex, endIndex);
        if (colonIndex > 0) {
            final String scheme = spec.substring(startIndex, colonIndex);
            if (isValidScheme(scheme)) {
                url.scheme_ = scheme;
                startIndex = colonIndex + 1;
            }
        }

        // Section 2.4.3: Parsing the Network Location/Login
        //
        //   If the parse string begins with a double-slash "//", then the
        //   substring of characters after the double-slash and up to, but not
        //   including, the next slash "/" character is the network location/login
        //   (<net_loc>) of the URL. If no trailing slash "/" is present, the
        //   entire remaining parse string is assigned to <net_loc>. The double-
        //   slash and <net_loc> are removed from the parse string before
        //   continuing.
        //
        // Note: We also accept a question mark "?" or a semicolon ";" character as
        //       delimiters for the network location/login (<net_loc>) of the URL.
        final int locationStartIndex;
        int locationEndIndex;
        if (spec.startsWith("//", startIndex)) {
            locationStartIndex = startIndex + 2;
            locationEndIndex = StringUtils.indexOf(spec, '/', locationStartIndex, endIndex);
            if (locationEndIndex >= 0) {
                startIndex = locationEndIndex;
            }
        }
        else {
            locationStartIndex = -1;
            locationEndIndex = -1;
        }

        // Section 2.4.4: Parsing the Query Information
        //
        //   If the parse string contains a question mark "?" character, then the
        //   substring after the first (left-most) question mark "?" and up to the
        //   end of the parse string is the <query> information. If the question
        //   mark is the last character, or no question mark is present, then the
        //   query information is empty. The matched substring, including the
        //   question mark character, is removed from the parse string before
        //   continuing.
        final int questionMarkIndex = StringUtils.indexOf(spec, '?', startIndex, endIndex);
        if (questionMarkIndex >= 0) {
            if ((locationStartIndex >= 0) && (locationEndIndex < 0)) {
                // The substring of characters after the double-slash and up to, but not
                // including, the question mark "?" character is the network location/login
                // (<net_loc>) of the URL.
                locationEndIndex = questionMarkIndex;
                startIndex = questionMarkIndex;
            }
            url.query_ = spec.substring(questionMarkIndex + 1, endIndex);
            endIndex = questionMarkIndex;
        }

        // Section 2.4.5: Parsing the Parameters
        //
        //   If the parse string contains a semicolon ";" character, then the
        //   substring after the first (left-most) semicolon ";" and up to the end
        //   of the parse string is the parameters (<params>). If the semicolon
        //   is the last character, or no semicolon is present, then <params> is
        //   empty. The matched substring, including the semicolon character, is
        //   removed from the parse string before continuing.
        final int semicolonIndex = StringUtils.indexOf(spec, ';', startIndex, endIndex);
        if (semicolonIndex >= 0) {
            if ((locationStartIndex >= 0) && (locationEndIndex < 0)) {
                // The substring of characters after the double-slash and up to, but not
                // including, the semicolon ";" character is the network location/login
                // (<net_loc>) of the URL.
                locationEndIndex = semicolonIndex;
                startIndex = semicolonIndex;
            }
            url.parameters_ = spec.substring(semicolonIndex + 1, endIndex);
            endIndex = semicolonIndex;
        }

        // Section 2.4.6: Parsing the Path
        //
        //   After the above steps, all that is left of the parse string is the
        //   URL <path> and the slash "/" that may precede it. Even though the
        //   initial slash is not part of the URL path, the parser must remember
        //   whether or not it was present so that later processes can
        //   differentiate between relative and absolute paths. Often this is
        //   done by simply storing the preceding slash along with the path.
        if ((locationStartIndex >= 0) && (locationEndIndex < 0)) {
            // The entire remaining parse string is assigned to the network
            // location/login (<net_loc>) of the URL.
            locationEndIndex = endIndex;
        }
        else if (startIndex < endIndex) {
            url.path_ = spec.substring(startIndex, endIndex);
        }

        // Set the network location/login (<net_loc>) of the URL.
        if ((locationStartIndex >= 0) && (locationEndIndex >= 0)) {
            url.location_ = spec.substring(locationStartIndex, locationEndIndex);
        }
        return url;
    }

    /**
     * Returns {@code true} if the specified string is a valid scheme name: a letter
     * followed by any number of letters, digits, {@code '.'}, {@code '+'} or {@code '-'}.
     *
     * @param scheme the candidate scheme name (must not be {@code null})
     * @return whether the candidate is a valid scheme name
     */
    private static boolean isValidScheme(final String scheme) {
        final int length = scheme.length();
        if (length < 1) {
            return false;
        }
        char c = scheme.charAt(0);
        if (!Character.isLetter(c)) {
            return false;
        }
        for (int i = 1; i < length; i++) {
            c = scheme.charAt(i);
            if (!Character.isLetterOrDigit(c) && c != '.' && c != '+' && c != '-') {
                return false;
            }
        }
        return true;
    }

    /**
     * Resolves a given relative URL against a base URL using the algorithm
     * depicted in <a href="http://www.faqs.org/rfcs/rfc1808.html">RFC1808</a>:
     *
     * Section 4: Resolving Relative URLs
     *
     *   This section describes an example algorithm for resolving URLs within
     *   a context in which the URLs may be relative, such that the result is
     *   always a URL in absolute form. Although this algorithm cannot
     *   guarantee that the resulting URL will equal that intended by the
     *   original author, it does guarantee that any valid URL (relative or
     *   absolute) can be consistently transformed to an absolute form given a
     *   valid base URL.
     *
     * @param baseUrl The base URL in which to resolve the specification.
     * @param relativeUrl The relative URL to resolve against the base URL.
     * @return the resolved specification.
     */
    private static Url resolveUrl(final Url baseUrl, final String relativeUrl) {
        final Url url = parseUrl(relativeUrl);

        // Step 1: The base URL is established according to the rules of
        //         Section 3. If the base URL is the empty string (unknown),
        //         the embedded URL is interpreted as an absolute URL and
        //         we are done.
        if (baseUrl == null) {
            return url;
        }

        // Step 2: Both the base and embedded URLs are parsed into their
        //         component parts as described in Section 2.4.
        //      a) If the embedded URL is entirely empty, it inherits the
        //         entire base URL (i.e., is set equal to the base URL)
        //         and we are done.
        if (relativeUrl.length() == 0) {
            return new Url(baseUrl);
        }
        //      b) If the embedded URL starts with a scheme name, it is
        //         interpreted as an absolute URL and we are done.
        if (url.scheme_ != null) {
            return url;
        }
        //      c) Otherwise, the embedded URL inherits the scheme of
        //         the base URL.
        url.scheme_ = baseUrl.scheme_;

        // Step 3: If the embedded URL's <net_loc> is non-empty, we skip to
        //         Step 7. Otherwise, the embedded URL inherits the <net_loc>
        //         (if any) of the base URL.
        if (url.location_ != null) {
            return url;
        }
        url.location_ = baseUrl.location_;

        // Step 4: If the embedded URL path is preceded by a slash "/", the
        //         path is not relative and we skip to Step 7.
        if ((url.path_ != null) && url.path_.startsWith("/")) {
            url.path_ = removeLeadingSlashPoints(url.path_);
            return url;
        }

        // Step 5: If the embedded URL path is empty (and not preceded by a
        //         slash), then the embedded URL inherits the base URL path,
        //         and
        if (url.path_ == null) {
            url.path_ = baseUrl.path_;
            //  a) if the embedded URL's <params> is non-empty, we skip to
            //     step 7; otherwise, it inherits the <params> of the base
            //     URL (if any) and
            if (url.parameters_ != null) {
                return url;
            }
            url.parameters_ = baseUrl.parameters_;
            //  b) if the embedded URL's <query> is non-empty, we skip to
            //     step 7; otherwise, it inherits the <query> of the base
            //     URL (if any) and we skip to step 7.
            if (url.query_ != null) {
                return url;
            }
            url.query_ = baseUrl.query_;
            return url;
        }

        // Step 6: The last segment of the base URL's path (anything
        //         following the rightmost slash "/", or the entire path if no
        //         slash is present) is removed and the embedded URL's path is
        //         appended in its place. The following operations are
        //         then applied, in order, to the new path:
        final String basePath = baseUrl.path_;
        String path = "";
        if (basePath != null) {
            final int lastSlashIndex = basePath.lastIndexOf('/');
            if (lastSlashIndex >= 0) {
                path = basePath.substring(0, lastSlashIndex + 1);
            }
        }
        else {
            path = "/";
        }
        path = path.concat(url.path_);

        //      a) All occurrences of "./", where "." is a complete path
        //         segment, are removed.
        int pathSegmentIndex;
        while ((pathSegmentIndex = path.indexOf("/./")) >= 0) {
            path = path.substring(0, pathSegmentIndex + 1).concat(path.substring(pathSegmentIndex + 3));
        }

        //      b) If the path ends with "." as a complete path segment,
        //         that "." is removed.
        if (path.endsWith("/.")) {
            path = path.substring(0, path.length() - 1);
        }

        //      c) All occurrences of "<segment>/../", where <segment> is a
        //         complete path segment not equal to "..", are removed.
        //         Removal of these path segments is performed iteratively,
        //         removing the leftmost matching pattern on each iteration,
        //         until no matching pattern remains.
        //
        // A "/../" at index 0 has no preceding segment and ends the scan (the
        // leading "/.." is dealt with by removeLeadingSlashPoints below).
        // A preceding segment that is itself ".." cannot be collapsed, so the
        // search resumes just past it instead of re-finding the same match,
        // which guarantees termination. The segment may start at index 0
        // (no slash before it), e.g. "a/../b" collapses to "b".
        int searchFrom = 0;
        while ((pathSegmentIndex = path.indexOf("/../", searchFrom)) > 0) {
            final int slashIndex = path.lastIndexOf('/', pathSegmentIndex - 1);
            final String segment = path.substring(slashIndex + 1, pathSegmentIndex);
            if ("..".equals(segment)) {
                searchFrom = pathSegmentIndex + 1;
            }
            else {
                path = path.substring(0, slashIndex + 1).concat(path.substring(pathSegmentIndex + 4));
                // A removal can expose a new leftmost match, so rescan from the start.
                searchFrom = 0;
            }
        }

        //      d) If the path ends with "<segment>/..", where <segment> is a
        //         complete path segment not equal to "..", that
        //         "<segment>/.." is removed.
        if (path.endsWith("/..")) {
            final String pathSegment = path.substring(0, path.length() - 3);
            final int slashIndex = pathSegment.lastIndexOf('/');
            if (slashIndex >= 0) {
                path = path.substring(0, slashIndex + 1);
            }
        }

        path = removeLeadingSlashPoints(path);
        url.path_ = path;

        // Step 7: The resulting URL components, including any inherited from
        //         the base URL, are recombined to give the absolute form of
        //         the embedded URL.
        return url;
    }

    /**
     * Removes every leading "/.." path segment, as browsers do (not in RFC).
     * Only a complete ".." segment is stripped, i.e. "/.." followed by "/" or by
     * the end of the path; a segment that merely starts with two dots (such as
     * "/..foo") is left untouched so that it cannot be fused onto the host.
     *
     * @param path the path to clean up (must not be {@code null})
     * @return the path without leading "/.." segments
     */
    private static String removeLeadingSlashPoints(String path) {
        while (path.startsWith("/..") && (path.length() == 3 || path.charAt(3) == '/')) {
            path = path.substring(3);
        }
        return path;
    }

    /**
     * Class {@code Url} represents a Uniform Resource Locator split into the
     * components described by RFC1808. A component that is absent is {@code null}.
     *
     * @author Martin Tamme
     */
    private static final class Url {

        private String scheme_;
        private String location_;
        private String path_;
        private String parameters_;
        private String query_;
        private String fragment_;

        /**
         * Creates a {@code Url} object with every component absent.
         */
        Url() {
        }

        /**
         * Creates a {@code Url} object as a component-wise copy of the specified
         * {@code Url} object.
         *
         * @param url a {@code Url} object.
         */
        Url(final Url url) {
            scheme_ = url.scheme_;
            location_ = url.location_;
            path_ = url.path_;
            parameters_ = url.parameters_;
            query_ = url.query_;
            fragment_ = url.fragment_;
        }

        /**
         * Returns a string representation of the {@code Url} object, emitting each
         * present component with its delimiter in the order scheme, net_loc, path,
         * params, query, fragment.
         *
         * @return a string representation of the {@code Url} object.
         */
        @Override
        public String toString() {
            final StringBuilder sb = new StringBuilder();
            if (scheme_ != null) {
                sb.append(scheme_);
                sb.append(':');
            }
            if (location_ != null) {
                sb.append("//");
                sb.append(location_);
            }
            if (path_ != null) {
                sb.append(path_);
            }
            if (parameters_ != null) {
                sb.append(';');
                sb.append(parameters_);
            }
            if (query_ != null) {
                sb.append('?');
                sb.append(query_);
            }
            if (fragment_ != null) {
                sb.append('#');
                sb.append(fragment_);
            }
            return sb.toString();
        }
    }
}

/*
 * Copyright (c) 2002-2009 Gargoyle Software Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * String utilities class for utility functions not covered by third party libraries.
 *
 * @version $Revision: 4002 $
 * @author Daniel Gredler
 * @author Ahmed Ashour
 * @author Martin Tamme
 */
final class StringUtils {

    /**
     * Disallow instantiation of this class.
     */
    private StringUtils() {
        // Empty.
    }

    /**
     * Escapes the characters '&lt;', '&gt;' and '&amp;' into their XML entity equivalents. Note that
     * sometimes we have to use this method instead of
     * {@code org.apache.commons.lang.StringEscapeUtils#escapeXml(String)} or
     * {@code org.apache.commons.lang.StringEscapeUtils#escapeHtml(String)} because those methods
     * escape some unicode characters as well.
     *
     * @param s the string to escape
     * @return the escaped form of the specified string
     */
    public static String escapeXmlChars(final String s) {
        // The ampersand must be replaced first so that the entities produced
        // for '<' and '>' are not escaped a second time.
        return s.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;");
    }

    /**
     * Returns {@code true} if the specified string contains whitespace, {@code false} otherwise.
     *
     * @param s the string to check for whitespace
     * @return {@code true} if the specified string contains whitespace, {@code false} otherwise
     */
    public static boolean containsWhitespace(final String s) {
        for (final char c : s.toCharArray()) {
            if (Character.isWhitespace(c)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Returns the index within a given string of the first occurrence of
     * the specified search character, looking only at positions in
     * {@code [beginIndex, endIndex)}.
     *
     * @param s a string.
     * @param searchChar a search character.
     * @param beginIndex the index to start the search from (inclusive).
     * @param endIndex the index to stop the search (exclusive).
     * @return the index of the first occurrence of the character in the string or {@code -1}.
     */
    public static int indexOf(final String s, final char searchChar, final int beginIndex, final int endIndex) {
        for (int i = beginIndex; i < endIndex; i++) {
            if (s.charAt(i) == searchChar) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Returns {@code true} if the specified string is a valid float, possibly trimming the string
     * before checking.
     * @param s the string to check
     * @param trim whether or not to trim the string before checking
     * @return {@code true} if the specified string is a valid float, {@code false} otherwise
     */
    public static boolean isFloat(String s, final boolean trim) {
        if (trim) {
            s = s.trim();
        }
        try {
            Float.parseFloat(s);
            return true;
        }
        catch (final NumberFormatException e) {
            // Not parseable as a float: that is the answer, not an error.
            return false;
        }
    }

    /**
     * Returns {@code true} if the specified list of strings contains the specified string, ignoring case.
     * The comparison is done with {@link String#equalsIgnoreCase(String)} so that the result does not
     * depend on the default locale.
     * @param strings the strings to search
     * @param string the string to search for
     * @return {@code true} if the specified list of strings contains the specified string, ignoring case
     */
    public static boolean containsCaseInsensitive(final List<String> strings, final String string) {
        for (final String s : strings) {
            if (string.equalsIgnoreCase(s)) {
                return true;
            }
        }
        return false;
    }
}