File "class-urlintextprocessor.php"
Full Path: /home/adniftyx/public_html/wp-content/plugins/wordpress-importer/php-toolkit/DataLiberation/URL/class-urlintextprocessor.php
File size: 14.16 KB
MIME-type: text/x-php
Charset: utf-8
<?php
namespace WordPress\DataLiberation\URL;
use WordPress\DataLiberation\BlockMarkup\URL;
use WP_HTML_Text_Replacement;
/**
* Finds string fragments that look like URLs and allows replacing them.
*
* This class implements two stages of detection:
*
* 1. **A "thick" sieve**
* 2. **A "fine" sieve**
*
* The thick sieve uses a regular expression to match URL-like substrings. It matches too
* much and may yield false positives.
*
* The fine sieve filters out invalid candidates using a WHATWG-compliant parser so only
* real URLs are returned.
*
* ## URL Detection
*
* The thick sieve looks for URLs:
*
* * Starting with http://, https://, or //, e.g. //wp.org.
* * With no protocol, e.g. www.wp.org or wp.org/path
*
* Here's a list of matching-related rules, limitations, and assumptions:
*
* ### Protocols
*
* As a site migration tool, this processor only considers URLs with HTTP
* and HTTPS protocols.
*
* ### Domain names
*
* UTF-8 characters in the domain names are supported even if they're
* not encoded as punycode. For example, scanning the text:
*
* > Więcej na łąka.pl
*
* Would yield `łąka.pl`
*
* ### Paths
*
* The path is limited to ASCII characters, as per the URL specification.
* For example, scanning the text:
*
* > Visit the WordPress plugins directory https://w.org/plugins?łąka=1
*
* Would yield `https://w.org/plugins?`, not `https://w.org/plugins?łąka=1`.
* However, scanning this text:
*
* > Visit the WordPress plugins directory https://w.org/plugins?%C5%82%C4%85ka=1
*
* Would yield `https://w.org/plugins?%C5%82%C4%85ka=1`.
*
* ### Parenthesis treatment
*
* This scanner captures parentheses as a part of the path, query, or fragment, except
* when they're seen as the last character in the URL. For example, scanning the text:
*
* > Visit the WordPress plugins directory (https://w.org/plugins)
*
* Would yield `https://w.org/plugins`, but scanning the text:
*
* > Visit the WordPress plugins directory (https://w.org/plug(in)s
*
* Would yield `https://w.org/plug(in)s`.
*
* ### Rejecting URLs with embedded credentials
*
* `https://user:pass@wp.org` is not matched. Rewriting URLs that presume transferable
* credentials is hazardous and rarely correct for migrations.
*
* ### Reject non-HTTP(S) schemes
*
* Out of scope for site moves; all of these are rejected:
* `gopher://site.com`, `blob:afgh2-48189d`, `ahttp://site.com`, `mailto:user@site.com`, `file://asset.zip`.
* If we need additional schemes later, we can add them intentionally.
*
* ### Reject non‑absolute‑looking references
*
* While we do rely on a base URL, inputs like `::`, `/index.html`, `?query` are still ignored.
* Bare-domain forms like `mysite.org/?query` are still matched.
*
* ### Handle trailing punctuation sensibly
*
* `https://mysite.com/path/..` is interpreted as `https://mysite.com/path/` rather than
* collapsing to the origin. A final period is far more likely sentence punctuation than `../`.
* If a user truly writes `https://mysite.com/path/../`, we parse it as expected.
*
* ### Fuzzy matching for malformed ports
*
* * **WHATWG**: `"http://w.org:100000 plugins are in the directory" → failure`.
* * **Inline detection**: `"http://w.org:100000 plugins are in the directory" → "http://w.org/"`
* (truncate at the invalid port).
*
* This is a best‑effort extraction of the valid prefix rather than an all‑or‑nothing rejection.
*
* ### Whitespace handling
*
* * **WHATWG**: `"http://example\t.\norg" → "http://example.org/"`.
* * **Inline detection**: stops at the first whitespace, yielding `"http://example/"`.
*
* This reflects how URLs actually appear in text blocks where whitespace often terminates a link.
*/
class URLInTextProcessor {
	/**
	 * The text being scanned. Pending replacements are only written into it
	 * when get_updated_text() is called.
	 *
	 * @var string
	 */
	private $text;

	/**
	 * Byte offset of the current URL within $text, or null when there is no current URL.
	 *
	 * @var int|null
	 */
	private $url_starts_at;

	/**
	 * Byte length of the current URL within $text, or null when there is no current URL.
	 *
	 * @var int|null
	 */
	private $url_length;

	/**
	 * Byte offset at which the next regex search starts.
	 *
	 * @var int
	 */
	private $bytes_already_parsed = 0;

	/**
	 * The current URL exactly as it appears in the text (minus a trailing `)` or `.`).
	 *
	 * @var string|null
	 */
	private $matched_url;

	/**
	 * The current URL as returned by WPURL::parse().
	 *
	 * @var URL|null
	 */
	private $parsed_url;

	/**
	 * Whether the base protocol was added to the current match before parsing it.
	 *
	 * @var bool
	 */
	private $did_prepend_protocol;

	/**
	 * The base URL for the parsing algorithm.
	 * See https://url.spec.whatwg.org/.
	 *
	 * @var string|null
	 */
	private $base_url;

	/**
	 * The scheme component of $base_url as reported by parse_url(), without the colon.
	 *
	 * @var string|null|false
	 */
	private $base_protocol;

	/**
	 * The regular expression pattern used for matching URL candidates
	 * in the text.
	 *
	 * @var string
	 */
	private $regex;

	/**
	 * Replacements enqueued by set_raw_url(), keyed by their start offset.
	 *
	 * @see \WP_HTML_Tag_Processor
	 * @var WP_HTML_Text_Replacement[]
	 */
	private $lexical_updates = array();

	/**
	 * A flag to indicate whether the URL matching should be strict or not.
	 * If set to true, the pattern is anchored with ^ and $, so the entire text must be a URL.
	 * If set to false, the matching is more lenient, allowing for potential false positives.
	 *
	 * NOTE: nothing in this class ever sets this flag to true.
	 *
	 * @var bool
	 */
	private $strict = false;

	/**
	 * Prepares the processor and builds the "thick sieve" regular expression.
	 *
	 * @param string      $text     The text to scan for URLs.
	 * @param string|null $base_url Optional. Base URL handed to the URL parser. Its scheme
	 *                              is also used for candidates that have no http(s) protocol.
	 */
	public function __construct( $text, $base_url = null ) {
		$this->text          = $text;
		$this->base_url      = $base_url;
		$this->base_protocol = $base_url ? parse_url( $base_url, PHP_URL_SCHEME ) : null;

		$prefix = $this->strict ? '^' : '';
		$suffix = $this->strict ? '$' : '';

		/*
		 * Pattern modifiers: i – case-insensitive, x – extended syntax (whitespace and
		 * #-comments inside the pattern are ignored), u – UTF-8 mode, J – allow duplicate
		 * group names (`host` and `tld` are declared in both host alternatives).
		 */
		// Source: https://github.com/vstelmakh/url-highlight/blob/master/src/Matcher/Matcher.php.
		$this->regex = '/' . $prefix . '
(?: # scheme
(?<scheme>[a-z0-9\+]+?:)? #
(?:\/*) # The protocol may optionally be followed by one or more slashes
)?
(?: # userinfo
(?:
(?<=\/{2}) # prefixed with \/\/
| # or
(?=[^\p{Sm}\p{Sc}\p{Sk}\p{P}]) # start with not: mathematical, currency, modifier symbol, punctuation
)
(?<userinfo>[^\s<>@\/]+) # not: whitespace, < > @ \/
@ # at
)?
(?=%|[^\p{Z}\p{Sm}\p{Sc}\p{Sk}\p{C}\p{P}]) # followed by valid host char
(?| # host
(?<host> # host prefixed by scheme or userinfo (less strict)
(?<=\/\/|@) # prefixed with \/\/ or @
(?=[^\-]) # label start, not: -
(?:%|[^\p{Z}\p{Sm}\p{Sc}\p{Sk}\p{C}\p{P}]|-){1,63} # label not: whitespace, mathematical, currency, modifier symbol, control point, punctuation | except -
(?<=[^\-]) # label end, not: -
(?: # more label parts
\.
(?=[^\-]) # label start, not: -
(?<tld>(?:[^\p{Z}\p{Sm}\p{Sc}\p{Sk}\p{C}\p{P}]|-){1,63}) # label not: whitespace, mathematical, currency, modifier symbol, control point, punctuation | except -
(?<=[^\-]) # label end, not: -
)*
)
| # or
(?<host> # host with tld (no scheme or userinfo)
(?=[^\-]) # label start, not: -
(?:%|[^\p{Z}\p{Sm}\p{Sc}\p{Sk}\p{C}\p{P}]|-){1,63} # label not: whitespace, mathematical, currency, modifier symbol, control point, punctuation | except -
(?<=[^\-]) # label end, not: -
(?: # more label parts
\.
(?=[^\-]) # label start, not: -
(?:%|[^\p{Z}\p{Sm}\p{Sc}\p{Sk}\p{C}\p{P}]|-){1,63} # label not: whitespace, mathematical, currency, modifier symbol, control point, punctuation | except -
(?<=[^\-]) # label end, not: -
)*
\.(?<tld>\w{2,63}) # tld
)
)
(?:\:(?<port>\d{1,5}(?!\d)))? # port
(?<path> # path, query, fragment
[\/?#] # prefixed with \/ or ? or #
[^\s<>]* # any chars except whitespace and <>
(?<=[^\s<>({\[`!;:\'".,?«»“”‘’]) # end with not a space or some punctuation chars
)?
' . $suffix . '/ixuJ';
	}

	/**
	 * Advances to the next URL in the text.
	 *
	 * Candidates found by the regular expression are skipped when they fail to parse,
	 * use a protocol other than http/https, carry a username or password, or – when
	 * they have no http(s) protocol – lack a dot in the hostname or end with a TLD
	 * that WPURL::is_known_public_domain() rejects.
	 *
	 * @return bool True when a URL was found, false when there are no more URLs.
	 */
	public function next_url() {
		while ( true ) {
			$this->matched_url          = null;
			$this->parsed_url           = null;
			$this->url_starts_at        = null;
			$this->url_length           = null;
			$this->did_prepend_protocol = false;

			/**
			 * Thick sieve – eagerly match things that look like URLs but turn out to not be URLs in the end.
			 */
			$matches = array();
			$found   = preg_match( $this->regex, $this->text, $matches, PREG_OFFSET_CAPTURE, $this->bytes_already_parsed );
			if ( 1 !== $found ) {
				return false;
			}

			$raw_match     = $matches[0][0];
			$url_starts_at = $matches[0][1];

			// A closing parenthesis or a period at the very end is treated as punctuation, not as a part of the URL.
			$this->matched_url = $raw_match;
			$last_char         = substr( $raw_match, -1 );
			if ( ')' === $last_char || '.' === $last_char ) {
				$this->matched_url = substr( $raw_match, 0, -1 );
			}

			/*
			 * Move the cursor past this candidate before any of the checks below
			 * can `continue`. Otherwise a rejected candidate would be matched again
			 * on the next iteration and the loop would never end.
			 */
			$this->bytes_already_parsed = $url_starts_at + strlen( $this->matched_url );

			// Do not consider just :: as a URL.
			if ( '::' === $raw_match ) {
				continue;
			}

			$had_protocol = WPURL::has_http_https_protocol( $this->matched_url );

			$preprocessed_url = $this->matched_url;
			if ( $this->base_url && $this->base_protocol && ! $had_protocol ) {
				$preprocessed_url           = WPURL::ensure_protocol( $preprocessed_url, $this->base_protocol );
				$this->did_prepend_protocol = true;
			}

			/*
			 * Extra fine sieve – parse the candidates using a WHATWG-compliant parser to rule out false positives.
			 */
			$parsed_url = WPURL::parse( $preprocessed_url, $this->base_url );
			if ( false === $parsed_url ) {
				continue;
			}

			// Only consider HTTP and HTTPS URLs.
			if ( $parsed_url->protocol && ! in_array( $parsed_url->protocol, array( 'http:', 'https:' ), true ) ) {
				continue;
			}

			// Disregard URLs with auth details.
			if ( $parsed_url->username || $parsed_url->password ) {
				continue;
			}

			// Additional rigor for URLs that are not explicitly preceded by a double slash.
			if ( ! $had_protocol ) {
				/*
				 * Skip TLDs that are not in the public suffix.
				 * This reduces false positives like `index.html` or `plugins.php`.
				 *
				 * See https://publicsuffix.org/.
				 */
				$last_dot_position = strrpos( $parsed_url->hostname, '.' );
				if ( false === $last_dot_position ) {
					/*
					 * Oh, there was no dot in the hostname AND no double slash at
					 * the beginning! Let's assume this isn't a valid URL and move on.
					 * @TODO: Explore updating the regular expression above to avoid matching
					 *        URLs without a dot in the hostname when they're not preceded
					 *        by a protocol.
					 */
					continue;
				}

				$tld = substr( $parsed_url->hostname, $last_dot_position + 1 );
				if ( ! WPURL::is_known_public_domain( $tld ) ) {
					// This TLD is not in the public suffix list. It's not a valid domain name.
					continue;
				}
			}

			$this->parsed_url    = $parsed_url;
			$this->url_starts_at = $url_starts_at;
			/*
			 * Use the length of the trimmed URL, not of the raw regex match. Otherwise
			 * set_raw_url() would also overwrite the trailing `)` or `.` that was
			 * deliberately left out of the URL above.
			 */
			$this->url_length = strlen( $this->matched_url );

			return true;
		}
	}

	/**
	 * Returns the current URL as it appears in the text.
	 *
	 * @return string|false The matched URL, or false when there is no current URL.
	 */
	public function get_raw_url() {
		return $this->matched_url ?? false;
	}

	/**
	 * Returns the parsed representation of the current URL.
	 *
	 * @return URL|false The parsed URL, or false when there is no current URL.
	 */
	public function get_parsed_url() {
		if ( null === $this->parsed_url ) {
			return false;
		}

		return $this->parsed_url;
	}

	/**
	 * Enqueues a replacement of the current URL. The text itself only changes
	 * once get_updated_text() is called.
	 *
	 * When the protocol was prepended to the match before parsing, the
	 * `scheme://` prefix is removed from $new_url so the replacement keeps the
	 * protocol-less form used in the text.
	 *
	 * @param string $new_url The URL to put in place of the current one.
	 * @return bool True when the replacement was enqueued, false when there is no current URL.
	 */
	public function set_raw_url( $new_url ) {
		if ( null === $this->matched_url ) {
			return false;
		}

		if ( $this->did_prepend_protocol ) {
			/*
			 * Only strip the protocol when there is one. strpos() returns false
			 * when `://` is missing, and `false + 3` would chop off the first
			 * three bytes of the replacement.
			 */
			$separator_at = strpos( $new_url, '://' );
			if ( false !== $separator_at ) {
				$new_url = substr( $new_url, $separator_at + 3 );
			}
		}

		$this->matched_url                             = $new_url;
		$this->lexical_updates[ $this->url_starts_at ] = new WP_HTML_Text_Replacement(
			$this->url_starts_at,
			$this->url_length,
			$new_url
		);

		return true;
	}

	/**
	 * Writes all enqueued replacements into the text and moves the search cursor
	 * and the current URL span to their positions in the updated text.
	 *
	 * @return int The number of replacements applied.
	 */
	private function apply_lexical_updates() {
		$updates_count = count( $this->lexical_updates );
		if ( 0 === $updates_count ) {
			return 0;
		}

		/*
		 * Updates can be enqueued in any order but updates
		 * to the document must occur in lexical order; that is, each
		 * replacement must be made before all others which follow it
		 * at later string indices in the input document.
		 *
		 * Sorting avoids making out-of-order replacements which
		 * can lead to mangled output.
		 */
		ksort( $this->lexical_updates );

		/*
		 * Every $diff->start is an offset into the text as it was before any
		 * replacement. Remember the cursor and the current URL offset in those
		 * same coordinates so the comparisons below are not thrown off by the
		 * adjustments made while looping.
		 */
		$cursor_before_updates = $this->bytes_already_parsed;
		$current_url_start     = $this->url_starts_at;

		$cursor_shift         = 0;
		$bytes_already_copied = 0;
		$output_buffer        = '';
		foreach ( $this->lexical_updates as $diff ) {
			// Adjust the cursor position by however much an update affects it.
			if ( $diff->start < $cursor_before_updates ) {
				$cursor_shift += strlen( $diff->text ) - $diff->length;
			}

			$output_buffer .= substr( $this->text, $bytes_already_copied, $diff->start - $bytes_already_copied );
			if ( $diff->start === $current_url_start ) {
				// The current URL was replaced – track where it lives in the new text.
				$this->url_starts_at = strlen( $output_buffer );
				$this->url_length    = strlen( $diff->text );
			}
			$output_buffer       .= $diff->text;
			$bytes_already_copied = $diff->start + $diff->length;
		}

		$this->text                 = $output_buffer . substr( $this->text, $bytes_already_copied );
		$this->bytes_already_parsed = $cursor_before_updates + $cursor_shift;
		$this->lexical_updates      = array();

		return $updates_count;
	}

	/**
	 * Applies all pending replacements and returns the resulting text.
	 *
	 * @return string The text with every enqueued URL replacement applied.
	 */
	public function get_updated_text() {
		$this->apply_lexical_updates();

		return $this->text;
	}
}