2018-02-24 23:34:32 -05:00
< ? php
/** @license MIT
 * Copyright 2018 J. King et al.
 * See LICENSE and AUTHORS files for details */
declare ( strict_types = 1 );
2020-02-18 18:50:02 -05:00
namespace JKingWeb\Lax\Parser ;
2018-02-24 23:34:32 -05:00
2020-03-03 11:25:38 -05:00
use JKingWeb\Lax\Collection ;
2018-02-26 10:45:25 -05:00
use JKingWeb\Lax\Date ;
2020-03-04 11:49:36 -05:00
use JKingWeb\Lax\Url ;
2018-02-24 23:34:32 -05:00
/** Utility methods for text clean-up, e-mail validation, date parsing,
 * media-type detection, and emptiness checks
 */
trait Construct {
    /** Trims plain text and collapses whitespace
     *
     * Runs of two or more whitespace characters are replaced by a single
     * space, after which leading and trailing whitespace is stripped. A lone
     * whitespace character inside the text is left as it is.
     */
    protected function trimText(string $text): string {
        // preg_replace() yields null on a PCRE failure; fall back to the input so trim() never receives null under strict_types
        $collapsed = preg_replace("<\s{2,}>s", " ", $text) ?? $text;
        return trim($collapsed);
    }

    /** Takes an HTML string as input and returns a sanitized version of that string
     *
     * The $outputHtml parameter, when false, outputs only the plain-text content of the sanitized HTML
     *
     * NOTE: sanitization of strings which actually contain markup is not
     * implemented; such input currently yields the placeholder string "OOK!"
     */
    protected function sanitizeString(string $markup, bool $outputHtml = true): string {
        if (preg_match("/<\S/", $markup)) {
            // TODO: real sanitization of markup; this is a placeholder result
            return "OOK!";
        }
        // if the string does not appear to actually contain markup besides entities, we can skip most of the sanitization
        if ($outputHtml) {
            return $markup;
        }
        return $this->trimText(html_entity_decode($markup, \ENT_QUOTES | \ENT_HTML5, "UTF-8"));
    }

    /** Tests whether a string is a valid e-mail address
     *
     * Accepts IDN hosts and Unicode localparts
     *
     * The address is split at its last "@"; the host part is converted to
     * its ASCII form, and the recombined address is then checked with
     * filter_var(). Any failure along the way yields false.
     */
    protected function validateMail(string $addr): bool {
        if (!preg_match("/^(.+?)@([^@]+)$/", $addr, $match)) {
            return false;
        }
        $local = $match[1];
        // PHP's filter_var does not accept IDN hosts, so we have to perform an IDNA transformation first
        $domain = idn_to_ascii(
            $match[2],
            \IDNA_NONTRANSITIONAL_TO_ASCII | \IDNA_CHECK_BIDI | \IDNA_CHECK_CONTEXTJ,
            \INTL_IDNA_VARIANT_UTS46
        ); // settings for IDNA2008 algorithm (I think)
        if ($domain === false) {
            return false;
        }
        return (bool) filter_var($local."@".$domain, \FILTER_VALIDATE_EMAIL, \FILTER_FLAG_EMAIL_UNICODE);
    }

    /** Parses a date string into a Date object
     *
     * The input is trimmed, then tried against each entry of
     * Date::$supportedFormats in order; the first truthy result of
     * Date::createFromFormat() is returned. Null is returned for empty
     * input or when no format produces a result.
     */
    protected function parseDate(string $date): ?Date {
        $date = $this->trimText($date);
        if (!strlen($date)) {
            return null;
        }
        // a UTC zone is handed to every attempt; presumably it only applies when the input carries no zone of its own (Date is defined elsewhere; confirm)
        $tz = new \DateTimeZone("UTC");
        foreach (Date::$supportedFormats as $format) {
            $parsed = Date::createFromFormat($format, $date, $tz);
            if ($parsed) {
                return $parsed;
            }
        }
        return null;
    }

    /** Determines a normalized (lowercase) media type
     *
     * The $type string is examined first; if it does not start with
     * something which looks like a media type and a URL is supplied, the
     * file extension of the URL's path is looked up instead. Returns null
     * when neither source produces a type.
     */
    protected function parseMediaType(string $type, ?Url $url = null): ?string {
        // the subtype group is optional so that a bare general type (e.g. "text") is accepted, as the note below describes
        if (preg_match('<^\s*([0-9a-z]+(?:/[!#$%&\'\*\+\-\.^_`|~0-9a-z]+)?)(?:\s|;|$)>i', $type, $match)) {
            /* NOTE: The pattern used here is a subset of what is
                technically allowed by RFC 7231: the "type" portion
                is supposed to be as general as the "subtype" portion,
                but in practice only alphabetic types have ever been
                registered, making a more specific pattern more
                practically useful for detecting media types.

                See:
                <https://tools.ietf.org/html/rfc7231#section-3.1.1.1>
                <https://tools.ietf.org/html/rfc7230#section-3.2.6>

                Additionally, types without subtypes are accepted as
                we foresee the general type still being useful to
                feed processors.
            */
            return strtolower($match[1]);
        }
        if ($url) {
            // isolate the last path segment, then whatever follows its last dot
            $path = $url->getPath();
            $file = substr($path, (int) strrpos($path, "/"));
            $dot = strrpos($file, ".");
            if ($dot !== false) {
                $ext = substr($file, $dot + 1);
                if (strlen($ext)) {
                    // the extension database is created on first use and kept on the object
                    return ($this->mime ?? ($this->mime = new \Mimey\MimeTypes))->getMimeType($ext);
                }
            }
        }
        return null;
    }

    /** Tests whether a structure carries no meaningful members
     *
     * The value is cast to an array; it is considered empty when every
     * member is either null or a Collection containing no items.
     */
    protected function empty($o): bool {
        $meaningful = array_filter((array) $o, static function($v) {
            if (is_null($v)) {
                return false;
            }
            return !$v instanceof Collection || count($v) > 0;
        });
        return !$meaningful;
    }
}