2021-03-10 18:57:41 +01:00
|
|
|
<?php
|
|
|
|
|
|
|
|
namespace Icamys\SitemapGenerator;
|
|
|
|
|
|
|
|
use BadMethodCallException;
|
|
|
|
use DateTime;
|
2024-12-26 17:07:02 +01:00
|
|
|
use Icamys\SitemapGenerator\Extensions\GoogleImageExtension;
|
2021-03-10 18:57:41 +01:00
|
|
|
use Icamys\SitemapGenerator\Extensions\GoogleVideoExtension;
|
|
|
|
use InvalidArgumentException;
|
|
|
|
use OutOfRangeException;
|
|
|
|
use RuntimeException;
|
2024-12-26 17:07:02 +01:00
|
|
|
use UnexpectedValueException;
|
2021-03-10 18:57:41 +01:00
|
|
|
use XMLWriter;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Class SitemapGenerator
|
|
|
|
* @package Icamys\SitemapGenerator
|
|
|
|
*/
|
|
|
|
class SitemapGenerator
|
|
|
|
{
|
|
|
|
/**
|
|
|
|
* Max size of a sitemap according to spec.
|
|
|
|
* @see https://www.sitemaps.org/protocol.html
|
|
|
|
*/
|
|
|
|
private const MAX_FILE_SIZE = 52428800;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Max number of urls per sitemap according to spec.
|
|
|
|
* @see https://www.sitemaps.org/protocol.html
|
|
|
|
*/
|
|
|
|
private const MAX_URLS_PER_SITEMAP = 50000;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Max number of sitemaps per index file according to spec.
|
|
|
|
* @see http://www.sitemaps.org/protocol.html
|
|
|
|
*/
|
|
|
|
private const MAX_SITEMAPS_PER_INDEX = 50000;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Total max number of URLs.
|
|
|
|
*/
|
|
|
|
private const TOTAL_MAX_URLS = self::MAX_URLS_PER_SITEMAP * self::MAX_SITEMAPS_PER_INDEX;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Max url length according to spec.
|
|
|
|
* @see https://www.sitemaps.org/protocol.html#xmlTagDefinitions
|
|
|
|
*/
|
|
|
|
private const MAX_URL_LEN = 2048;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Robots file name
|
|
|
|
* @var string
|
|
|
|
* @access private
|
|
|
|
*/
|
2024-12-26 17:07:02 +01:00
|
|
|
private string $robotsFileName = "robots.txt";
|
2021-03-10 18:57:41 +01:00
|
|
|
/**
|
|
|
|
* Name of sitemap file
|
|
|
|
* @var string
|
|
|
|
* @access private
|
|
|
|
*/
|
2024-12-26 17:07:02 +01:00
|
|
|
private string $sitemapFileName = "sitemap.xml";
|
2021-03-10 18:57:41 +01:00
|
|
|
/**
|
|
|
|
* Name of sitemap index file
|
|
|
|
* @var string
|
|
|
|
* @access private
|
|
|
|
*/
|
2024-12-26 17:07:02 +01:00
|
|
|
private string $sitemapIndexFileName = "sitemap-index.xml";
|
|
|
|
/**
|
|
|
|
* Sitemap Stylesheet link.
|
|
|
|
* @var string
|
|
|
|
*/
|
|
|
|
private string $sitemapStylesheetLink = "";
|
2021-03-10 18:57:41 +01:00
|
|
|
/**
|
|
|
|
* Quantity of URLs per single sitemap file.
|
|
|
|
* If your links are very long, a sitemap file can exceed the 50 MB size limit;
|
|
|
|
* in this case use smaller value.
|
|
|
|
* @var int
|
|
|
|
* @access private
|
|
|
|
*/
|
2024-12-26 17:07:02 +01:00
|
|
|
private int $maxURLsPerSitemap = self::MAX_URLS_PER_SITEMAP;
|
2021-03-10 18:57:41 +01:00
|
|
|
/**
|
|
|
|
* If true, every sitemap file is written gzip-compressed and gets a .gz suffix
* (e.g. sitemap.xml.gz). The sitemap index file itself is never compressed.
|
|
|
|
* @var bool
|
|
|
|
* @access private
|
|
|
|
*/
|
2024-12-26 17:07:02 +01:00
|
|
|
private bool $isCompressionEnabled = false;
|
2021-03-10 18:57:41 +01:00
|
|
|
/**
|
|
|
|
* URL to Your site.
|
|
|
|
* Script will use it to send sitemaps to search engines.
|
|
|
|
* @var string
|
|
|
|
* @access private
|
|
|
|
*/
|
2024-12-26 17:07:02 +01:00
|
|
|
private string $baseURL;
|
|
|
|
/**
|
|
|
|
* URL to sitemap file(s).
|
|
|
|
* Script will use it to reference sitemap files in robots.txt and sitemap index.
|
|
|
|
* @var string
|
|
|
|
* @access private
|
|
|
|
*/
|
|
|
|
private string $sitemapIndexURL;
|
2021-03-10 18:57:41 +01:00
|
|
|
/**
|
|
|
|
* Base path. Relative to script location.
|
|
|
|
* Use this if your sitemap and robots files should be stored in a directory
* other than the script's.
|
|
|
|
* @var string
|
|
|
|
* @access private
|
|
|
|
*/
|
2024-12-26 17:07:02 +01:00
|
|
|
private string $saveDirectory;
|
2021-03-10 18:57:41 +01:00
|
|
|
/**
|
|
|
|
* Version of this class
|
|
|
|
* @var string
|
|
|
|
* @access private
|
|
|
|
*/
|
2024-12-26 17:07:02 +01:00
|
|
|
private string $classVersion = "5.0.0";
|
2021-03-10 18:57:41 +01:00
|
|
|
/**
|
|
|
|
* Search engines URLs
|
|
|
|
* @var array of strings
|
|
|
|
* @access private
|
|
|
|
*/
|
2024-12-26 17:07:02 +01:00
|
|
|
private array $searchEngines = [
|
|
|
|
"https://webmaster.yandex.ru/ping?sitemap=",
|
2021-03-10 18:57:41 +01:00
|
|
|
];
|
|
|
|
/**
|
|
|
|
* Lines for robots.txt file that are written if file does not exist
|
|
|
|
* @var array
|
|
|
|
*/
|
2024-12-26 17:07:02 +01:00
|
|
|
private array $sampleRobotsLines = [
|
2021-03-10 18:57:41 +01:00
|
|
|
"User-agent: *",
|
2024-09-07 14:11:01 +02:00
|
|
|
"Allow: /",
|
2021-03-10 18:57:41 +01:00
|
|
|
];
|
|
|
|
/**
|
|
|
|
* @var array list of valid changefreq values according to the spec
|
|
|
|
*/
|
2024-12-26 17:07:02 +01:00
|
|
|
private array $validChangefreqValues = [
|
2021-03-10 18:57:41 +01:00
|
|
|
'always',
|
|
|
|
'hourly',
|
|
|
|
'daily',
|
|
|
|
'weekly',
|
|
|
|
'monthly',
|
|
|
|
'yearly',
|
|
|
|
'never',
|
|
|
|
];
|
|
|
|
/**
|
|
|
|
* @var float[] list of valid priority values according to the spec
|
|
|
|
*/
|
2024-12-26 17:07:02 +01:00
|
|
|
private array $validPriorities = [
|
2021-03-10 18:57:41 +01:00
|
|
|
0.0,
|
|
|
|
0.1,
|
|
|
|
0.2,
|
|
|
|
0.3,
|
|
|
|
0.4,
|
|
|
|
0.5,
|
|
|
|
0.6,
|
|
|
|
0.7,
|
|
|
|
0.8,
|
|
|
|
0.9,
|
|
|
|
1.0,
|
|
|
|
];
|
|
|
|
/**
|
2024-12-26 17:07:02 +01:00
|
|
|
* @var IFileSystem object used to communicate with file system
|
2021-03-10 18:57:41 +01:00
|
|
|
*/
|
2024-12-26 17:07:02 +01:00
|
|
|
private IFileSystem $fs;
|
2021-03-10 18:57:41 +01:00
|
|
|
/**
|
2024-12-26 17:07:02 +01:00
|
|
|
* @var IRuntime object used to communicate with runtime
|
2021-03-10 18:57:41 +01:00
|
|
|
*/
|
2024-12-26 17:07:02 +01:00
|
|
|
private IRuntime $runtime;
|
2021-03-10 18:57:41 +01:00
|
|
|
|
|
|
|
/**
|
|
|
|
* @var XMLWriter Used for writing xml to files
|
|
|
|
*/
|
2024-12-26 17:07:02 +01:00
|
|
|
private XMLWriter $xmlWriter;
|
2021-03-10 18:57:41 +01:00
|
|
|
|
|
|
|
/**
|
|
|
|
* @var string
|
|
|
|
*/
|
2024-12-26 17:07:02 +01:00
|
|
|
private string $flushedSitemapFilenameFormat;
|
2021-03-10 18:57:41 +01:00
|
|
|
|
|
|
|
/**
|
|
|
|
* @var int
|
|
|
|
*/
|
2024-12-26 17:07:02 +01:00
|
|
|
private int $flushedSitemapSize = 0;
|
2021-03-10 18:57:41 +01:00
|
|
|
|
|
|
|
/**
|
|
|
|
* @var int
|
|
|
|
*/
|
2024-12-26 17:07:02 +01:00
|
|
|
private int $flushedSitemapCounter = 0;
|
2021-03-10 18:57:41 +01:00
|
|
|
|
|
|
|
/**
|
|
|
|
* @var array
|
|
|
|
*/
|
2024-12-26 17:07:02 +01:00
|
|
|
private array $flushedSitemaps = [];
|
2021-03-10 18:57:41 +01:00
|
|
|
|
|
|
|
/**
|
|
|
|
* @var bool
|
|
|
|
*/
|
2024-12-26 17:07:02 +01:00
|
|
|
private bool $isSitemapStarted = false;
|
2021-03-10 18:57:41 +01:00
|
|
|
|
|
|
|
/**
|
|
|
|
* @var int
|
|
|
|
*/
|
2024-12-26 17:07:02 +01:00
|
|
|
private int $totalURLCount = 0;
|
2021-03-10 18:57:41 +01:00
|
|
|
|
|
|
|
/**
|
|
|
|
* @var int
|
|
|
|
*/
|
2024-12-26 17:07:02 +01:00
|
|
|
private int $urlsetClosingTagLen = 10; // strlen("</urlset>\n")
|
|
|
|
private int $sitemapURLCount = 0;
|
|
|
|
private array $generatedFiles = [];
|
2021-03-10 18:57:41 +01:00
|
|
|
|
|
|
|
/**
 * Builds a generator from the given configuration.
 *
 * Stores the base URL and the sitemap index URL without trailing slashes,
 * selects the file system / runtime adapters (the bundled FileSystem and
 * Runtime are used when the config supplies none), checks that the save
 * directory is writable and prepares the in-memory XML writer.
 *
 * @param IConfig $config Configuration object.
 * @throws InvalidArgumentException if the base URL is empty or the save directory is not writable
 */
public function __construct(IConfig $config)
{
    $baseURL = $config->getBaseURL();
    if ($baseURL === '') {
        throw new InvalidArgumentException('baseURL config parameter is required');
    }
    $this->baseURL = rtrim($baseURL, '/');

    // The sitemap index URL falls back to the base URL unless the config provides one.
    $sitemapIndexURL = $config->getSitemapIndexURL();
    $this->sitemapIndexURL = rtrim($sitemapIndexURL ?: $baseURL, '/');

    $this->fs = $config->getFS() ?? new FileSystem();
    $this->runtime = $config->getRuntime() ?? new Runtime();

    $saveDirectory = $config->getSaveDirectory();
    if ($this->runtime->is_writable($saveDirectory) === false) {
        throw new InvalidArgumentException(
            sprintf('the provided basePath (%s) should be a writable directory,', $saveDirectory) .
            ' please check its existence and permissions'
        );
    }

    // Guarantee a trailing separator so that file names can simply be appended.
    if (strlen($saveDirectory) > 0 && !str_ends_with($saveDirectory, DIRECTORY_SEPARATOR)) {
        $saveDirectory .= DIRECTORY_SEPARATOR;
    }
    $this->saveDirectory = $saveDirectory;

    $this->xmlWriter = $this->createXmlWriter();

    // Temporary file name pattern "sm-{sitemap counter}-{construction timestamp}.xml";
    // the remaining %d is filled in with the sitemap counter on every flush.
    $this->flushedSitemapFilenameFormat = sprintf("sm-%%d-%d.xml", time());
}
|
|
|
|
|
|
|
|
/**
 * Creates the XML writer used for all output: it buffers in memory and
 * indents nested elements.
 *
 * @return XMLWriter
 */
private function createXmlWriter(): XMLWriter
{
    $writer = new XMLWriter();
    $writer->openMemory();
    $writer->setIndent(true);

    return $writer;
}
|
|
|
|
|
|
|
|
/**
 * Sets the file name used for the generated sitemap file(s).
 *
 * @param string $filename Non-empty file name with the "xml" extension.
 *
 * @return SitemapGenerator
 *
 * @throws InvalidArgumentException if the name is empty or its extension is not "xml"
 */
public function setSitemapFilename(string $filename = ''): SitemapGenerator
{
    if ($filename === '') {
        throw new InvalidArgumentException('sitemap filename should not be empty');
    }

    $extension = pathinfo($filename, PATHINFO_EXTENSION);
    if ($extension !== 'xml') {
        throw new InvalidArgumentException('sitemap filename should have *.xml extension');
    }

    $this->sitemapFileName = $filename;

    return $this;
}
|
|
|
|
|
2024-12-26 17:07:02 +01:00
|
|
|
/**
 * Sets the stylesheet link that is emitted as an xml-stylesheet processing
 * instruction at the top of every sitemap written afterwards.
 *
 * @param string $path Non-empty stylesheet path or URL.
 * @return SitemapGenerator
 * @throws InvalidArgumentException if the path is empty
 */
public function setSitemapStylesheet(string $path): SitemapGenerator
{
    if ($path === '') {
        throw new InvalidArgumentException('sitemap stylesheet path should not be empty');
    }

    $this->sitemapStylesheetLink = $path;

    return $this;
}
|
|
|
|
|
2021-03-10 18:57:41 +01:00
|
|
|
/**
 * Sets the file name of the sitemap index file.
 *
 * @param string $filename Non-empty file name.
 *
 * @return $this
 *
 * @throws InvalidArgumentException if the name is empty
 */
public function setSitemapIndexFilename(string $filename = ''): SitemapGenerator
{
    if ($filename === '') {
        throw new InvalidArgumentException('filename should not be empty');
    }

    $this->sitemapIndexFileName = $filename;

    return $this;
}
|
|
|
|
|
|
|
|
/**
 * Sets the name of the robots file that updateRobots() reads and writes.
 *
 * @param string $filename Non-empty file name.
 * @return $this
 * @throws InvalidArgumentException if the name is empty
 */
public function setRobotsFileName(string $filename): SitemapGenerator
{
    if ($filename === '') {
        throw new InvalidArgumentException('filename should not be empty');
    }

    $this->robotsFileName = $filename;

    return $this;
}
|
|
|
|
|
|
|
|
/**
 * Sets how many URLs a single sitemap file may hold before a new file is started.
 *
 * @param int $value Limit between 1 and MAX_URLS_PER_SITEMAP (inclusive).
 * @return $this
 * @throws OutOfRangeException if the value lies outside that range
 */
public function setMaxURLsPerSitemap(int $value): SitemapGenerator
{
    $isWithinRange = 1 <= $value && $value <= self::MAX_URLS_PER_SITEMAP;
    if (!$isWithinRange) {
        throw new OutOfRangeException(
            sprintf('value %d is out of range 1-%d', $value, self::MAX_URLS_PER_SITEMAP)
        );
    }

    $this->maxURLsPerSitemap = $value;

    return $this;
}
|
|
|
|
|
|
|
|
/**
 * Turns on gzip compression for the sitemap files produced by finalize().
 *
 * @return SitemapGenerator
 */
public function enableCompression(): SitemapGenerator
{
    $this->isCompressionEnabled = true;

    return $this;
}
|
|
|
|
|
|
|
|
/**
 * Turns off gzip compression for the sitemap files produced by finalize().
 *
 * @return SitemapGenerator
 */
public function disableCompression(): SitemapGenerator
{
    $this->isCompressionEnabled = false;

    return $this;
}
|
|
|
|
|
|
|
|
/**
 * Tells whether sitemap files will be gzip-compressed on finalize().
 *
 * @return bool
 */
public function isCompressionEnabled(): bool
{
    return $this->isCompressionEnabled;
}
|
|
|
|
|
2024-12-26 17:07:02 +01:00
|
|
|
/**
 * Validates the arguments of a single sitemap entry without writing anything.
 *
 * @param string $path URL path that gets appended to the base URL; 1 to MAX_URL_LEN characters.
 * @param string|null $changeFrequency One of the values in $validChangefreqValues, or null to omit it.
 * @param float|null $priority One of the values in $validPriorities (0.0, 0.1, ..., 1.0), or null to omit it.
 * @param array $extensions Extension data keyed by extension name ('google_video', 'google_image').
 * @return void
 * @throws InvalidArgumentException if the path length, change frequency or priority is invalid;
 *                                  the extension validators may throw as well (their code is not visible here)
 */
public function validate(
    string $path,
    ?string $changeFrequency = null,
    ?float $priority = null,
    array $extensions = []
): void {
    $pathLength = mb_strlen($path);
    if ($pathLength < 1 || $pathLength > self::MAX_URL_LEN) {
        throw new InvalidArgumentException(
            sprintf("The urlPath argument length must be between 1 and %d.", self::MAX_URL_LEN)
        );
    }

    if ($changeFrequency !== null && !in_array($changeFrequency, $this->validChangefreqValues, true)) {
        // The list of allowed values is formatted into the message (previously a raw "%s" leaked into it).
        throw new InvalidArgumentException(
            sprintf(
                'The change frequency argument should be one of: %s',
                implode(',', $this->validChangefreqValues)
            )
        );
    }

    if ($priority !== null && !in_array($priority, $this->validPriorities, true)) {
        throw new InvalidArgumentException("Priority argument should be a float number in the range [0.0..1.0]");
    }

    if (isset($extensions['google_video'])) {
        GoogleVideoExtension::validate($this->baseURL . $path, $extensions['google_video']);
    }

    if (isset($extensions['google_image'])) {
        GoogleImageExtension::validateEntryFields($extensions['google_image']);
    }
}
|
|
|
|
|
|
|
|
/**
 * Adds one URL to the sitemap.
 *
 * URLs are not kept in memory: the generator periodically flushes the
 * buffered XML to temporary files in the save directory, named
 * 'sm-{index}-{timestamp}.xml'. A new sitemap document is started on demand
 * and closed as soon as it holds the configured maximum number of URLs.
 *
 * @param string $path URL path, appended to the base URL
 * @param DateTime|null $lastModified
 * @param string|null $changeFrequency
 * @param float|null $priority
 * @param array|null $alternates
 * @param array $extensions
 * @return $this
 * @throws OutOfRangeException if the total URL limit has been reached
 * @throws UnexpectedValueException
 * @throws InvalidArgumentException if validate() rejects the arguments
 */
public function addURL(
    string $path,
    ?DateTime $lastModified = null,
    ?string $changeFrequency = null,
    ?float $priority = null,
    ?array $alternates = null,
    array $extensions = []
): SitemapGenerator {
    $this->validate($path, $changeFrequency, $priority, $extensions);

    if ($this->totalURLCount >= self::TOTAL_MAX_URLS) {
        throw new OutOfRangeException(
            sprintf("Max url limit reached (%d)", self::TOTAL_MAX_URLS)
        );
    }

    if (!$this->isSitemapStarted) {
        $this->writeSitemapStart();
    }

    $loc = $this->baseURL . $path;
    $this->writeSitemapUrl($loc, $lastModified, $changeFrequency, $priority, $alternates, $extensions);

    // Flush the buffer to disk every 1000 URLs overall, or once the current sitemap is full.
    $reachedFlushInterval = $this->totalURLCount % 1000 === 0;
    $sitemapIsFull = $this->sitemapURLCount >= $this->maxURLsPerSitemap;
    if ($reachedFlushInterval || $sitemapIsFull) {
        $this->flushWriter();
    }

    // flushWriter() may already have closed the sitemap (resetting the counter), hence the re-check.
    if ($this->sitemapURLCount === $this->maxURLsPerSitemap) {
        $this->writeSitemapEnd();
    }

    return $this;
}
|
|
|
|
|
2024-12-26 17:07:02 +01:00
|
|
|
/**
 * Begins a new sitemap document in the XML buffer: XML declaration, optional
 * xml-stylesheet instruction, generator comments and the opening <urlset>
 * tag with its namespace attributes. Marks the sitemap as started.
 */
protected function writeSitemapStart(): void
{
    $writer = $this->xmlWriter;

    $writer->startDocument("1.0", "UTF-8");

    if ($this->sitemapStylesheetLink !== '') {
        $stylesheet = sprintf('type="text/xsl" href="%s"', $this->sitemapStylesheetLink);
        $writer->writePi('xml-stylesheet', $stylesheet);
    }

    $writer->writeComment(sprintf('generator-class="%s"', get_class($this)));
    $writer->writeComment(sprintf('generator-version="%s"', $this->classVersion));
    $writer->writeComment(sprintf('generated-on="%s"', date('c')));

    $writer->startElement('urlset');
    $urlsetAttributes = [
        'xmlns' => 'http://www.sitemaps.org/schemas/sitemap/0.9',
        'xmlns:xhtml' => 'http://www.w3.org/1999/xhtml',
        'xmlns:video' => 'http://www.google.com/schemas/sitemap-video/1.1',
        'xmlns:image' => 'http://www.google.com/schemas/sitemap-image/1.1',
        'xmlns:xsi' => 'http://www.w3.org/2001/XMLSchema-instance',
        'xsi:schemaLocation' => 'http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd',
    ];
    foreach ($urlsetAttributes as $attributeName => $attributeValue) {
        $writer->writeAttribute($attributeName, $attributeValue);
    }

    $this->isSitemapStarted = true;
}
|
|
|
|
|
2024-12-26 17:07:02 +01:00
|
|
|
/**
 * Percent-encodes every byte outside the 0x20-0x7F range (the pattern has no
 * "u" modifier, so it matches byte by byte) and then escapes the XML/HTML
 * special characters, quotes included.
 *
 * @param string $url
 * @return string the encoded and entity-escaped URL
 * @throws UnexpectedValueException if the regular expression replacement fails
 */
private function encodeEscapeURL(string $url): string
{
    // Only non-ASCII (and control) bytes are encoded in place, like browsers do.
    $percentEncoded = preg_replace_callback(
        '/[^\x20-\x7f]/',
        static fn (array $match): string => urlencode($match[0]),
        $url
    );

    if (is_string($percentEncoded)) {
        return htmlspecialchars($percentEncoded, ENT_QUOTES, 'UTF-8');
    }

    throw new UnexpectedValueException('Failed to encode URL');
}
|
|
|
|
|
|
|
|
/**
 * Writes one <url> entry into the XML buffer and increments the URL counters.
 *
 * @param string $loc Absolute URL of the page.
 * @param DateTime|null $lastModified Written as <lastmod> in ATOM format when given.
 * @param string|null $changeFrequency Written as <changefreq> when given.
 * @param float|null $priority Written as <priority> with one decimal when given.
 * @param array|null $alternates List of arrays with 'hreflang' and 'href' keys; entries lacking either key are skipped.
 * @param array $extensions Extension data keyed by 'google_video' / 'google_image'; other keys are ignored.
 * @throws UnexpectedValueException if the URL cannot be encoded
 */
private function writeSitemapUrl(
    string $loc,
    ?DateTime $lastModified = null,
    ?string $changeFrequency = null,
    ?float $priority = null,
    ?array $alternates = null,
    array $extensions = []
): void {
    $this->xmlWriter->startElement('url');

    // encodeEscapeURL() already returns entity-escaped text. Passing it to
    // writeElement() would escape it a second time ("&" -> "&amp;amp;"), so the
    // value is written raw inside an explicitly opened <loc> element.
    $this->xmlWriter->startElement('loc');
    $this->xmlWriter->writeRaw($this->encodeEscapeURL($loc));
    $this->xmlWriter->endElement(); // loc

    if ($lastModified !== null) {
        $this->xmlWriter->writeElement('lastmod', $lastModified->format(DateTime::ATOM));
    }

    if ($changeFrequency !== null) {
        $this->xmlWriter->writeElement('changefreq', $changeFrequency);
    }

    if ($priority !== null) {
        $this->xmlWriter->writeElement('priority', number_format($priority, 1, ".", ""));
    }

    foreach ($alternates ?? [] as $alternate) {
        if (!is_array($alternate) || !isset($alternate['hreflang'], $alternate['href'])) {
            continue;
        }
        $this->xmlWriter->startElement('xhtml:link');
        $this->xmlWriter->writeAttribute('rel', 'alternate');
        $this->xmlWriter->writeAttribute('hreflang', $alternate['hreflang']);
        $this->xmlWriter->writeAttribute('href', $alternate['href']);
        $this->xmlWriter->endElement(); // xhtml:link
    }

    foreach ($extensions as $extName => $extFields) {
        if ($extName === 'google_video') {
            GoogleVideoExtension::writeVideoTag($this->xmlWriter, $loc, $extFields);
        }
        if ($extName === 'google_image') {
            GoogleImageExtension::writeImageTag($this->xmlWriter, $extFields);
        }
    }

    $this->xmlWriter->endElement(); // url

    $this->sitemapURLCount++;
    $this->totalURLCount++;
}
|
|
|
|
|
2024-12-26 17:07:02 +01:00
|
|
|
/**
 * Appends the buffered XML to the current temporary sitemap file and closes
 * that sitemap once it has grown to the maximum file size.
 *
 * Does nothing when the buffer is empty.
 */
private function flushWriter(): void
{
    $targetSitemapFilepath = $this->saveDirectory
        . sprintf($this->flushedSitemapFilenameFormat, $this->flushedSitemapCounter);
    $flushedString = $this->xmlWriter->outputMemory();

    // MAX_FILE_SIZE is a byte limit, so count bytes (strlen), not characters (mb_strlen).
    $flushedStringLen = strlen($flushedString);
    if ($flushedStringLen === 0) {
        return;
    }

    // The chunk has to reach the file BEFORE the sitemap is closed: writeSitemapEnd()
    // appends the closing </urlset> tag to this very file, and URL entries appended
    // after that tag would make the document invalid.
    $this->fs->file_put_contents($targetSitemapFilepath, $flushedString, FILE_APPEND);
    $this->flushedSitemapSize += $flushedStringLen;

    // NOTE(review): the limit is checked after the chunk has been written, so a file
    // can exceed it by up to one chunk — the amount of data per file is unchanged here.
    if ($this->flushedSitemapSize > self::MAX_FILE_SIZE - $this->urlsetClosingTagLen) {
        // Only close the sitemap; the next one is started lazily by addURL(). Starting it
        // eagerly here made flush() write a trailing sitemap file without any URL.
        $this->writeSitemapEnd();
    }
}
|
|
|
|
|
2024-12-26 17:07:02 +01:00
|
|
|
/**
 * Closes the current sitemap: appends the closing tags to its temporary file,
 * registers the file as flushed and resets the per-sitemap state so that the
 * next sitemap uses the next counter value.
 */
private function writeSitemapEnd(): void
{
    $sitemapFilepath = $this->saveDirectory
        . sprintf($this->flushedSitemapFilenameFormat, $this->flushedSitemapCounter);

    $this->xmlWriter->endElement(); // urlset
    $this->xmlWriter->endDocument();
    $closingMarkup = $this->xmlWriter->flush();
    $this->fs->file_put_contents($sitemapFilepath, $closingMarkup, FILE_APPEND);

    $this->flushedSitemaps[] = $sitemapFilepath;
    $this->flushedSitemapCounter++;

    $this->isSitemapStarted = false;
    $this->sitemapURLCount = 0;
    $this->flushedSitemapSize = 0;
}
|
|
|
|
|
|
|
|
/**
 * Writes all URLs still held in memory to disk and, if a sitemap document
 * is open, closes it.
 */
public function flush(): void
{
    $this->flushWriter();

    if (!$this->isSitemapStarted) {
        return;
    }

    $this->writeSitemapEnd();
}
|
|
|
|
|
|
|
|
/**
 * Moves the flushed temporary files to their final location, compressing
 * them if compression is enabled, and records the result in $generatedFiles.
 *
 * With a single flushed sitemap the file is named after $sitemapFileName and
 * no index is written. With several, each file gets a 1-based number inserted
 * before the extension (sitemap1.xml, sitemap2.xml, ...) and a sitemap index
 * referencing all of them is created.
 *
 * Keys written to $generatedFiles: 'sitemaps_location' (list of file paths),
 * 'sitemaps_index_url' and, only when an index is created,
 * 'sitemaps_index_location'.
 *
 * @throws RuntimeException if nothing has been flushed yet
 */
public function finalize(): void
{
    $this->generatedFiles = [];

    $flushedCount = count($this->flushedSitemaps);
    if ($flushedCount === 0) {
        throw new RuntimeException('failed to finalize, please add urls and flush first');
    }

    $compressionSuffix = $this->isCompressionEnabled ? '.gz' : '';

    if ($flushedCount === 1) {
        $targetSitemapFilename = $this->sitemapFileName . $compressionSuffix;
        $targetSitemapFilepath = $this->saveDirectory . $targetSitemapFilename;

        $this->publishFlushedSitemap($this->flushedSitemaps[0], $targetSitemapFilepath);

        $this->generatedFiles['sitemaps_location'] = [$targetSitemapFilepath];
        $this->generatedFiles['sitemaps_index_url'] = $this->sitemapIndexURL . '/' . $targetSitemapFilename;

        return;
    }

    // Split the configured name into base name and trailing extension. Only the trailing
    // extension is cut off, so a name that contains ".xml" elsewhere is left intact
    // (str_replace() used to rewrite every occurrence).
    $extension = pathinfo($this->sitemapFileName, PATHINFO_EXTENSION);
    $ext = $extension === '' ? '' : '.' . $extension;
    $baseName = ($ext !== '' && str_ends_with($this->sitemapFileName, $ext))
        ? substr($this->sitemapFileName, 0, -strlen($ext))
        : $this->sitemapFileName;

    $sitemapsUrls = [];
    $targetSitemapFilepaths = [];
    foreach ($this->flushedSitemaps as $i => $flushedSitemap) {
        $targetSitemapFilename = $baseName . ((int)$i + 1) . $ext . $compressionSuffix;
        $targetSitemapFilepath = $this->saveDirectory . $targetSitemapFilename;

        $this->publishFlushedSitemap($flushedSitemap, $targetSitemapFilepath);

        // Handed over unescaped: writeSitemapIndexUrl() escapes each URL when it writes
        // the index, so escaping here as well would only add another layer of entities.
        $sitemapsUrls[] = $this->sitemapIndexURL . '/' . $targetSitemapFilename;
        $targetSitemapFilepaths[] = $targetSitemapFilepath;
    }

    $targetSitemapIndexFilepath = $this->saveDirectory . $this->sitemapIndexFileName;
    $this->createSitemapIndex($sitemapsUrls, $targetSitemapIndexFilepath);

    $this->generatedFiles['sitemaps_location'] = $targetSitemapFilepaths;
    $this->generatedFiles['sitemaps_index_location'] = $targetSitemapIndexFilepath;
    $this->generatedFiles['sitemaps_index_url'] = $this->sitemapIndexURL . '/' . $this->sitemapIndexFileName;
}

/**
 * Moves one flushed temporary sitemap to its final path; when compression is
 * enabled the file is copied through the zlib stream wrapper instead and the
 * temporary file is deleted afterwards.
 */
private function publishFlushedSitemap(string $flushedSitemap, string $targetSitemapFilepath): void
{
    if ($this->isCompressionEnabled) {
        $this->fs->copy($flushedSitemap, 'compress.zlib://' . $targetSitemapFilepath);
        $this->fs->unlink($flushedSitemap);

        return;
    }

    $this->fs->rename($flushedSitemap, $targetSitemapFilepath);
}
|
|
|
|
|
2024-12-26 17:07:02 +01:00
|
|
|
/**
 * Builds the sitemap index document for the given sitemap URLs and writes it
 * to the given path.
 *
 * @param array $sitemapsUrls URLs of the individual sitemap files
 * @param string $sitemapIndexFileName Path of the index file to write
 */
private function createSitemapIndex(array $sitemapsUrls, string $sitemapIndexFileName): void
{
    // Drop whatever is still buffered so the index starts from an empty buffer.
    $this->xmlWriter->flush();

    $this->writeSitemapIndexStart();
    foreach ($sitemapsUrls as $url) {
        $this->writeSitemapIndexUrl($url);
    }
    $this->writeSitemapIndexEnd();

    $indexXml = $this->xmlWriter->flush();
    $this->fs->file_put_contents($sitemapIndexFileName, $indexXml);
}
|
|
|
|
|
2024-12-26 17:07:02 +01:00
|
|
|
/**
 * Begins the sitemap index document in the XML buffer: XML declaration,
 * generator comments and the opening <sitemapindex> tag with its attributes.
 */
protected function writeSitemapIndexStart(): void
{
    $writer = $this->xmlWriter;

    $writer->startDocument("1.0", "UTF-8");
    $writer->writeComment(sprintf('generator-class="%s"', get_class($this)));
    $writer->writeComment(sprintf('generator-version="%s"', $this->classVersion));
    $writer->writeComment(sprintf('generated-on="%s"', date('c')));

    $writer->startElement('sitemapindex');
    $indexAttributes = [
        'xmlns' => 'http://www.sitemaps.org/schemas/sitemap/0.9',
        'xmlns:xsi' => 'http://www.w3.org/2001/XMLSchema-instance',
        'xsi:schemaLocation' => 'http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd',
    ];
    foreach ($indexAttributes as $attributeName => $attributeValue) {
        $writer->writeAttribute($attributeName, $attributeValue);
    }
}
|
|
|
|
|
2024-12-26 17:07:02 +01:00
|
|
|
/**
 * Writes one <sitemap> entry (location plus the current time as <lastmod>)
 * into the sitemap index being built.
 *
 * @param string $url URL of a sitemap file
 * @throws UnexpectedValueException if the URL cannot be encoded
 */
private function writeSitemapIndexUrl(string $url): void
{
    $this->xmlWriter->startElement('sitemap');

    // encodeEscapeURL() already returns entity-escaped text. Passing it to
    // writeElement() would escape it once more, so it is written raw inside
    // an explicitly opened <loc> element.
    $this->xmlWriter->startElement('loc');
    $this->xmlWriter->writeRaw($this->encodeEscapeURL($url));
    $this->xmlWriter->endElement(); // loc

    $this->xmlWriter->writeElement('lastmod', date('c'));
    $this->xmlWriter->endElement(); // sitemap
}
|
|
|
|
|
2024-12-26 17:07:02 +01:00
|
|
|
/**
 * Closes the <sitemapindex> element and ends the index document.
 */
private function writeSitemapIndexEnd(): void
{
    $writer = $this->xmlWriter;
    $writer->endElement(); // sitemapindex
    $writer->endDocument();
}
|
|
|
|
|
|
|
|
/**
 * Returns what the last finalize() call recorded: the 'sitemaps_location'
 * and 'sitemaps_index_url' keys, plus 'sitemaps_index_location' when a
 * sitemap index was written. Empty until finalize() has succeeded.
 *
 * @return array Array of previously generated files
 */
public function getGeneratedFiles(): array
{
    return $this->generatedFiles;
}
|
|
|
|
|
|
|
|
/**
 * Informs search engines about the newly created sitemap.
 *
 * Every endpoint listed in $searchEngines is requested once, with the
 * URL-encoded sitemap (index) URL appended to it.
 *
 * @return array one entry per search engine with the keys 'site', 'fullsite', 'http_code' and 'message'
 * @access public
 * @throws BadMethodCallException if finalize() has not been called or the curl extension is missing
 * @throws RuntimeException if a curl call fails
 */
public function submitSitemap(): array
{
    if (count($this->generatedFiles) === 0) {
        throw new BadMethodCallException("To submit the sitemap, call finalize() first.");
    }
    if (!$this->runtime->extension_loaded('curl')) {
        throw new BadMethodCallException("curl extension is needed to do submission.");
    }

    // The sitemap URL becomes a query-string value, so it must be URL-encoded. HTML escaping
    // (used previously) leaves "&", "?" and "#" able to break the query apart.
    $encodedSitemapUrl = urlencode($this->generatedFiles['sitemaps_index_url']);

    $result = [];
    foreach ($this->searchEngines as $searchEngine) {
        $submitUrl = $searchEngine . $encodedSitemapUrl;

        $curlResource = $this->runtime->curl_init($submitUrl);
        if ($curlResource === false) {
            throw new RuntimeException("failed to execute curl_init for url " . $submitUrl);
        }
        if (!$this->runtime->curl_setopt($curlResource, CURLOPT_RETURNTRANSFER, true)) {
            throw new RuntimeException(
                "failed to set curl option CURLOPT_RETURNTRANSFER to true, error: "
                . $this->runtime->curl_error($curlResource)
            );
        }

        $responseContent = $this->runtime->curl_exec($curlResource);
        if ($responseContent === false) {
            throw new RuntimeException(
                "failed to run curl_exec, error: " . $this->runtime->curl_error($curlResource)
            );
        }
        $response = $this->runtime->curl_getinfo($curlResource);

        // Short site name = the last two labels of the endpoint's host name.
        $submitSiteShort = array_reverse(explode(".", parse_url($searchEngine, PHP_URL_HOST)));
        $result[] = [
            "site" => $submitSiteShort[1] . "." . $submitSiteShort[0],
            "fullsite" => $submitUrl,
            "http_code" => $response['http_code'],
            "message" => str_replace("\n", " ", strip_tags($responseContent)),
        ];
    }

    return $result;
}
|
|
|
|
|
|
|
|
/**
 * Puts the sitemap URL into the robots file located in the save directory.
 *
 * An existing robots file is rewritten with its old "Sitemap:" lines replaced
 * by the new one; if the file does not exist, it is created from the sample
 * content followed by the sitemap URL.
 *
 * @access public
 * @throws BadMethodCallException if finalize() has not been called yet
 * @throws RuntimeException if an existing robots file cannot be read
 */
public function updateRobots(): SitemapGenerator
{
    if ($this->generatedFiles === []) {
        throw new BadMethodCallException("To update robots.txt, call finalize() first.");
    }

    $robotsPath = $this->saveDirectory . $this->robotsFileName;
    $updatedContent = $this->createNewRobotsContentFromFile($robotsPath);
    $this->fs->file_put_contents($robotsPath, $updatedContent);

    return $this;
}
|
|
|
|
|
|
|
|
/**
 * Produces the new robots file content: the existing content without lines
 * starting with "Sitemap:" (or the sample content if the file does not
 * exist), followed by a "Sitemap:" line for the generated sitemap URL.
 *
 * @param string $filepath Path of the robots file
 * @return string
 * @throws RuntimeException if the file exists but cannot be read
 */
private function createNewRobotsContentFromFile(string $filepath): string
{
    if ($this->fs->file_exists($filepath)) {
        $existingContent = $this->fs->file_get_contents($filepath);
        // false means the file exists but could not be read
        if ($existingContent === false) {
            throw new RuntimeException("Failed to read existing robots.txt file: $filepath");
        }

        $lines = is_string($existingContent) ? explode(PHP_EOL, $existingContent) : [];
        $keptLines = array_filter(
            $lines,
            static fn (string $line): bool => !str_starts_with($line, 'Sitemap:')
        );

        // Every kept line is terminated with PHP_EOL again.
        $newContent = implode('', array_map(
            static fn (string $line): string => $line . PHP_EOL,
            $keptLines
        ));
    } else {
        $newContent = $this->getSampleRobotsContent();
    }

    return $newContent . "Sitemap: {$this->generatedFiles['sitemaps_index_url']}";
}
|
|
|
|
|
|
|
|
/**
 * Returns the sample robots lines joined by PHP_EOL, with a trailing PHP_EOL.
 *
 * @return string
 * @access private
 */
private function getSampleRobotsContent(): string
{
    $sampleLines = $this->sampleRobotsLines;

    return implode(PHP_EOL, $sampleLines) . PHP_EOL;
}
|
|
|
|
}
|