<?php

namespace Icamys\SitemapGenerator;

use BadMethodCallException;
use DateTime;
use Icamys\SitemapGenerator\Extensions\GoogleVideoExtension;
use InvalidArgumentException;
use OutOfRangeException;
use RuntimeException;
use XMLWriter;

/**
 * Streams sitemap XML to disk, then publishes the resulting files.
 *
 * Typical call order: addURL() (repeatedly) -> flush() -> finalize(), optionally followed by
 * updateRobots() and/or submitSitemap().
 *
 * URLs are not kept in memory: they are written through an in-memory XMLWriter whose buffer is
 * periodically appended to temporary "sm-{index}-{timestamp}.xml" files inside the base path.
 * finalize() then moves (or gzip-compresses) those files to their final names and, when there is
 * more than one, writes a sitemap index next to them.
 *
 * @package Icamys\SitemapGenerator
 */
class SitemapGenerator
{
    /**
     * Max size of a sitemap according to spec.
     * @see https://www.sitemaps.org/protocol.html
     */
    private const MAX_FILE_SIZE = 52428800;

    /**
     * Max number of urls per sitemap according to spec.
     * @see https://www.sitemaps.org/protocol.html
     */
    private const MAX_URLS_PER_SITEMAP = 50000;

    /**
     * Max number of sitemaps per index file according to spec.
     * @see http://www.sitemaps.org/protocol.html
     */
    private const MAX_SITEMAPS_PER_INDEX = 50000;

    /**
     * Total max number of URLs.
     */
    private const TOTAL_MAX_URLS = self::MAX_URLS_PER_SITEMAP * self::MAX_SITEMAPS_PER_INDEX;

    /**
     * Max url length according to spec.
     * @see https://www.sitemaps.org/protocol.html#xmlTagDefinitions
     */
    private const MAX_URL_LEN = 2048;

    /**
     * Robots file name, relative to the base path.
     * @var string
     */
    private $robotsFileName = "robots.txt";

    /**
     * Name of the sitemap file, relative to the base path.
     * @var string
     */
    private $sitemapFileName = "sitemap.xml";

    /**
     * Name of the sitemap index file, relative to the base path.
     * @var string
     */
    private $sitemapIndexFileName = "sitemap-index.xml";

    /**
     * Number of URLs after which the current sitemap file is closed and a new one is started.
     * Lower it if your URLs are long enough for a single file to approach the size limit.
     * @var int
     */
    private $maxUrlsPerSitemap = self::MAX_URLS_PER_SITEMAP;

    /**
     * When true, finalize() stores every sitemap gzip-compressed under a ".gz" suffix instead of
     * as plain XML. The sitemap index itself is always written uncompressed.
     * @var bool
     */
    private $isCompressionEnabled = false;

    /**
     * Site URL without a trailing slash; prepended to every added path.
     * @var string
     */
    private $baseURL;

    /**
     * Directory (with trailing separator, or an empty string) where all files are written.
     * @var string
     */
    private $basePath;

    /**
     * Version of this class, emitted as a comment in generated files.
     * @var string
     */
    private $classVersion = "4.3.1";

    /**
     * Search engine ping endpoints; the sitemap URL is appended to each one.
     * The first entry is a pair of Yahoo endpoints: [with app id, without app id].
     * @var array
     */
    private $searchEngines = [
        [
            "http://search.yahooapis.com/SiteExplorerService/V1/updateNotification?appid=USERID&url=",
            "http://search.yahooapis.com/SiteExplorerService/V1/ping?sitemap=",
        ],
        "http://www.google.com/ping?sitemap=",
        "http://submissions.ask.com/ping?sitemap=",
        "http://www.bing.com/ping?sitemap=",
        "http://www.webmaster.yandex.ru/ping?sitemap=",
    ];

    /**
     * Array with urls. Initialised by the constructor; not read anywhere else in this class.
     * @var array
     */
    private $urls;

    /**
     * Lines for robots.txt file that are written if file does not exist
     * @var array
     */
    private $sampleRobotsLines = [
        "User-agent: *",
        "Allow: /",
    ];

    /**
     * @var array list of valid changefreq values according to the spec
     */
    private $validChangefreqValues = [
        'always',
        'hourly',
        'daily',
        'weekly',
        'monthly',
        'yearly',
        'never',
    ];

    /**
     * @var float[] list of valid priority values according to the spec
     */
    private $validPriorities = [
        0.0,
        0.1,
        0.2,
        0.3,
        0.4,
        0.5,
        0.6,
        0.7,
        0.8,
        0.9,
        1.0,
    ];

    /**
     * @var FileSystem object used to communicate with file system
     */
    private $fs;

    /**
     * @var Runtime object used to communicate with runtime
     */
    private $runtime;

    /**
     * @var XMLWriter Used for writing xml to files
     */
    private $xmlWriter;

    /**
     * @var string sprintf() format of temporary sitemap file names; takes the sitemap counter
     */
    private $flushedSitemapFilenameFormat;

    /**
     * @var int number of bytes flushed so far into the sitemap file currently being written
     */
    private $flushedSitemapSize = 0;

    /**
     * @var int zero-based index of the sitemap file currently being written
     */
    private $flushedSitemapCounter = 0;

    /**
     * @var array paths of the temporary sitemap files that have been completed
     */
    private $flushedSitemaps = [];

    /**
     * @var bool whether a <urlset> element is currently open in the writer
     */
    private $isSitemapStarted = false;

    /**
     * @var int number of URLs added over the lifetime of this generator
     */
    private $totalUrlCount = 0;

    /**
     * @var int room reserved for the closing tag when checking the file size limit
     */
    private $urlsetClosingTagLen = 10; // strlen("</urlset>\n")

    /**
     * @var int number of URLs written to the sitemap file currently being written
     */
    private $sitemapUrlCount = 0;

    /**
     * @var array description of the files produced by the last finalize() call
     */
    private $generatedFiles = [];

    /**
     * @param string $baseURL Your site URL; a trailing slash is stripped.
     * @param string $basePath Directory where sitemap and robots files should be stored.
     * @param FileSystem|null $fs File system adapter; a default one is created when omitted.
     * @param Runtime|null $runtime Runtime adapter; a default one is created when omitted.
     * @throws InvalidArgumentException when the runtime reports $basePath as not writable
     */
    public function __construct(
        string $baseURL,
        string $basePath = "",
        ?FileSystem $fs = null,
        ?Runtime $runtime = null
    ) {
        $this->urls = [];
        $this->baseURL = rtrim($baseURL, '/');
        $this->fs = $fs ?? new FileSystem();
        $this->runtime = $runtime ?? new Runtime();

        // NOTE(review): if Runtime::is_writable() simply wraps PHP's is_writable(), the default
        // empty $basePath is rejected here - confirm whether "" should fall back to ".".
        if ($this->runtime->is_writable($basePath) === false) {
            throw new InvalidArgumentException(
                sprintf('the provided basePath (%s) should be a writable directory,', $basePath) .
                ' please check its existence and permissions'
            );
        }

        if (strlen($basePath) > 0 && substr($basePath, -1) != DIRECTORY_SEPARATOR) {
            $basePath .= DIRECTORY_SEPARATOR;
        }
        $this->basePath = $basePath;

        $this->xmlWriter = $this->createXmlWriter();
        // "%%d" survives this sprintf() as "%d" and later receives the sitemap counter.
        $this->flushedSitemapFilenameFormat = sprintf("sm-%%d-%d.xml", time());
    }

    /**
     * Creates the indenting in-memory XML writer used for all output.
     */
    private function createXmlWriter(): XMLWriter
    {
        $w = new XMLWriter();
        $w->openMemory();
        $w->setIndent(true);
        return $w;
    }

    /**
     * @param string $filename Non-empty file name with an "xml" extension.
     * @return SitemapGenerator
     * @throws InvalidArgumentException when the name is empty or lacks the xml extension
     */
    public function setSitemapFilename(string $filename = ''): SitemapGenerator
    {
        if (strlen($filename) === 0) {
            throw new InvalidArgumentException('sitemap filename should not be empty');
        }
        if (pathinfo($filename, PATHINFO_EXTENSION) !== 'xml') {
            throw new InvalidArgumentException('sitemap filename should have *.xml extension');
        }
        $this->sitemapFileName = $filename;
        return $this;
    }

    /**
     * @param string $filename Non-empty sitemap index file name.
     * @return $this
     * @throws InvalidArgumentException when the name is empty
     */
    public function setSitemapIndexFilename(string $filename = ''): SitemapGenerator
    {
        if (strlen($filename) === 0) {
            throw new InvalidArgumentException('filename should not be empty');
        }
        $this->sitemapIndexFileName = $filename;
        return $this;
    }

    /**
     * @param string $filename Non-empty robots file name.
     * @return $this
     * @throws InvalidArgumentException when the name is empty
     */
    public function setRobotsFileName(string $filename): SitemapGenerator
    {
        if (strlen($filename) === 0) {
            throw new InvalidArgumentException('filename should not be empty');
        }
        $this->robotsFileName = $filename;
        return $this;
    }

    /**
     * @param int $value Number of URLs per sitemap file, between 1 and the spec maximum.
     * @return $this
     * @throws OutOfRangeException when the value is outside the allowed range
     */
    public function setMaxUrlsPerSitemap(int $value): SitemapGenerator
    {
        if ($value < 1 || self::MAX_URLS_PER_SITEMAP < $value) {
            throw new OutOfRangeException(
                sprintf('value %d is out of range 1-%d', $value, self::MAX_URLS_PER_SITEMAP)
            );
        }
        $this->maxUrlsPerSitemap = $value;
        return $this;
    }

    /**
     * Makes finalize() store sitemaps gzip-compressed.
     * @return $this
     */
    public function enableCompression(): SitemapGenerator
    {
        $this->isCompressionEnabled = true;
        return $this;
    }

    /**
     * Makes finalize() store sitemaps as plain XML.
     * @return $this
     */
    public function disableCompression(): SitemapGenerator
    {
        $this->isCompressionEnabled = false;
        return $this;
    }

    /**
     * @return bool whether finalize() will compress sitemaps
     */
    public function isCompressionEnabled(): bool
    {
        return $this->isCompressionEnabled;
    }

    /**
     * Checks the components of a URL entry before it is written.
     *
     * Only $path, $changeFrequency, $priority and the 'google_video' extension are inspected;
     * $lastModified and $alternates are accepted for signature parity with addURL().
     *
     * @param string $path
     * @param DateTime|null $lastModified
     * @param string|null $changeFrequency
     * @param float|null $priority
     * @param array|null $alternates
     * @param array $extensions
     * @throws InvalidArgumentException when a checked argument is invalid
     */
    public function validate(
        string $path,
        ?DateTime $lastModified = null,
        ?string $changeFrequency = null,
        ?float $priority = null,
        ?array $alternates = null,
        array $extensions = []
    ) {
        $pathLength = mb_strlen($path);
        if ($pathLength < 1 || $pathLength > self::MAX_URL_LEN) {
            throw new InvalidArgumentException(
                sprintf("The urlPath argument length must be between 1 and %d.", self::MAX_URL_LEN)
            );
        }

        if ($changeFrequency !== null && !in_array($changeFrequency, $this->validChangefreqValues, true)) {
            throw new InvalidArgumentException(
                sprintf(
                    'The change frequency argument should be one of: %s',
                    implode(',', $this->validChangefreqValues)
                )
            );
        }

        if ($priority !== null && !in_array($priority, $this->validPriorities, true)) {
            throw new InvalidArgumentException("Priority argument should be a float number in the range [0.0..1.0]");
        }

        if (isset($extensions['google_video'])) {
            GoogleVideoExtension::validate($this->baseURL . $path, $extensions['google_video']);
        }
    }

    /**
     * Add url components.
     * Instead of storing all urls in the memory, the generator will flush sets of added urls
     * to the temporary files created on your disk.
     * The file format is 'sm-{index}-{timestamp}.xml'
     *
     * @param string $path Path appended to the base URL.
     * @param DateTime|null $lastModified
     * @param string|null $changeFrequency
     * @param float|null $priority
     * @param array|null $alternates List of ['hreflang' => ..., 'href' => ...] entries.
     * @param array $extensions Extension data keyed by extension name (e.g. 'google_video').
     * @return $this
     * @throws InvalidArgumentException when validate() rejects the arguments
     * @throws OutOfRangeException when the total URL limit has been reached
     */
    public function addURL(
        string $path,
        ?DateTime $lastModified = null,
        ?string $changeFrequency = null,
        ?float $priority = null,
        ?array $alternates = null,
        array $extensions = []
    ): SitemapGenerator {
        $this->validate($path, $lastModified, $changeFrequency, $priority, $alternates, $extensions);

        if ($this->totalUrlCount >= self::TOTAL_MAX_URLS) {
            throw new OutOfRangeException(
                sprintf("Max url limit reached (%d)", self::TOTAL_MAX_URLS)
            );
        }

        if ($this->isSitemapStarted === false) {
            $this->writeSitemapStart();
        }

        $this->writeSitemapUrl(
            $this->baseURL . $path,
            $lastModified,
            $changeFrequency,
            $priority,
            $alternates,
            $extensions
        );

        // Keep the in-memory buffer small: spill it every 1000 URLs and whenever a file is full.
        if ($this->totalUrlCount % 1000 === 0 || $this->sitemapUrlCount >= $this->maxUrlsPerSitemap) {
            $this->flushWriter();
        }

        // flushWriter() may already have closed the file on size; the counter is 0 in that case.
        if ($this->sitemapUrlCount === $this->maxUrlsPerSitemap) {
            $this->writeSitemapEnd();
        }

        return $this;
    }

    /**
     * Writes the XML declaration and the generator comments shared by sitemaps and the index.
     */
    private function writeDocumentPreamble(): void
    {
        $this->xmlWriter->startDocument("1.0", "UTF-8");
        $this->xmlWriter->writeComment(sprintf('generator-class="%s"', get_class($this)));
        $this->xmlWriter->writeComment(sprintf('generator-version="%s"', $this->classVersion));
        $this->xmlWriter->writeComment(sprintf('generated-on="%s"', date('c')));
    }

    /**
     * Opens a new sitemap document and its <urlset> element in the writer buffer.
     */
    private function writeSitemapStart(): void
    {
        $this->writeDocumentPreamble();
        $this->xmlWriter->startElement('urlset');
        $this->xmlWriter->writeAttribute('xmlns', 'http://www.sitemaps.org/schemas/sitemap/0.9');
        $this->xmlWriter->writeAttribute('xmlns:xhtml', 'http://www.w3.org/1999/xhtml');
        $this->xmlWriter->writeAttribute('xmlns:video', 'http://www.google.com/schemas/sitemap-video/1.1');
        $this->xmlWriter->writeAttribute('xmlns:xsi', 'http://www.w3.org/2001/XMLSchema-instance');
        $this->xmlWriter->writeAttribute(
            'xsi:schemaLocation',
            'http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd'
        );
        $this->isSitemapStarted = true;
    }

    /**
     * Writes one <url> element to the writer buffer and bumps the URL counters.
     *
     * @param string $loc Absolute URL of the page.
     * @param DateTime|null $lastModified
     * @param string|null $changeFrequency
     * @param float|null $priority
     * @param array|null $alternates
     * @param array $extensions
     */
    private function writeSitemapUrl(
        string $loc,
        ?DateTime $lastModified,
        ?string $changeFrequency,
        ?float $priority,
        ?array $alternates,
        array $extensions
    ): void {
        $this->xmlWriter->startElement('url');
        // XMLWriter escapes text content itself; escaping beforehand would turn "&" into "&amp;amp;".
        $this->xmlWriter->writeElement('loc', $loc);

        if ($lastModified !== null) {
            $this->xmlWriter->writeElement('lastmod', $lastModified->format(DateTime::ATOM));
        }

        if ($changeFrequency !== null) {
            $this->xmlWriter->writeElement('changefreq', $changeFrequency);
        }

        if ($priority !== null) {
            $this->xmlWriter->writeElement('priority', number_format($priority, 1, ".", ""));
        }

        foreach ($alternates ?? [] as $alternate) {
            // Entries that are not arrays or lack either key are silently skipped.
            if (!is_array($alternate) || !isset($alternate['hreflang'], $alternate['href'])) {
                continue;
            }
            $this->xmlWriter->startElement('xhtml:link');
            $this->xmlWriter->writeAttribute('rel', 'alternate');
            $this->xmlWriter->writeAttribute('hreflang', $alternate['hreflang']);
            $this->xmlWriter->writeAttribute('href', $alternate['href']);
            $this->xmlWriter->endElement();
        }

        foreach ($extensions as $extName => $extFields) {
            if ($extName === 'google_video') {
                GoogleVideoExtension::writeVideoTag($this->xmlWriter, $loc, $extFields);
            }
        }

        $this->xmlWriter->endElement(); // url
        $this->sitemapUrlCount++;
        $this->totalUrlCount++;
    }

    /**
     * Returns the path of the temporary file that the current sitemap is written to.
     */
    private function currentFlushedSitemapFilepath(): string
    {
        return $this->basePath . sprintf($this->flushedSitemapFilenameFormat, $this->flushedSitemapCounter);
    }

    /**
     * Appends the writer buffer to the current temporary sitemap file.
     *
     * When the file has grown past the size limit (minus room for the closing tag) it is closed
     * afterwards; the next addURL() call then starts a new file. The check runs after the write,
     * so a file can overshoot the threshold by at most the chunk that was just flushed.
     */
    private function flushWriter(): void
    {
        $flushedString = $this->xmlWriter->outputMemory(true);
        // The limit is expressed in bytes, so measure bytes rather than characters.
        $flushedStringLen = strlen($flushedString);

        if ($flushedStringLen === 0) {
            return;
        }

        // The pending URLs must reach the file before its closing tag does.
        $this->fs->file_put_contents($this->currentFlushedSitemapFilepath(), $flushedString, FILE_APPEND);
        $this->flushedSitemapSize += $flushedStringLen;

        if ($this->flushedSitemapSize > self::MAX_FILE_SIZE - $this->urlsetClosingTagLen) {
            $this->writeSitemapEnd();
        }
    }

    /**
     * Closes the open <urlset>, appends what is left in the buffer to the current temporary file,
     * records that file as completed and resets the per-file state.
     */
    private function writeSitemapEnd(): void
    {
        $targetSitemapFilepath = $this->currentFlushedSitemapFilepath();
        $this->xmlWriter->endElement(); // urlset
        $this->xmlWriter->endDocument();
        $this->fs->file_put_contents($targetSitemapFilepath, $this->xmlWriter->flush(true), FILE_APPEND);

        $this->isSitemapStarted = false;
        $this->flushedSitemaps[] = $targetSitemapFilepath;
        $this->flushedSitemapCounter++;
        $this->sitemapUrlCount = 0;
        $this->flushedSitemapSize = 0;
    }

    /**
     * Flush all stored urls from memory to the disk and close all necessary tags.
     */
    public function flush()
    {
        $this->flushWriter();
        if ($this->isSitemapStarted) {
            $this->writeSitemapEnd();
        }
    }

    /**
     * Move flushed files to their final location. Compress if necessary.
     *
     * With a single flushed sitemap the file is published under the sitemap file name. With
     * several, each one gets a 1-based number inserted before the extension and a sitemap index
     * is written as well. The result is available through getGeneratedFiles().
     *
     * @throws RuntimeException when no sitemap has been flushed yet
     */
    public function finalize()
    {
        $this->generatedFiles = [];
        $flushedCount = count($this->flushedSitemaps);

        if ($flushedCount === 0) {
            throw new RuntimeException('failed to finalize, please add urls and flush first');
        }

        $compressionSuffix = $this->isCompressionEnabled ? '.gz' : '';

        if ($flushedCount === 1) {
            $targetSitemapFilename = $this->sitemapFileName . $compressionSuffix;
            $targetSitemapFilepath = $this->basePath . $targetSitemapFilename;
            $this->publishFlushedSitemap($this->flushedSitemaps[0], $targetSitemapFilepath);

            $this->generatedFiles['sitemaps_location'] = [$targetSitemapFilepath];
            $this->generatedFiles['sitemaps_index_url'] = $this->baseURL . '/' . $targetSitemapFilename;
            return;
        }

        $ext = '.' . pathinfo($this->sitemapFileName, PATHINFO_EXTENSION);
        // Strip only the trailing extension so a name such as "a.xml.b.xml" is numbered correctly.
        $nameWithoutExt = substr($this->sitemapFileName, 0, -strlen($ext));

        $sitemapsUrls = [];
        $targetSitemapFilepaths = [];
        foreach ($this->flushedSitemaps as $i => $flushedSitemap) {
            $targetSitemapFilename = $nameWithoutExt . ($i + 1) . $ext . $compressionSuffix;
            $targetSitemapFilepath = $this->basePath . $targetSitemapFilename;
            $this->publishFlushedSitemap($flushedSitemap, $targetSitemapFilepath);

            // Kept raw: the XML writer escapes the URL when the index is written.
            $sitemapsUrls[] = $this->baseURL . '/' . $targetSitemapFilename;
            $targetSitemapFilepaths[] = $targetSitemapFilepath;
        }

        $targetSitemapIndexFilepath = $this->basePath . $this->sitemapIndexFileName;
        $this->createSitemapIndex($sitemapsUrls, $targetSitemapIndexFilepath);

        $this->generatedFiles['sitemaps_location'] = $targetSitemapFilepaths;
        $this->generatedFiles['sitemaps_index_location'] = $targetSitemapIndexFilepath;
        $this->generatedFiles['sitemaps_index_url'] = $this->baseURL . '/' . $this->sitemapIndexFileName;
    }

    /**
     * Moves one temporary sitemap file to its final path, gzip-compressing it when enabled.
     *
     * @param string $flushedFilepath Temporary file to publish.
     * @param string $targetFilepath Final path of the sitemap.
     */
    private function publishFlushedSitemap(string $flushedFilepath, string $targetFilepath): void
    {
        if ($this->isCompressionEnabled) {
            // Copying through the compress.zlib:// wrapper gzips the content on the way.
            $this->fs->copy($flushedFilepath, 'compress.zlib://' . $targetFilepath);
            $this->fs->unlink($flushedFilepath);
            return;
        }

        $this->fs->rename($flushedFilepath, $targetFilepath);
    }

    /**
     * Writes a sitemap index document listing the given sitemap URLs.
     *
     * @param array $sitemapsUrls Absolute, unescaped URLs of the published sitemaps.
     * @param string $sitemapIndexFileName Path of the index file to write.
     */
    private function createSitemapIndex($sitemapsUrls, $sitemapIndexFileName): void
    {
        // Drop whatever is still buffered so the index starts from a clean document.
        $this->xmlWriter->flush(true);
        $this->writeSitemapIndexStart();
        foreach ($sitemapsUrls as $sitemapsUrl) {
            $this->writeSitemapIndexUrl($sitemapsUrl);
        }
        $this->writeSitemapIndexEnd();

        // Overwrite: appending to an index left by an earlier run would yield two XML documents.
        $this->fs->file_put_contents($sitemapIndexFileName, $this->xmlWriter->flush(true));
    }

    /**
     * Opens the sitemap index document and its <sitemapindex> element.
     */
    private function writeSitemapIndexStart(): void
    {
        $this->writeDocumentPreamble();
        $this->xmlWriter->startElement('sitemapindex');
        $this->xmlWriter->writeAttribute('xmlns', 'http://www.sitemaps.org/schemas/sitemap/0.9');
        $this->xmlWriter->writeAttribute('xmlns:xsi', 'http://www.w3.org/2001/XMLSchema-instance');
        $this->xmlWriter->writeAttribute(
            'xsi:schemaLocation',
            'http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd'
        );
    }

    /**
     * Writes one <sitemap> entry, stamped with the current time as its lastmod.
     *
     * @param string $url Absolute, unescaped URL of a sitemap file.
     */
    private function writeSitemapIndexUrl($url): void
    {
        $this->xmlWriter->startElement('sitemap');
        $this->xmlWriter->writeElement('loc', $url); // escaped by XMLWriter
        $this->xmlWriter->writeElement('lastmod', date('c'));
        $this->xmlWriter->endElement(); // sitemap
    }

    /**
     * Closes the <sitemapindex> element and the document.
     */
    private function writeSitemapIndexEnd(): void
    {
        $this->xmlWriter->endElement(); // sitemapindex
        $this->xmlWriter->endDocument();
    }

    /**
     * Describes the files produced by the last finalize() call.
     *
     * Keys: 'sitemaps_location' (list of sitemap file paths), 'sitemaps_index_url' (URL of the
     * index, or of the only sitemap when there is just one) and, when an index was written,
     * 'sitemaps_index_location' (its file path). Empty until finalize() has run.
     *
     * @return array Array of previously generated files
     */
    public function getGeneratedFiles(): array
    {
        return $this->generatedFiles;
    }

    /**
     * Pings every configured search engine endpoint with the generated sitemap (index) URL.
     *
     * The Yahoo endpoint that takes an app id is used when $yahooAppId is given, the anonymous
     * one otherwise.
     *
     * @param string $yahooAppId Your site Yahoo appid.
     * @return array one entry per endpoint with the keys 'site', 'fullsite', 'http_code', 'message'
     * @throws BadMethodCallException when finalize() has not run or the cURL extension is missing
     */
    public function submitSitemap($yahooAppId = null): array
    {
        if (count($this->generatedFiles) === 0) {
            throw new BadMethodCallException("To submit sitemap, call finalize() first.");
        }
        if (!$this->runtime->extension_loaded('curl')) {
            throw new BadMethodCallException("cURL extension is needed to do submission.");
        }

        $searchEngines = $this->searchEngines;
        $searchEngines[0] = isset($yahooAppId)
            ? str_replace("USERID", $yahooAppId, $searchEngines[0][0])
            : $searchEngines[0][1];

        // The sitemap URL travels as a query-string value, so it is URL-encoded, not HTML-escaped.
        $encodedSitemapUrl = urlencode($this->generatedFiles['sitemaps_index_url']);

        $result = [];
        foreach ($searchEngines as $searchEngine) {
            $submitUrl = $searchEngine . $encodedSitemapUrl;
            $submitSite = $this->runtime->curl_init($submitUrl);
            $this->runtime->curl_setopt($submitSite, CURLOPT_RETURNTRANSFER, true);
            $responseContent = $this->runtime->curl_exec($submitSite);
            $response = $this->runtime->curl_getinfo($submitSite);

            // Reduce the host to its last two labels, e.g. "www.google.com" -> "google.com".
            $submitSiteShort = array_reverse(explode(".", parse_url($searchEngine, PHP_URL_HOST)));
            $result[] = [
                "site" => $submitSiteShort[1] . "." . $submitSiteShort[0],
                "fullsite" => $submitUrl,
                "http_code" => $response['http_code'],
                "message" => str_replace("\n", " ", strip_tags($responseContent)),
            ];
        }
        return $result;
    }

    /**
     * Adds sitemap url to robots.txt file located in basePath.
     * If robots.txt file exists, its "Sitemap:" lines are replaced by the new sitemap url.
     * If robots.txt does not exist, it is created with sample content and the sitemap url.
     *
     * @return $this
     * @throws BadMethodCallException when finalize() has not been called yet
     */
    public function updateRobots(): SitemapGenerator
    {
        if (count($this->generatedFiles) === 0) {
            throw new BadMethodCallException("To update robots.txt, call finalize() first.");
        }

        $robotsFilePath = $this->basePath . $this->robotsFileName;
        $this->fs->file_put_contents($robotsFilePath, $this->createNewRobotsContentFromFile($robotsFilePath));

        return $this;
    }

    /**
     * Builds the new robots.txt content: the existing lines without any "Sitemap:" line (or the
     * sample content when the file does not exist), followed by the current sitemap url.
     *
     * @param string $filepath Path of the robots file.
     * @return string
     */
    private function createNewRobotsContentFromFile($filepath): string
    {
        if ($this->fs->file_exists($filepath)) {
            $robotsFileContent = "";
            foreach (explode(PHP_EOL, $this->fs->file_get_contents($filepath)) as $line) {
                if (strncmp($line, 'Sitemap:', 8) === 0) {
                    continue;
                }
                $robotsFileContent .= $line . PHP_EOL;
            }
        } else {
            $robotsFileContent = $this->getSampleRobotsContent();
        }

        $robotsFileContent .= "Sitemap: {$this->generatedFiles['sitemaps_index_url']}";

        return $robotsFileContent;
    }

    /**
     * @return string sample robots.txt lines, each terminated by PHP_EOL
     */
    private function getSampleRobotsContent(): string
    {
        return implode(PHP_EOL, $this->sampleRobotsLines) . PHP_EOL;
    }
}