ZwiiCMS/core/class/SitemapGenerator.class.php
2020-06-30 20:13:51 +02:00

541 lines
19 KiB
PHP

<?php
namespace Icamys\SitemapGenerator;
class SitemapGenerator
{
const MAX_FILE_SIZE = 10485760;
const MAX_URLS_PER_SITEMAP = 50000;
const URL_PARAM_LOC = 0;
const URL_PARAM_LASTMOD = 1;
const URL_PARAM_CHANGEFREQ = 2;
const URL_PARAM_PRIORITY = 3;
/**
* Name of sitemap file
* @var string
* @access public
*/
public $sitemapFileName = "sitemap.xml";
/**
* Name of sitemap index file
* @var string
* @access public
*/
public $sitemapIndexFileName = "sitemap-index.xml";
/**
* Robots file name
* @var string
* @access public
*/
public $robotsFileName = "robots.txt";
/**
* Quantity of URLs per single sitemap file.
* According to specification max value is 50.000.
* If Your links are very long, sitemap file can be bigger than 10MB,
* in this case use smaller value.
* @var int
* @access public
*/
public $maxURLsPerSitemap = self::MAX_URLS_PER_SITEMAP;
/**
* Quantity of sitemaps per index file.
* According to specification max value is 50.000
* If Your index file is very long, index file can be bigger than 10MB,
* in this case use smaller value.
* @see http://www.sitemaps.org/protocol.html
* @var int
* @access public
*/
public $maxSitemaps = 50000;
/**
* If true, two sitemap files (.xml and .xml.gz) will be created and added to robots.txt.
* If true, .gz file will be submitted to search engines.
* If quantity of URLs will be bigger than 50.000, option will be ignored,
* all sitemap files except sitemap index will be compressed.
* @var bool
* @access public
*/
public $createGZipFile = false;
/**
* URL to Your site.
* Script will use it to send sitemaps to search engines.
* @var string
* @access private
*/
private $baseURL;
/**
* Base path. Relative to script location.
* Use this if Your sitemap and robots files should be stored in other
* directory then script.
* @var string
* @access private
*/
private $basePath;
/**
* Version of this class
* @var string
* @access private
*/
private $classVersion = "1.0.0";
/**
* Search engines URLs
* @var array of strings
* @access private
*/
private $searchEngines = array(
array(
"http://search.yahooapis.com/SiteExplorerService/V1/updateNotification?appid=USERID&url=",
"http://search.yahooapis.com/SiteExplorerService/V1/ping?sitemap="
),
"http://www.google.com/webmasters/tools/ping?sitemap=",
"http://submissions.ask.com/ping?sitemap=",
"http://www.bing.com/webmaster/ping.aspx?siteMap="
);
/**
* Array with urls
* @var \SplFixedArray of strings
* @access private
*/
private $urls;
/**
* Array with sitemap
* @var array of strings
* @access private
*/
private $sitemaps;
/**
* Array with sitemap index
* @var array of strings
* @access private
*/
private $sitemapIndex;
/**
* Current sitemap full URL
* @var string
* @access private
*/
private $sitemapFullURL;
/**
* @var \DOMDocument
*/
private $document;
/**
* Constructor.
* @param string $baseURL You site URL, with / at the end.
* @param string|null $basePath Relative path where sitemap and robots should be stored.
*/
public function __construct($baseURL, $basePath = "")
{
$this->urls = new \SplFixedArray();
$this->baseURL = $baseURL;
$this->basePath = $basePath;
$this->document = new \DOMDocument("1.0");
$this->document->preserveWhiteSpace = false;
$this->document->formatOutput = true;
}
/**
* Use this to add many URL at one time.
* Each inside array can have 1 to 4 fields.
* @param $urlsArray
* @throws \InvalidArgumentException
*/
public function addUrls($urlsArray)
{
if (!is_array($urlsArray)) {
throw new \InvalidArgumentException("Array as argument should be given.");
}
foreach ($urlsArray as $url) {
$this->addUrl(
isset($url[0]) ? $url[0] : null,
isset($url[1]) ? $url[1] : null,
isset($url[2]) ? $url[2] : null,
isset($url[3]) ? $url[3] : null
);
}
}
/**
* Use this to add single URL to sitemap.
* @param string $url URL
* @param \DateTime $lastModified When it was modified, use ISO 8601
* @param string $changeFrequency How often search engines should revisit this URL
* @param string $priority Priority of URL on You site
* @see http://en.wikipedia.org/wiki/ISO_8601
* @see http://php.net/manual/en/function.date.php
* @throws \InvalidArgumentException
*/
public function addUrl($url, \DateTime $lastModified = null, $changeFrequency = null, $priority = null)
{
if ($url == null) {
throw new \InvalidArgumentException("URL is mandatory. At least one argument should be given.");
}
$urlLength = extension_loaded('mbstring') ? mb_strlen($url) : strlen($url);
if ($urlLength > 2048) {
throw new \InvalidArgumentException(
"URL length can't be bigger than 2048 characters.
Note, that precise url length check is guaranteed only using mb_string extension.
Make sure Your server allow to use mbstring extension."
);
}
$tmp = new \SplFixedArray(1);
$tmp[self::URL_PARAM_LOC] = $url;
if (isset($lastModified)) {
$tmp->setSize(2);
$tmp[self::URL_PARAM_LASTMOD] = $lastModified->format(\DateTime::ATOM);
}
if (isset($changeFrequency)) {
$tmp->setSize(3);
$tmp[self::URL_PARAM_CHANGEFREQ] = $changeFrequency;
}
if (isset($priority)) {
$tmp->setSize(4);
$tmp[self::URL_PARAM_PRIORITY] = $priority;
}
if ($this->urls->getSize() === 0) {
$this->urls->setSize(1);
} else {
if ($this->urls->getSize() === $this->urls->key()) {
$this->urls->setSize($this->urls->getSize() * 2);
}
}
$this->urls[$this->urls->key()] = $tmp;
$this->urls->next();
}
/**
* @throws \BadMethodCallException
* @throws \InvalidArgumentException
* @throws \LengthException
*/
public function createSitemap()
{
if (!isset($this->urls)) {
throw new \BadMethodCallException("To create sitemap, call addUrl or addUrls function first.");
}
if ($this->maxURLsPerSitemap > self::MAX_URLS_PER_SITEMAP) {
throw new \InvalidArgumentException(
"More than " . self::MAX_URLS_PER_SITEMAP . " URLs per single sitemap is not allowed."
);
}
$generatorInfo = '<!-- generated-on="' . date('c') . '" -->';
$sitemapHeader = '<?xml version="1.0" encoding="UTF-8"?>' . $generatorInfo . '
<urlset
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"' . "\r\n" . '
xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9' . "\n" . '
http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd"' . "\n" . '
xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
</urlset>';
$sitemapIndexHeader = '<?xml version="1.0" encoding="UTF-8"?>' . $generatorInfo . '
<sitemapindex
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9
http://www.sitemaps.org/schemas/sitemap/0.9/siteindex.xsd"
xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
</sitemapindex>';
$nullUrls = 0;
foreach ($this->urls as $url) {
if (is_null($url)) {
$nullUrls++;
}
}
$nonEmptyUrls = $this->urls->getSize() - $nullUrls;
$chunks = ceil($nonEmptyUrls / $this->maxURLsPerSitemap);
for ($chunkCounter = 0; $chunkCounter < $chunks; $chunkCounter++) {
$xml = new \SimpleXMLElement($sitemapHeader);
for ($urlCounter = $chunkCounter * $this->maxURLsPerSitemap;
$urlCounter < ($chunkCounter + 1) * $this->maxURLsPerSitemap && $urlCounter < $nonEmptyUrls;
$urlCounter++
) {
$row = $xml->addChild('url');
$row->addChild(
'loc',
htmlspecialchars($this->baseURL . $this->urls[$urlCounter][self::URL_PARAM_LOC], ENT_QUOTES, 'UTF-8')
);
if ($this->urls[$urlCounter]->getSize() > 1) {
$row->addChild('lastmod', $this->urls[$urlCounter][self::URL_PARAM_LASTMOD]);
}
if ($this->urls[$urlCounter]->getSize() > 2) {
$row->addChild('changefreq', $this->urls[$urlCounter][self::URL_PARAM_CHANGEFREQ]);
}
if ($this->urls[$urlCounter]->getSize() > 3) {
$row->addChild('priority', $this->urls[$urlCounter][self::URL_PARAM_PRIORITY]);
}
}
if (strlen($xml->asXML()) > self::MAX_FILE_SIZE) {
throw new \LengthException(
"Sitemap size equals to " . strlen($xml->asXML())
. " bytes is more than 10MB (" . self::MAX_FILE_SIZE . " bytes),
please decrease maxURLsPerSitemap variable."
);
}
$this->sitemaps[] = $xml->asXML();
}
if (count($this->sitemaps) > $this->maxSitemaps) {
throw new \LengthException(
"Sitemap index can contain {$this->maxSitemaps} sitemaps.
Perhaps You trying to submit too many maps."
);
}
if (count($this->sitemaps) > 1) {
for ($i = 0; $i < count($this->sitemaps); $i++) {
$this->sitemaps[$i] = array(
str_replace(".xml", ($i + 1) . ".xml", $this->sitemapFileName),
$this->sitemaps[$i]
);
}
$xml = new \SimpleXMLElement($sitemapIndexHeader);
foreach ($this->sitemaps as $sitemap) {
$row = $xml->addChild('sitemap');
$row->addChild('loc', $this->baseURL . "/" . $this->getSitemapFileName(htmlentities($sitemap[0])));
$row->addChild('lastmod', date('c'));
}
$this->sitemapFullURL = $this->baseURL . "/" . $this->sitemapIndexFileName;
$this->sitemapIndex = array(
$this->sitemapIndexFileName,
$xml->asXML()
);
} else {
$this->sitemapFullURL = $this->baseURL . "/" . $this->getSitemapFileName();
$this->sitemaps[0] = array(
$this->sitemapFileName,
$this->sitemaps[0]
);
}
}
/**
* Returns created sitemaps as array of strings.
* Use it You want to work with sitemap without saving it as files.
* @return array of strings
* @access public
*/
public function toArray()
{
if (isset($this->sitemapIndex)) {
return array_merge(array($this->sitemapIndex), $this->sitemaps);
} else {
return $this->sitemaps;
}
}
/**
* Will write sitemaps as files.
* @access public
* @throws \BadMethodCallException
*/
public function writeSitemap()
{
if (!isset($this->sitemaps)) {
throw new \BadMethodCallException("To write sitemap, call createSitemap function first.");
}
if (isset($this->sitemapIndex)) {
$this->document->loadXML($this->sitemapIndex[1]);
$this->writeFile($this->document->saveXML(), $this->basePath, $this->sitemapIndex[0], true);
foreach ($this->sitemaps as $sitemap) {
$this->writeFile($sitemap[1], $this->basePath, $sitemap[0]);
}
} else {
$this->document->loadXML($this->sitemaps[0][1]);
$this->writeFile($this->document->saveXML(), $this->basePath, $this->sitemaps[0][0], true);
$this->writeFile($this->sitemaps[0][1], $this->basePath, $this->sitemaps[0][0]);
}
}
private function getSitemapFileName($name = null)
{
if (!$name) {
$name = $this->sitemapFileName;
}
if ($this->createGZipFile) {
$name .= ".gz";
}
return $name;
}
/**
* Save file.
* @param string $content
* @param string $filePath
* @param string $fileName
* @param bool $noGzip
* @return bool
* @access private
*/
private function writeFile($content, $filePath, $fileName, $noGzip = false)
{
if (!$noGzip && $this->createGZipFile) {
return $this->writeGZipFile($content, $filePath, $fileName);
}
$file = fopen($filePath . $fileName, 'w');
fwrite($file, $content);
return fclose($file);
}
/**
* Save GZipped file.
* @param string $content
* @param string $filePath
* @param string $fileName
* @return bool
* @access private
*/
private function writeGZipFile($content, $filePath, $fileName)
{
$fileName .= '.gz';
$file = gzopen($filePath . $fileName, 'w');
gzwrite($file, $content);
return gzclose($file);
}
/**
* If robots.txt file exist, will update information about newly created sitemaps.
* If there is no robots.txt will, create one and put into it information about sitemaps.
* @access public
* @throws \BadMethodCallException
*/
public function updateRobots()
{
if (!isset($this->sitemaps)) {
throw new \BadMethodCallException("To update robots.txt, call createSitemap function first.");
}
$sampleRobotsFile = "User-agent: *\nAllow: /";
if (file_exists($this->basePath . $this->robotsFileName)) {
$robotsFile = explode("\n", file_get_contents($this->basePath . $this->robotsFileName));
$robotsFileContent = "";
foreach ($robotsFile as $key => $value) {
if (substr($value, 0, 8) == 'Sitemap:') {
unset($robotsFile[$key]);
} else {
$robotsFileContent .= $value . "\n";
}
}
$robotsFileContent .= "Sitemap: $this->sitemapFullURL";
if (!isset($this->sitemapIndex)) {
$robotsFileContent .= "\nSitemap: " . $this->getSitemapFileName($this->sitemapFullURL);
}
file_put_contents($this->basePath . $this->robotsFileName, $robotsFileContent);
} else {
$sampleRobotsFile = $sampleRobotsFile . "\n\nSitemap: " . $this->sitemapFullURL;
if (!isset($this->sitemapIndex)) {
$sampleRobotsFile .= "\nSitemap: " . $this->getSitemapFileName($this->sitemapFullURL);
}
file_put_contents($this->basePath . $this->robotsFileName, $sampleRobotsFile);
}
}
/**
* Will inform search engines about newly created sitemaps.
* Google, Ask, Bing and Yahoo will be noticed.
* If You don't pass yahooAppId, Yahoo still will be informed,
* but this method can be used once per day. If You will do this often,
* message that limit was exceeded will be returned from Yahoo.
* @param string $yahooAppId Your site Yahoo appid.
* @return array of messages and http codes from each search engine
* @access public
* @throws \BadMethodCallException
*/
public function submitSitemap($yahooAppId = null)
{
if (!isset($this->sitemaps)) {
throw new \BadMethodCallException("To submit sitemap, call createSitemap function first.");
}
if (!extension_loaded('curl')) {
throw new \BadMethodCallException("cURL library is needed to do submission.");
}
$searchEngines = $this->searchEngines;
$searchEngines[0] = isset($yahooAppId) ?
str_replace("USERID", $yahooAppId, $searchEngines[0][0]) :
$searchEngines[0][1];
$result = array();
for ($i = 0; $i < count($searchEngines); $i++) {
$submitSite = curl_init($searchEngines[$i] . htmlspecialchars($this->sitemapFullURL, ENT_QUOTES, 'UTF-8'));
curl_setopt($submitSite, CURLOPT_RETURNTRANSFER, true);
$responseContent = curl_exec($submitSite);
$response = curl_getinfo($submitSite);
$submitSiteShort = array_reverse(explode(".", parse_url($searchEngines[$i], PHP_URL_HOST)));
$result[] = array(
"site" => $submitSiteShort[1] . "." . $submitSiteShort[0],
"fullsite" => $searchEngines[$i] . htmlspecialchars($this->sitemapFullURL, ENT_QUOTES, 'UTF-8'),
"http_code" => $response['http_code'],
"message" => str_replace("\n", " ", strip_tags($responseContent))
);
}
return $result;
}
/**
* Returns array of URLs
*
* Converts internal SplFixedArray to array
* @return array
*/
public function getUrls()
{
$urls = $this->urls->toArray();
/**
* @var int $key
* @var \SplFixedArray $urlSplArr
*/
foreach ($urls as $key => $urlSplArr) {
if (!is_null($urlSplArr)) {
$urlArr = $urlSplArr->toArray();
$url = [];
foreach ($urlArr as $paramIndex => $paramValue) {
switch ($paramIndex) {
case static::URL_PARAM_LOC:
$url['loc'] = $paramValue;
break;
case static::URL_PARAM_CHANGEFREQ:
$url['changefreq'] = $paramValue;
break;
case static::URL_PARAM_LASTMOD:
$url['lastmod'] = $paramValue;
break;
case static::URL_PARAM_PRIORITY:
$url['priority'] = $paramValue;
break;
default:
break;
}
}
$urls[$key] = $url;
}
}
return $urls;
}
public function countUrls()
{
return $this->urls->getSize();
}
}