<?php

namespace wbb\system\cronjob;

use GuzzleHttp\ClientInterface;
use GuzzleHttp\Psr7\Request;
use wbb\data\board\BoardCache;
use wbb\data\board\BoardEditor;
use wbb\data\rss\feed\RssFeed;
use wbb\data\rss\feed\RssFeedEditor;
use wbb\data\rss\feed\RssFeedList;
use wbb\data\thread\ThreadAction;
use wbb\system\label\object\RssFeedLabelObjectHandler;
use wbb\system\label\object\ThreadLabelObjectHandler;
use wcf\data\cronjob\Cronjob;
use wcf\system\cache\runtime\UserProfileRuntimeCache;
use wcf\system\cronjob\AbstractCronjob;
use wcf\system\database\util\PreparedStatementConditionBuilder;
use wcf\system\exception\SystemException;
use wcf\system\html\input\HtmlInputProcessor;
use wcf\system\io\HttpFactory;
use wcf\system\language\LanguageFactory;
use wcf\system\WCF;
use wcf\util\ArrayUtil;
use wcf\util\MessageUtil;
use wcf\util\Url;
use wcf\util\XML;

/**
 * Creates threads from RSS feeds.
 *
 * @author  Marcel Werk
 * @copyright   2001-2019 WoltLab GmbH
 * @license WoltLab License <http://www.woltlab.com/license-agreement.html>
 * @package WoltLabSuite\Forum\System\Cronjob
 */
class RSSFeedReaderCronjob extends AbstractCronjob
{
    /**
     * Maximum number of seconds a request to download a RSS feed may take.
     *
     * Passed to `HttpFactory::makeClientWithTimeout()` when the HTTP client is created.
     */
    private const DOWNLOAD_TIMEOUT = 10;

    /**
     * HTTP client used to download the feeds. It stays unset until `getHttpClient()`
     * is called for the first time and is reused for all subsequent requests.
     *
     * @var ClientInterface|null
     */
    private $httpClient;

    /**
     * Imports the new items of all due feeds as threads.
     *
     * A feed is due when it is not disabled and `lastRun + cycleTime` lies in the past.
     * Feeds without a board or without a user are skipped entirely (their `lastRun` is
     * not touched). For every item returned by `getFeedData()` a thread is created, the
     * labels assigned to the feed are set on the thread and the hash of the item is
     * written to the feed log. Exceptions raised while processing a feed are logged and
     * their message is stored in the `errorMessage` column of the feed; `lastRun` is
     * updated regardless of the outcome.
     *
     * @inheritDoc
     */
    public function execute(Cronjob $cronjob)
    {
        parent::execute($cronjob);

        $sql = "INSERT INTO wbb" . WCF_N . "_rss_feed_log
                            (feedID, hash, threadID)
                VALUES      (?, ?, ?)";
        $logStatement = WCF::getDB()->prepareStatement($sql);

        // get feeds
        $feedList = new RssFeedList();
        $feedList->getConditionBuilder()->add('isDisabled = ?', [0]);
        $feedList->getConditionBuilder()->add('lastRun + cycleTime < ?', [TIME_NOW]);
        $feedList->readObjects();

        // labels per feed id; stays empty if there is no feed to process
        $feedLabels = [];
        if (\count($feedList)) {
            $feedLabels = RssFeedLabelObjectHandler::getInstance()->getAssignedLabels($feedList->getObjectIDs(), false);
        }

        foreach ($feedList as $feed) {
            if (!$feed->boardID || !$feed->userID) {
                continue;
            }
            $errorMessage = '';

            try {
                $data = $this->getFeedData($feed);
                if (!empty($data)) {
                    $user = UserProfileRuntimeCache::getInstance()->getObject($feed->userID);

                    // get language, falling back to the default language
                    if ($feed->languageID) {
                        $language = LanguageFactory::getInstance()
                            ->getLanguage($feed->languageID);
                    } else {
                        $language = LanguageFactory::getInstance()
                            ->getLanguage(LanguageFactory::getInstance()->getDefaultLanguageID());
                    }

                    // get tags
                    $feedTags = [];
                    if (MODULE_TAGGING && WBB_THREAD_ENABLE_TAGS && $feed->threadTags) {
                        $feedTags = \array_unique(ArrayUtil::trim(\explode(',', $feed->threadTags)));
                    }

                    $parsedFeedURL = Url::parse($feed->url);
                    foreach ($data as $item) {
                        $postData = [];
                        if (WBB_OFFICIAL_POST_DEFAULT && $user->getPermission('mod.board.canMarkPostOfficial')) {
                            $postData['isOfficial'] = 1;
                        }

                        // fix relative links
                        if (!empty($item['link'])) {
                            $item['link'] = $this->resolveItemLink($item['link'], $parsedFeedURL);
                        }

                        $tags = $feedTags;
                        if ($feed->useCategoriesAsTags && !empty($item['categories'])) {
                            $tags = \array_unique(\array_merge($tags, $item['categories']));
                        }

                        $htmlInputProcessor = new HtmlInputProcessor();
                        $htmlInputProcessor->process($language->getDynamicVariable('wbb.acp.rssFeed.template', [
                            'author' => $feed->title,
                            'link' => MessageUtil::stripCrap($item['link']),
                            'description' => MessageUtil::stripCrap($item['description']),
                        ]), 'com.woltlab.wbb.post');

                        // create threads
                        $action = new ThreadAction([], 'create', [
                            'data' => [
                                'boardID' => $feed->boardID,
                                'languageID' => $feed->languageID,
                                'topic' => \mb_substr(MessageUtil::stripCrap($item['title']), 0, 255),
                                'time' => $item['time'],
                                'userID' => $feed->userID,
                                'username' => $user->username,
                                'isClosed' => $feed->closeThread,
                                'isDisabled' => $feed->disableThread,
                                'hasLabels' => !empty($feedLabels[$feed->feedID]) ? 1 : 0,
                            ],
                            'postData' => $postData,
                            'tags' => $tags,
                            'subscribeThread' => false,
                            'htmlInputProcessor' => $htmlInputProcessor,
                        ]);
                        $resultValues = $action->executeAction();

                        // set labels
                        if (isset($feedLabels[$feed->feedID])) {
                            ThreadLabelObjectHandler::getInstance()->setLabels(
                                \array_keys($feedLabels[$feed->feedID]),
                                $resultValues['returnValues']->threadID
                            );
                        }

                        // create log entry, `filterData()` uses it to skip the item on later runs
                        $logStatement->execute([$feed->feedID, $item['hash'], $resultValues['returnValues']->threadID]);
                    }

                    // update last post
                    $boardEditor = new BoardEditor(BoardCache::getInstance()->getBoard($feed->boardID));
                    $boardEditor->updateLastPost();
                }
            } catch (\Exception $e) {
                // a broken feed must not prevent the remaining feeds from being processed
                \wcf\functions\exception\logThrowable($e);
                $errorMessage = $e->getMessage();
            }

            // update last run
            $editor = new RssFeedEditor($feed);
            $editor->update([
                'lastRun' => TIME_NOW,
                'errorMessage' => $errorMessage,
            ]);
        }
    }

    /**
     * Turns the link of a feed item into an absolute URL based on the URL of the feed.
     *
     * Links that already start with `http://` or `https://` are returned unchanged.
     * Protocol-relative links (`//host/path`) only receive the scheme of the feed,
     * links starting with a slash are resolved against scheme, host and port of the
     * feed, and all other links are resolved against the directory of the feed.
     *
     * @param   string      $link           non-empty link as given in the feed
     * @param   Url|mixed[] $parsedFeedURL  components of the feed URL as returned by `Url::parse()`
     * @return  string
     */
    private function resolveItemLink($link, $parsedFeedURL)
    {
        if ($link === '' || \preg_match('~^https?://~i', $link)) {
            return $link;
        }

        // protocol-relative link: only the scheme is missing
        if (\strpos($link, '//') === 0) {
            return $parsedFeedURL['scheme'] . ':' . $link;
        }

        $origin = $parsedFeedURL['scheme'] . '://' . $parsedFeedURL['host'];
        if (!empty($parsedFeedURL['port'])) {
            $origin .= ':' . $parsedFeedURL['port'];
        }

        if ($link[0] !== '/') {
            // path-relative link: prepend the directory the feed itself is located in
            $directory = '/';
            if (!empty($parsedFeedURL['path'])) {
                $lastSlash = \strrpos($parsedFeedURL['path'], '/');
                if ($lastSlash !== false) {
                    $directory = \substr($parsedFeedURL['path'], 0, $lastSlash + 1);
                }
            }

            $link = $directory . $link;
        }

        return $origin . $link;
    }

    /**
     * Reads an atom feed.
     *
     * Every `entry` element is turned into an array with the keys `title`, `link`,
     * `description`, `time`, `hash` and `categories`. Entries are skipped if they lack a
     * title, id, link or content/summary, if they are dated in the future or if the feed
     * defines search keywords and none of them occurs in title or description.
     * Reading stops once `maxResults` entries have been accepted (if the feed sets it).
     *
     * @see http://www.ietf.org/rfc/rfc4287.txt
     *
     * @param   RssFeed     $feed
     * @param   \DOMXPath   $xpath
     * @return  array[]     entry data indexed by the SHA-1 hash of the entry id
     */
    protected function readAtomFeed(RssFeed $feed, \DOMXPath $xpath)
    {
        // get search keywords (lower-cased, separated by comma or semicolon)
        $keywords = [];
        if ($feed->searchKeywords) {
            $keywords = \array_unique(ArrayUtil::trim(\preg_split('/[,;]/', \mb_strtolower($feed->searchKeywords))));
        }

        // get items
        // NOTE(review): relies on the `ns` prefix being registered on $xpath for the
        // namespace of the feed - presumably done by the XML helper; confirm.
        $items = $xpath->query('//ns:entry');
        $data = [];
        $i = 0;

        foreach ($items as $item) {
            $childNodes = $xpath->query('child::*', $item);
            $itemData = [];
            $isYoutube = false;
            foreach ($childNodes as $childNode) {
                if ($childNode->nodeName == 'category') {
                    // categories are given by the `term` attribute and may occur multiple times
                    if ($childNode->attributes->getNamedItem('term')) {
                        if (!isset($itemData['categories'])) {
                            $itemData['categories'] = [];
                        }

                        $itemData['categories'][] = $childNode->attributes->getNamedItem('term')->nodeValue;
                    }
                } elseif ($childNode->nodeName == 'media:group') {
                    // a `media:description` inside a `media:group` is treated as the content of a YouTube entry
                    foreach ($xpath->query('child::*', $childNode) as $mediaProperty) {
                        if ($mediaProperty->nodeName == 'media:description') {
                            $isYoutube = true;
                            $itemData['content'] = $mediaProperty->nodeValue;
                        }
                    }
                } else {
                    if ($childNode->nodeName == 'link') {
                        // only the first link with a `href` that is (implicitly) `alternate` is used
                        if (!isset($itemData[$childNode->nodeName]) && $childNode->attributes->getNamedItem('href')) {
                            $rel = $childNode->attributes->getNamedItem('rel');

                            // default rel-value is alternate
                            if (($rel && $rel->nodeValue == 'alternate') || $rel === null) {
                                $itemData[$childNode->nodeName] = $childNode
                                    ->attributes
                                    ->getNamedItem('href')
                                    ->nodeValue;
                            }
                        }
                    } else {
                        $itemData[$childNode->nodeName] = $childNode->nodeValue;
                    }
                }
            }

            // prepend the link to the description of YouTube entries
            if ($isYoutube && !empty($itemData['link']) && !empty($itemData['content'])) {
                $itemData['content'] = \nl2br($itemData['link'] . "\n\n" . $itemData['content']);
            }

            // validate item data
            if (
                empty($itemData['title'])
                || empty($itemData['id'])
                || empty($itemData['link'])
                || (empty($itemData['content']) && empty($itemData['summary']))
            ) {
                continue;
            }

            $hash = \sha1($itemData['id']);

            // prefer the publication date and fall back to the date of the last update
            $time = false;
            if (isset($itemData['published'])) {
                $time = \strtotime($itemData['published']);
            } elseif (isset($itemData['updated'])) {
                $time = \strtotime($itemData['updated']);
            }

            // skip entries that are dated in the future
            if ($time > TIME_NOW) {
                continue;
            }

            // either strtotime returned false or the entry did not include a date
            if ($time === false) {
                $time = TIME_NOW;
            }

            if (!empty($itemData['content'])) {
                $description = $itemData['content'];
            } else {
                $description = $itemData['summary'];
            }

            // check search words, at least one keyword has to match
            if (!empty($keywords)) {
                $haystack = \mb_strtolower($itemData['title'] . $description);
                $skip = true;
                foreach ($keywords as $keyword) {
                    if (\mb_strpos($haystack, $keyword) !== false) {
                        $skip = false;
                        break;
                    }
                }
                if ($skip) {
                    continue;
                }
            }

            // get data
            $data[$hash] = [
                'title' => $itemData['title'],
                'link' => $itemData['link'],
                'description' => $description,
                'time' => $time,
                'hash' => $hash,
                'categories' => !empty($itemData['categories']) ? $itemData['categories'] : [],
            ];

            // check max results
            $i++;
            if ($feed->maxResults && $i == $feed->maxResults) {
                break;
            }
        }

        return $data;
    }

    /**
     * Extracts the items of a RSS document.
     *
     * Each `channel/item` element yields an array with the keys `title`, `link`,
     * `description`, `time`, `hash` and `categories`. Items without a title or without
     * any description, items dated in the future and items matching none of the search
     * keywords of the feed are left out. Processing ends as soon as `maxResults` items
     * have been accepted, provided the feed defines such a limit.
     *
     * @see http://validator.w3.org/feed/docs/rss2.html
     *
     * @param   RssFeed     $feed
     * @param   \DOMXPath   $xpath
     * @return  array[]     item data indexed by the hash of the item
     */
    protected function readRssFeed(RssFeed $feed, \DOMXPath $xpath)
    {
        // lower-cased words of which at least one has to occur in an item
        $searchWords = [];
        if ($feed->searchKeywords) {
            $rawWords = \preg_split('/[,;]/', \mb_strtolower($feed->searchKeywords));
            $searchWords = \array_unique(ArrayUtil::trim($rawWords));
        }

        $result = [];
        $acceptedItems = 0;
        foreach ($xpath->query('//channel/item') as $itemNode) {
            // collect the child elements by name, `category` may occur more than once
            $fields = [];
            foreach ($xpath->query('child::*', $itemNode) as $node) {
                if ($node->nodeName == 'category') {
                    $fields['categories'] = $fields['categories'] ?? [];
                    $fields['categories'][] = $node->nodeValue;

                    continue;
                }

                $fields[$node->nodeName] = $node->nodeValue;
            }

            // a title and some kind of text are mandatory
            $hasText = !empty($fields['description']) || !empty($fields['content:encoded']);
            if (empty($fields['title']) || !$hasText) {
                continue;
            }

            // the encoded content takes precedence over the plain description
            $text = !empty($fields['content:encoded']) ? $fields['content:encoded'] : $fields['description'];

            // identify the item by its guid, its link or - as a last resort - its content
            if (!empty($fields['guid'])) {
                $identifier = $fields['guid'];
            } elseif (!empty($fields['link'])) {
                $identifier = $fields['link'];
            } else {
                $identifier = $fields['title'] . $text;
            }
            $itemHash = \sha1($identifier);

            // items published in the future are ignored
            $timestamp = isset($fields['pubDate']) ? \strtotime($fields['pubDate']) : false;
            if ($timestamp > TIME_NOW) {
                continue;
            }

            // use the current time if the date is missing or could not be parsed
            $timestamp = $timestamp ?: TIME_NOW;

            if ($searchWords) {
                $haystack = \mb_strtolower($fields['title'] . $text);
                $matched = false;
                foreach ($searchWords as $word) {
                    if (\mb_strpos($haystack, $word) !== false) {
                        $matched = true;
                        break;
                    }
                }

                if (!$matched) {
                    continue;
                }
            }

            $result[$itemHash] = [
                'title' => $fields['title'],
                'link' => empty($fields['link']) ? '' : $fields['link'],
                'description' => $text,
                'time' => $timestamp,
                'hash' => $itemHash,
                'categories' => empty($fields['categories']) ? [] : $fields['categories'],
            ];

            // stop once the configured number of items has been reached
            ++$acceptedItems;
            if ($feed->maxResults && $acceptedItems == $feed->maxResults) {
                break;
            }
        }

        return $result;
    }

    /**
     * Downloads the given feed and returns those of its items that are not yet
     * recorded in the feed log.
     *
     * @param   RssFeed     $feed
     * @return  array[]     item data indexed by hash; empty if the document has no root element
     * @throws  SystemException     if the root element is neither `feed` nor `rss`
     */
    private function getFeedData(RssFeed $feed): array
    {
        $response = $this->getHttpClient()->send(new Request('GET', $feed->url));
        $body = (string)$response->getBody();

        $document = new XML();
        $document->loadXML($feed->url, $body);
        $xpath = $document->xpath();

        $root = $xpath->query('/*')->item(0);
        if ($root === null) {
            return [];
        }

        // the name of the root element tells Atom and RSS apart
        switch ($root->nodeName) {
            case 'feed':
                $items = $this->readAtomFeed($feed, $xpath);
                break;

            case 'rss':
                $items = $this->readRssFeed($feed, $xpath);
                break;

            default:
                throw new SystemException("XML document '" . $feed->url . "' is not a valid RSS or Atom feed.");
        }

        // the lookup of known items is only needed if there is anything to look up
        return empty($items) ? $items : $this->filterData($feed, $items);
    }

    /**
     * Removes all items from the given list whose hash is already stored in the
     * log of the given feed.
     *
     * @param   RssFeed     $feed
     * @param   array[]     $data   item data indexed by hash
     * @return  array[]     remaining item data, still indexed by hash
     */
    private function filterData(RssFeed $feed, $data): array
    {
        $conditions = new PreparedStatementConditionBuilder();
        $conditions->add('feedID = ?', [$feed->feedID]);
        $conditions->add('hash IN (?)', [\array_keys($data)]);

        $sql = "SELECT  hash
                FROM    wbb" . WCF_N . "_rss_feed_log
                " . $conditions;
        $statement = WCF::getDB()->prepareStatement($sql);
        $statement->execute($conditions->getParameters());

        // gather the hashes that were imported before ...
        $knownHashes = [];
        while ($row = $statement->fetchArray()) {
            $knownHashes[$row['hash']] = true;
        }

        // ... and keep only the items that are not among them
        return \array_diff_key($data, $knownHashes);
    }

    /**
     * Returns the HTTP client used for downloading feeds. The client is created
     * with the download timeout on the first call and reused afterwards.
     */
    private function getHttpClient(): ClientInterface
    {
        if ($this->httpClient) {
            return $this->httpClient;
        }

        $client = HttpFactory::makeClientWithTimeout(self::DOWNLOAD_TIMEOUT);
        $this->httpClient = $client;

        return $this->httpClient;
    }
}
