Index: typo3/sysext/indexed_search/class.indexer.php =================================================================== --- typo3/sysext/indexed_search/class.indexer.php (revision 331) +++ typo3/sysext/indexed_search/class.indexer.php (working copy) @@ -876,9 +876,36 @@ return $list; } + /** + * Extracts the "base href" from content string. + * + * @param string Content to analyze + * @return string The base href or an empty string if not found + */ + public function extractBaseHref($string) { + if (!is_object($this->htmlParser)) { + $this->htmlParser = t3lib_div::makeInstance('t3lib_parseHtml'); + } + $parts = $this->htmlParser->splitTags('base', $string); + foreach ($parts as $k => $v) { + if ($k % 2) { + $params = $this->htmlParser->get_tag_attributes($v, 1); + $firstTagName = $this->htmlParser->getFirstTagName($v); // The 'name' of the first tag + switch (strtolower($firstTagName)) { + case 'base': + $href = $params[0]['href']; + if ($href) { + // Return the first "base href" found (a single one should be present anyway) + return $href; + } + } + } + } + return ''; + } Index: class.crawler.php =================================================================== --- class.crawler.php (revision 331) +++ class.crawler.php (working copy) @@ -641,6 +641,15 @@ $indexerObj->indexExternalUrl($url); $url_qParts = parse_url($url); + $baseAbsoluteHref = $url_qParts['scheme'] . '://' . $url_qParts['host']; + $baseHref = $indexerObj->extractBaseHref($indexerObj->indexExternalUrl_content); + if (!$baseHref) { + // Extract base href from current URL + $baseHref = $baseAbsoluteHref; + $baseHref .= substr($url_qParts['path'], 0, strrpos($url_qParts['path'], '/')); + } + $baseHref = rtrim($baseHref, '/'); + // Get URLs on this page: $subUrls = array(); $list = $indexerObj->extractHyperLinks($indexerObj->indexExternalUrl_content); @@ -653,7 +662,12 @@ $qParts = parse_url($subUrl); if (!$qParts['scheme']) { - $subUrl = $url_qParts['scheme'].'://'.$url_qParts['host'].'/'.t3lib_div::resolveBackPath($subUrl); + $relativeUrl = t3lib_div::resolveBackPath($subUrl); + if ($relativeUrl{0} === '/') { + $subUrl = $baseAbsoluteHref . $relativeUrl; + } else { + $subUrl = $baseHref . '/' . $relativeUrl; + } } $subUrls[] = $subUrl;