From 51e5adeabd6d63f5a4de781d4f1b8dcb01e96bbf Mon Sep 17 00:00:00 2001 From: Quentin BOUTEILLER Date: Fri, 27 Jun 2025 21:18:28 +0200 Subject: [PATCH] [CentreFranceBridge] Fix parser following website update --- bridges/CentreFranceBridge.php | 36 ++++++++++++++++------------------ 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/bridges/CentreFranceBridge.php b/bridges/CentreFranceBridge.php index 1abfae98..12237351 100644 --- a/bridges/CentreFranceBridge.php +++ b/bridges/CentreFranceBridge.php @@ -72,15 +72,9 @@ class CentreFranceBridge extends BridgeAbstract $newspaperUrl = 'https://www.' . $this->getInput('newspaper') . '/' . $localitySlug . '/'; $html = getSimpleHTMLDOM($newspaperUrl); - // Articles are detected through their titles - foreach ($html->find('.c-titre') as $articleTitleDOMElement) { - $articleLinkDOMElement = $articleTitleDOMElement->find('a', 0); - - // Ignore articles in the « Les + partagés » block - if (strpos($articleLinkDOMElement->id, 'les_plus_partages') !== false) { - continue; - } - + // Articles are detected through a standard tag + foreach ($html->find('article') as $articleDOMElement) { + $articleLinkDOMElement = $articleDOMElement->find('a', 0); $articleURI = $articleLinkDOMElement->href; // If the URI has already been processed, ignore it @@ -96,7 +90,7 @@ class CentreFranceBridge extends BridgeAbstract $articleTitle = ''; // If article is reserved for subscribers - if ($articleLinkDOMElement->find('span.premium-picto', 0)) { + if ($articleLinkDOMElement->find('span.premium-icon', 0)) { if ($this->getInput('remove-reserved-for-subscribers-articles') === true) { continue; } @@ -104,18 +98,22 @@ class CentreFranceBridge extends BridgeAbstract $articleTitle .= '🔒 '; } - $articleTitleDOMElement = $articleLinkDOMElement->find('span[data-tb-title]', 0); - if ($articleTitleDOMElement === null) { - continue; - } - if ($limit > 0 && count($this->items) === $limit) { break; } - $articleTitle .= $articleLinkDOMElement->find('span[data-tb-title]', 0)->innertext; - $articleFullURI = urljoin('https://www.' . $this->getInput('newspaper') . '/', $articleURI); + // Loop through each possible title class name + for ($i = 1; $i <= 3; $i++) { + $articleTitleDOMElement = $articleLinkDOMElement->find('.typo-card-title-' . $i, 0); + if (!$articleTitleDOMElement instanceof \simple_html_dom_node) { + continue; + } + $articleTitle .= $articleTitleDOMElement->innertext; + break; + } + + $articleFullURI = urljoin('https://www.' . $this->getInput('newspaper') . '/', $articleURI); $item = [ 'title' => $articleTitle, 'uri' => $articleFullURI, @@ -184,7 +182,7 @@ class CentreFranceBridge extends BridgeAbstract $articleTags = $html->find('#content>div.flex+div.grid section>.bg-gray-light>a.border-gray-dark'); if (is_array($articleTags)) { - $item['categories'] = array_map(static fn ($articleTag) => $articleTag->innertext, $articleTags); + $item['categories'] = array_map(static fn ($articleTag) => html_entity_decode($articleTag->innertext), $articleTags); } $explode = explode('_', $uri); @@ -196,7 +194,7 @@ class CentreFranceBridge extends BridgeAbstract } // If the article is a "grand format", we use another parsing strategy - if ($item['content'] === '' && $html->find('article') !== []) { + if (($item['content'] ?? '') === '' && $html->find('article') !== []) { $articleContent = $html->find('article > section'); foreach ($articleContent as $contentPart) { if ($contentPart->find('#journo') !== []) {