Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sitemap templates #149

Merged
merged 9 commits into from
Jan 10, 2025
3 changes: 3 additions & 0 deletions src/_locales/en/messages.json
Original file line number Diff line number Diff line change
Expand Up @@ -378,5 +378,8 @@
},
"popup_ws_version": {
"message": "Web Scraper version: "
},
"sitemap_template_create_nav_button": {
"message": "Load news template"
}
}
3 changes: 3 additions & 0 deletions src/_locales/ru/messages.json
Original file line number Diff line number Diff line change
Expand Up @@ -412,5 +412,8 @@
},
"popup_ws_version": {
"message": "Версия Web Scraper: "
},
"sitemap_template_create_nav_button": {
"message": "Загрузить новостной шаблон"
}
}
6 changes: 6 additions & 0 deletions src/devtools/views/Viewport.html
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,12 @@
data-i18n="create_sitemap_import_nav_button"
></a>
</li>
<li>
<a
id="sitemap-template-create-nav-button"
data-i18n="sitemap_template_create_nav_button"
></a>
</li>
</ul>
</li>
</ul>
Expand Down
9 changes: 9 additions & 0 deletions src/libs/urlToSitemapName.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
/**
 * Derives a sitemap name from the hostname of a URL.
 *
 * A leading "www." is dropped and every dot in the hostname is replaced with
 * an underscore, e.g. "https://www.example.com/news" -> "example_com".
 * Path, query string and port are ignored.
 *
 * @param {string} url - Absolute URL to derive the name from.
 * @returns {string} The derived name, or an empty string when `url` cannot
 *   be parsed as a URL (the parse error is logged via console.error).
 */
export default function urlToSitemapName(url) {
  try {
    const hostname = new URL(url).hostname.replace(/^www\./, '');
    // A parsed hostname never contains '/', so only the dots need replacing.
    return hostname.replace(/\./g, '_');
  } catch (e) {
    console.error('invalid_URL:', e);
    return '';
  }
}
18 changes: 18 additions & 0 deletions src/scripts/Controller.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,10 @@ import SelectorList from './SelectorList';
import SelectorTable from './Selector/SelectorTable';
import Model from './Model';
import Translator from './Translator';
import urlToSitemapName from '../libs/urlToSitemapName';

// A sitemap id must start with a lowercase latin letter and be followed by at
// least one more character, each of which is a lowercase letter, a digit, or
// one of: _ $ ( ) + -
export const SITEMAP_ID_REGEXP = /^[a-z][a-z0-9_\$\(\)\+\-]+$/;
// Sitemap definition loaded from the JSON file under sitemaps_templates.
const sitemapTemplate = require('../sitemaps_templates/sitemapTemplate.json');

export default class SitemapController {
constructor(store, templateDir) {
Expand Down Expand Up @@ -190,6 +192,9 @@ export default class SitemapController {
'#create-sitemap-import-nav-button': {
click: this.showImportSitemapPanel,
},
'#sitemap-template-create-nav-button': {
click: this.showTemplateSitemapPanel,
},
'#sitemap-export-nav-button': {
click: this.showSitemapExportPanel,
},
Expand Down Expand Up @@ -700,6 +705,19 @@ export default class SitemapController {
return true;
}

showTemplateSitemapPanel() {
this.showImportSitemapPanel();
chrome.tabs.query({ active: true, currentWindow: true }, tabs => {
const currentTab = tabs[0];
if (currentTab && currentTab.url) {
document.getElementById('edit_sitemap_id').value = urlToSitemapName(currentTab.url);
}
sitemapTemplate.startUrls = [currentTab.url];
$('#sitemapJSON').text(JSON.stringify(sitemapTemplate));
});
return true;
}

showSitemapExportPanel() {
this.setActiveNavigationButton('sitemap-export');
const sitemap = this.state.currentSitemap;
Expand Down
66 changes: 66 additions & 0 deletions src/sitemaps_templates/sitemapTemplate.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
{
"rootSelector": {
"id": "_root",
"uuid": "0"
},
"_id": "example",
"startUrls": ["https://www.vesti.ru/"],
"selectors": [
{
"id": "news_link",
"selector": "h3 a",
"type": "SelectorLink",
"multiple": true,
"extractAttribute": "href",
"parentSelectors": ["0"],
"uuid": "1"
},
{
"parentSelectors": ["1"],
"type": "SelectorText",
"uuid": "2",
"id": "title",
"selector": "h1"
},
{
"parentSelectors": ["1"],
"type": "SelectorText",
"multiple": true,
"uuid": "3",
"id": "text",
"selector": "p"
},
{
"id": "main_photo",
"selector": "meta[property=\"og:image\"]",
"type": "SelectorElementAttribute",
"extractAttribute": "content",
"parentSelectors": ["1"],
"uuid": "4"
},
{
"id": "tags",
"selector": "a.tags",
"type": "SelectorText",
"parentSelectors": ["1"],
"uuid": "5"
},
{
"id": "author",
"selector": "html",
"type": "SelectorElement",
"multiple": true,
"parentSelectors": ["1"],
"mergeIntoList": true,
"uuid": "6"
},
{
"id": "publication_date",
"selector": "div.article__date",
"type": "SelectorText",
"parentSelectors": ["1"],
"uuid": "7"
}
],
"sitemapSpecificationVersion": 1
}