1
0
mirror of synced 2026-05-22 21:33:16 +00:00

rework the search indexing process

- reconfigure the docsearch scraper to store component and version for each result
- switch from the Docker image to a reusable GitHub Action
- add publish-docsearch-config extension to transform Handlebars into YAML
This commit is contained in:
Dan Allen
2022-12-21 12:57:51 -07:00
committed by Marcus Hert Da Coregio
parent 2fb91e6266
commit 409bd29abd
8 changed files with 159 additions and 47 deletions
-20
View File
@@ -1,20 +0,0 @@
{
"index_name": "security-docs",
"start_urls": [
"https://docs.spring.io/spring-security/reference/"
],
"selectors": {
"lvl0": {
"selector": "//nav[@class='crumbs']//li[@class='crumb'][last()-1]",
"type": "xpath",
"global": true,
"default_value": "Home"
},
"lvl1": ".doc h1",
"lvl2": ".doc h2",
"lvl3": ".doc h3",
"lvl4": ".doc h4",
"text": ".doc p, .doc td.content, .doc th.tableblock"
}
}
@@ -1,21 +0,0 @@
#!/bin/bash
###
# Runs the algolia/docsearch-scraper Docker image against a JSON config file.
#
# Arguments:
#   $1  Algolia application id (passed to the container as APPLICATION_ID)
#   $2  Algolia API key        (passed to the container as API_KEY)
#   $3  path to the JSON config file (passed, compacted, as CONFIG)
#
# Exits with status 1 if the argument count is wrong, the config file is not
# readable, or jq cannot parse it; otherwise exits with docker's status.
#
# Docs
# config.json https://docsearch.algolia.com/docs/config-file
# Run the crawler https://docsearch.algolia.com/docs/run-your-own/#run-the-crawl-from-the-docker-image
###

### USAGE
if [ "$#" -ne 3 ]; then
  echo -e "expected exactly 3 arguments USAGE:\n\n$0 \$ALGOLIA_APPLICATION_ID \$ALGOLIA_API_KEY \$CONFIG_FILE\n\n" >&2
  exit 1
fi

# Script Parameters
APPLICATION_ID=$1
API_KEY=$2
CONFIG_FILE=$3

#### Script
# Fail early instead of starting the container with an empty CONFIG.
if [ ! -r "$CONFIG_FILE" ]; then
  echo "config file not found or not readable: $CONFIG_FILE" >&2
  exit 1
fi

# Serialize the config onto a single line; quoting keeps paths with spaces intact.
if ! CONFIG=$(jq -r tostring "$CONFIG_FILE"); then
  echo "failed to parse config file as JSON: $CONFIG_FILE" >&2
  exit 1
fi

docker run -e "APPLICATION_ID=$APPLICATION_ID" -e "API_KEY=$API_KEY" -e "CONFIG=$CONFIG" algolia/docsearch-scraper
+67
View File
@@ -0,0 +1,67 @@
{{!--
  Handlebars template that renders a JSON configuration object (index name,
  start URLs, stop URLs, selectors and custom index settings).
  Presumably consumed by the DocSearch scraper; confirm against the workflow.
  The template reads `site.url`, `components` (each with `versions` and
  `latest`) and `stopPages` from its context.
--}}
{
"index_name": "spring-security-docs",
"start_urls": [
{{!-- One entry per version of every component. --}}
{{#each components}}
{{#each versions}}
{
{{!-- An empty activeVersionSegment yields a regex alternation after the site URL; otherwise the segment plus a trailing slash. --}}
"url": "{{{@root.site.url}}}/{{#if (eq ./activeVersionSegment '')}}(?:$|index.html$|[a-z].*){{else}}{{{./activeVersionSegment}}}/{{/if}}",
"extra_attributes": {
{{!-- The component named ROOT is reported as spring-security. --}}
"component": "{{#if (eq ./name 'ROOT')}}spring-security{{else}}{{{./name}}}{{/if}}",
"version": "{{{./version}}}",
{{!-- 1 when this version is the enclosing component's `latest`, 2 otherwise. --}}
"version_rank": {{#if (eq this ../latest)}}1{{else}}2{{/if}}
}
{{!-- Emit a separator after every entry except the last version of the last component, so the array has no trailing comma. --}}
}{{#unless (and @last @../last)}},{{/unless}}
{{/each}}
{{/each}}
],
"sitemap_urls": [
"{{{site.url}}}/sitemap.xml"
],
"scrape_start_urls": true,
"stop_urls": [
{{!-- One entry per stop page: the site URL followed by the page's pub.url. --}}
{{#each stopPages}}
"{{{@root.site.url}}}{{{./pub.url}}}"{{#unless @last}},{{/unless}}
{{/each}}
],
"selectors": {
"default": {
"lvl0": {
"global": true,
"selector": ".nav-panel-explore .context .title, .nav-panel-explore .context .version"
},
"lvl1": ".doc > h1.page",
"lvl2": ".doc .sect1 > h2:first-child",
"lvl3": ".doc .sect2 > h3:first-child",
"lvl4": ".doc .sect3 > h4:first-child",
"text": ".doc p, .doc dt, .doc td.content, .doc th.tableblock"
}
},
"selectors_exclude": [
"#section-summary"
],
"min_indexed_level": 1,
"custom_settings": {
"advancedSyntax": true,
{{!-- component and version are the extra_attributes set on each start URL above. --}}
"attributesForFaceting": [
"component",
"version"
],
"attributesToRetrieve": [
"anchor",
"content",
"hierarchy",
"url",
"component",
"version"
],
"attributesToSnippet": [
"content:25"
],
{{!-- version_rank sorts ascending, so rank 1 (latest) is ordered ahead of rank 2. --}}
"customRanking": [
"desc(weight.page_rank)",
"asc(version_rank)",
"desc(weight.level)",
"asc(weight.position)"
]
}
}
+16 -3
View File
@@ -8,8 +8,21 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v2
uses: actions/checkout@v3
with:
fetch-depth: 5
- name: Run Docsearch Scraper
run: $GITHUB_WORKSPACE/.github/actions/algolia-docsearch-scraper.sh "${{ secrets.ALGOLIA_APPLICATION_ID }}" "${{ secrets.ALGOLIA_WRITE_API_KEY }}" $GITHUB_WORKSPACE/.github/actions/algolia-config.json
- name: Configure Indexer
run: |
CONFIG_FILE=.github/actions/docsearch-config.json
if [ ! -f $CONFIG_FILE ]; then
curl -sL -o $CONFIG_FILE $(node -p "require('fs').readFileSync('antora-playbook.yml', 'utf8').match(/^ url: (.*)/m)[1]")/docsearch-config.json
fi
INDEX_NAME=$(node -p "JSON.parse(require('fs').readFileSync('$CONFIG_FILE')).index_name")
echo "CONFIG_FILE=${CONFIG_FILE}" >> $GITHUB_ENV
echo "INDEX_NAME_TMP=${INDEX_NAME}-${GITHUB_RUN_ID}" >> $GITHUB_ENV
- name: Run Indexer
uses: darrenjennings/algolia-docsearch-action@master
with:
algolia_application_id: ${{ secrets.ALGOLIA_APP_ID }}
algolia_api_key: ${{ secrets.ALGOLIA_API_KEY }}
file: ${{ env.CONFIG_FILE }}