infra: cleanup docs build (#21134)

Refactors the docs build in order to:
- run the same `make build` command in both the Vercel and local builds
- build artifacts incrementally in two distinct steps, instead of building
all docs in place (on Vercel) or in a `_dist` dir (locally)

Highlights:
- introduces `make build` as the single entry point for building the docs
- collects and generates all files for the build in
`docs/build/intermediate`
- renders those Jupyter notebook + markdown files into
`docs/build/output`

The outputs to host now land in `docs/build/output`, which will require
a Vercel settings change.
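
Concretely, the local flow looks roughly like this (a sketch; paths follow the new `docs/Makefile`):

```bash
cd docs
make build                   # same entry point Vercel uses
ls build/intermediate/docs   # step 1: copied sources plus generated files (templates index, feature tables, langserve/langgraph readmes)
ls build/output/docs         # step 2: quarto-rendered notebooks and synced markdown, ready to host
make clean                   # removes build/
```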

Todo:
- [ ] figure out how to point Vercel at the right directory (right now,
deleting and moving the docs dir in `vercel_build.sh` isn't great)
Erick Friis committed cd4c54282a (parent 6fa8626e2f)

@@ -17,16 +17,11 @@ clean: docs_clean api_docs_clean
## docs_build: Build the documentation.
docs_build:
docs/.local_build.sh
cd docs && make build-local
## docs_clean: Clean the documentation build artifacts.
docs_clean:
@if [ -d _dist ]; then \
rm -r _dist; \
echo "Directory _dist has been cleaned."; \
else \
echo "Nothing to clean."; \
fi
cd docs && make clean
## docs_linkcheck: Run linkchecker on the documentation.
docs_linkcheck:

docs/.gitignore

@@ -1,2 +1,3 @@
/.quarto/
src/supabase.d.ts
build

@@ -1,27 +0,0 @@
#!/usr/bin/env bash
set -o errexit
set -o nounset
set -o pipefail
set -o xtrace
SCRIPT_DIR="$(cd "$(dirname "$0")"; pwd)"
cd "${SCRIPT_DIR}"
mkdir -p ../_dist
rsync -ruv --exclude node_modules --exclude api_reference --exclude .venv --exclude .docusaurus . ../_dist
cd ../_dist
poetry run python scripts/model_feat_table.py
cp ../cookbook/README.md src/pages/cookbook.mdx
mkdir -p docs/templates
cp ../templates/docs/INDEX.md docs/templates/index.md
poetry run python scripts/copy_templates.py
wget -q https://raw.githubusercontent.com/langchain-ai/langserve/main/README.md -O docs/langserve.md
wget -q https://raw.githubusercontent.com/langchain-ai/langgraph/main/README.md -O docs/langgraph.md
poetry run quarto render docs
poetry run python scripts/generate_api_reference_links.py --docs_dir docs
yarn
yarn start

@@ -0,0 +1,80 @@
# we build the docs in these stages:
# 1. install quarto and python dependencies
# 2. copy files from "source dir" to "intermediate dir"
# 3. generate files like the model feature tables, etc. in "intermediate dir"
# 4. copy files to their right spots (e.g. langserve readme) in "intermediate dir"
# 5. build the docs from "intermediate dir" to "output dir"
SOURCE_DIR = docs/
INTERMEDIATE_DIR = build/intermediate/docs
OUTPUT_DIR = build/output
OUTPUT_DOCS_DIR = $(OUTPUT_DIR)/docs
PYTHON = .venv/bin/python
QUARTO_CMD ?= quarto
PARTNER_DEPS_LIST := $(shell ls -1 ../libs/partners | grep -vE "airbyte|ibm" | xargs -I {} echo "../libs/partners/{}" | tr '\n' ' ')
PORT ?= 3001
clean:
rm -rf build
install-vercel-deps:
yum -y update
yum install gcc bzip2-devel libffi-devel zlib-devel wget tar gzip rsync -y
wget -q https://github.com/quarto-dev/quarto-cli/releases/download/v1.3.450/quarto-1.3.450-linux-amd64.tar.gz
tar -xzf quarto-1.3.450-linux-amd64.tar.gz
install-py-deps:
python3 -m venv .venv
$(PYTHON) -m pip install --upgrade pip
$(PYTHON) -m pip install --upgrade uv
$(PYTHON) -m uv pip install -r vercel_requirements.txt
$(PYTHON) -m uv pip install --editable $(PARTNER_DEPS_LIST)
generate-files:
mkdir -p $(INTERMEDIATE_DIR)
cp -r $(SOURCE_DIR)/* $(INTERMEDIATE_DIR)
mkdir -p $(INTERMEDIATE_DIR)/templates
cp ../templates/docs/INDEX.md $(INTERMEDIATE_DIR)/templates/index.md
cp ../cookbook/README.md $(INTERMEDIATE_DIR)/cookbook.mdx
$(PYTHON) scripts/model_feat_table.py $(INTERMEDIATE_DIR)
$(PYTHON) scripts/copy_templates.py $(INTERMEDIATE_DIR)
wget -q https://raw.githubusercontent.com/langchain-ai/langserve/main/README.md -O $(INTERMEDIATE_DIR)/langserve.md
$(PYTHON) scripts/resolve_local_links.py $(INTERMEDIATE_DIR)/langserve.md https://github.com/langchain-ai/langserve/tree/main/
wget -q https://raw.githubusercontent.com/langchain-ai/langgraph/main/README.md -O $(INTERMEDIATE_DIR)/langgraph.md
$(PYTHON) scripts/resolve_local_links.py $(INTERMEDIATE_DIR)/langgraph.md https://github.com/langchain-ai/langgraph/tree/main/
$(PYTHON) scripts/generate_api_reference_links.py --docs_dir $(INTERMEDIATE_DIR)
copy-infra:
mkdir -p $(OUTPUT_DIR)
cp -r src $(OUTPUT_DIR)
cp vercel.json $(OUTPUT_DIR)
cp babel.config.js $(OUTPUT_DIR)
cp -r data $(OUTPUT_DIR)
cp docusaurus.config.js $(OUTPUT_DIR)
cp package.json $(OUTPUT_DIR)
cp sidebars.js $(OUTPUT_DIR)
cp -r static $(OUTPUT_DIR)
cp yarn.lock $(OUTPUT_DIR)
quarto-render:
$(QUARTO_CMD) render $(INTERMEDIATE_DIR) --output-dir $(OUTPUT_DOCS_DIR) --no-execute
mv $(OUTPUT_DOCS_DIR)/$(INTERMEDIATE_DIR)/* $(OUTPUT_DOCS_DIR)
rm -rf $(OUTPUT_DOCS_DIR)/build
md-sync:
rsync -avm --include="*/" --include="*.mdx" --include="*.md" --exclude="*" $(INTERMEDIATE_DIR)/ $(OUTPUT_DOCS_DIR)
build: install-py-deps generate-files copy-infra quarto-render md-sync
start:
cd $(OUTPUT_DIR) && yarn && yarn start --port=$(PORT)
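
For orientation, a sketch of how these targets are meant to be driven (the local commands are illustrative; the Vercel sequence mirrors `vercel_build.sh` below):

```bash
# Local build and preview, run from docs/:
make build                # install-py-deps -> generate-files -> copy-infra -> quarto-render -> md-sync
make start PORT=3001      # yarn install + dev server against build/output (PORT defaults to 3001)

# Vercel build, where quarto is not on PATH and is downloaded by install-vercel-deps:
make install-vercel-deps
QUARTO_CMD="./quarto-1.3.450/bin/quarto" make build
```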

@@ -1,76 +0,0 @@
/* eslint-disable prefer-template */
/* eslint-disable no-param-reassign */
// eslint-disable-next-line import/no-extraneous-dependencies
const babel = require("@babel/core");
const path = require("path");
const fs = require("fs");
/**
*
* @param {string|Buffer} content Content of the resource file
* @param {object} [map] SourceMap data consumable by https://github.com/mozilla/source-map
* @param {any} [meta] Meta data, could be anything
*/
async function webpackLoader(content, map, meta) {
const cb = this.async();
if (!this.resourcePath.endsWith(".ts")) {
cb(null, JSON.stringify({ content, imports: [] }), map, meta);
return;
}
try {
const result = await babel.parseAsync(content, {
sourceType: "module",
filename: this.resourcePath,
});
const imports = [];
result.program.body.forEach((node) => {
if (node.type === "ImportDeclaration") {
const source = node.source.value;
if (!source.startsWith("langchain")) {
return;
}
node.specifiers.forEach((specifier) => {
if (specifier.type === "ImportSpecifier") {
const local = specifier.local.name;
const imported = specifier.imported.name;
imports.push({ local, imported, source });
} else {
throw new Error("Unsupported import type");
}
});
}
});
imports.forEach((imp) => {
const { imported, source } = imp;
const moduleName = source.split("/").slice(1).join("_");
const docsPath = path.resolve(__dirname, "docs", "api", moduleName);
const available = fs.readdirSync(docsPath, { withFileTypes: true });
const found = available.find(
(dirent) =>
dirent.isDirectory() &&
fs.existsSync(path.resolve(docsPath, dirent.name, imported + ".md"))
);
if (found) {
imp.docs =
"/" + path.join("docs", "api", moduleName, found.name, imported);
} else {
throw new Error(
`Could not find docs for ${source}.${imported} in docs/api/`
);
}
});
cb(null, JSON.stringify({ content, imports }), map, meta);
} catch (err) {
cb(err);
}
}
module.exports = webpackLoader;

@@ -330,7 +330,7 @@
"id": "da9a9239",
"metadata": {},
"source": [
"For more information refer to [OpenVINO LLM guide](https://docs.openvino.ai/2024/learn-openvino/llm_inference_guide.html) and [OpenVINO Local Pipelines notebook](./openvino.ipynb)."
"For more information refer to [OpenVINO LLM guide](https://docs.openvino.ai/2024/learn-openvino/llm_inference_guide.html) and [OpenVINO Local Pipelines notebook](/docs/integrations/llms/openvino/)."
]
}
],

@@ -67,7 +67,7 @@ from langchain_community.embeddings import QuantizedBgeEmbeddings
### Weight-Only Quantization with ITREX
See a [usage example](../docs/integrations/llms/weight_only_quantization.ipynb).
See a [usage example](/docs/integrations/llms/weight_only_quantization).
## Detail of Configuration Parameters

@@ -2,35 +2,44 @@ import glob
import os
import re
import shutil
import sys
from pathlib import Path
TEMPLATES_DIR = Path(os.path.abspath(__file__)).parents[2] / "templates"
DOCS_TEMPLATES_DIR = Path(os.path.abspath(__file__)).parents[1] / "docs" / "templates"
if __name__ == "__main__":
intermediate_dir = Path(sys.argv[1])
templates_source_dir = Path(os.path.abspath(__file__)).parents[2] / "templates"
templates_intermediate_dir = intermediate_dir / "templates"
readmes = list(glob.glob(str(TEMPLATES_DIR) + "/*/README.md"))
destinations = [readme[len(str(TEMPLATES_DIR)) + 1 : -10] + ".md" for readme in readmes]
for source, destination in zip(readmes, destinations):
full_destination = DOCS_TEMPLATES_DIR / destination
shutil.copyfile(source, full_destination)
with open(full_destination, "r") as f:
content = f.read()
# remove images
content = re.sub("\!\[.*?\]\((.*?)\)", "", content)
with open(full_destination, "w") as f:
f.write(content)
readmes = list(glob.glob(str(templates_source_dir) + "/*/README.md"))
destinations = [
readme[len(str(templates_source_dir)) + 1 : -10] + ".md" for readme in readmes
]
for source, destination in zip(readmes, destinations):
full_destination = templates_intermediate_dir / destination
shutil.copyfile(source, full_destination)
with open(full_destination, "r") as f:
content = f.read()
# remove images
content = re.sub("\!\[.*?\]\((.*?)\)", "", content)
with open(full_destination, "w") as f:
f.write(content)
sidebar_hidden = """---
sidebar_hidden = """---
sidebar_class_name: hidden
---
"""
TEMPLATES_INDEX_DESTINATION = DOCS_TEMPLATES_DIR / "index.md"
with open(TEMPLATES_INDEX_DESTINATION, "r") as f:
content = f.read()
# replace relative links
content = re.sub("\]\(\.\.\/", "](/docs/templates/", content)
# handle index file
templates_index_source = templates_source_dir / "docs" / "INDEX.md"
templates_index_intermediate = templates_intermediate_dir / "index.md"
with open(templates_index_source, "r") as f:
content = f.read()
# replace relative links
content = re.sub("\]\(\.\.\/", "](/docs/templates/", content)
with open(TEMPLATES_INDEX_DESTINATION, "w") as f:
f.write(sidebar_hidden + content)
with open(templates_index_intermediate, "w") as f:
f.write(sidebar_hidden + content)
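
Per the Makefile's `generate-files` target, `copy_templates.py` now receives the intermediate directory as its single positional argument; a hedged sketch of the invocation (the example template name is illustrative):

```bash
# Run from docs/ with the venv created by install-py-deps:
.venv/bin/python scripts/copy_templates.py build/intermediate/docs
# e.g. ../templates/rag-chroma/README.md -> build/intermediate/docs/templates/rag-chroma.md
```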

@@ -25,7 +25,6 @@ _IMPORT_RE = re.compile(
_CURRENT_PATH = Path(__file__).parent.absolute()
# Directory where generated markdown files are stored
_DOCS_DIR = _CURRENT_PATH / "docs"
_JSON_PATH = _CURRENT_PATH / "api_reference" / "guide_imports.json"
def find_files(path):
@@ -55,6 +54,12 @@ def get_args():
default=_DOCS_DIR,
help="Directory where generated markdown files are stored",
)
parser.add_argument(
"--json_path",
type=str,
default=None,
help="Path to store the generated JSON file",
)
return parser.parse_args()
@@ -83,9 +88,11 @@ def main():
global_imports[class_name][doc_title] = doc_url
# Write the global imports information to a JSON file
_JSON_PATH.parent.mkdir(parents=True, exist_ok=True)
with _JSON_PATH.open("w") as f:
json.dump(global_imports, f)
if args.json_path:
json_path = Path(args.json_path)
json_path.parent.mkdir(parents=True, exist_ok=True)
with json_path.open("w") as f:
json.dump(global_imports, f)
def _get_doc_title(data: str, file_name: str) -> str:
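
The guide_imports.json dump is now opt-in via `--json_path` rather than always written next to the script; a sketch of both modes (the JSON output path is an assumption, since `generate-files` only passes `--docs_dir`):

```bash
# As called from the Makefile: rewrite markdown links only, no JSON written.
.venv/bin/python scripts/generate_api_reference_links.py --docs_dir build/intermediate/docs

# Optionally also emit the imports JSON (output location is illustrative):
.venv/bin/python scripts/generate_api_reference_links.py \
    --docs_dir build/intermediate/docs \
    --json_path build/api_reference/guide_imports.json
```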

@@ -1,11 +1,11 @@
import os
import sys
from pathlib import Path
from langchain_community import chat_models, llms
from langchain_core.language_models.chat_models import BaseChatModel, SimpleChatModel
from langchain_core.language_models.llms import LLM, BaseLLM
INTEGRATIONS_DIR = Path(os.path.abspath(__file__)).parents[1] / "docs" / "integrations"
LLM_IGNORE = ("FakeListLLM", "OpenAIChat", "PromptLayerOpenAIChat")
LLM_FEAT_TABLE_CORRECTION = {
"TextGen": {"_astream": False, "_agenerate": False},
@@ -218,9 +218,17 @@ def get_chat_model_table() -> str:
if __name__ == "__main__":
output_dir = Path(sys.argv[1])
output_integrations_dir = output_dir / "integrations"
output_integrations_dir_llms = output_integrations_dir / "llms"
output_integrations_dir_chat = output_integrations_dir / "chat"
output_integrations_dir_llms.mkdir(parents=True, exist_ok=True)
output_integrations_dir_chat.mkdir(parents=True, exist_ok=True)
llm_page = LLM_TEMPLATE.format(table=get_llm_table())
with open(INTEGRATIONS_DIR / "llms" / "index.mdx", "w") as f:
with open(output_integrations_dir / "llms" / "index.mdx", "w") as f:
f.write(llm_page)
chat_model_page = CHAT_MODEL_TEMPLATE.format(table=get_chat_model_table())
with open(INTEGRATIONS_DIR / "chat" / "index.mdx", "w") as f:
with open(output_integrations_dir / "chat" / "index.mdx", "w") as f:
f.write(chat_model_page)
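
`model_feat_table.py` follows the same pattern: the output root is now a positional argument, exactly as the `generate-files` target calls it:

```bash
# Writes integrations/llms/index.mdx and integrations/chat/index.mdx under the given root:
.venv/bin/python scripts/model_feat_table.py build/intermediate/docs
```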

@@ -1,11 +0,0 @@
[DEFAULT]
nbs_path = .
recursive = True
tst_flags = notest
user = hwchase17
doc_host = https://python.langchain.com
doc_baseurl = /docs
module_baseurls = metaflow=https://github.com/Netflix/metaflow/tree/master/
fastcore=https://github.com/fastcore/tree/master
host = github

@@ -2,39 +2,9 @@
set -e
yum -y update
yum install gcc bzip2-devel libffi-devel zlib-devel wget tar gzip -y
make install-vercel-deps
# install quarto
wget -q https://github.com/quarto-dev/quarto-cli/releases/download/v1.3.450/quarto-1.3.450-linux-amd64.tar.gz
tar -xzf quarto-1.3.450-linux-amd64.tar.gz
export PATH=$PATH:$(pwd)/quarto-1.3.450/bin/
QUARTO_CMD="./quarto-1.3.450/bin/quarto" make build
# setup python env
python3 -m venv .venv
source .venv/bin/activate
python3 -m pip install --upgrade pip
python3 -m pip install --upgrade uv
python3 -m uv pip install -r vercel_requirements.txt
python3 -m uv pip install -e $(ls ../libs/partners | grep -vE "airbyte|ibm|.md" | xargs -I {} echo "../libs/partners/{}")
# autogenerate integrations tables
python3 scripts/model_feat_table.py
# copy in external files
mkdir docs/templates
cp ../templates/docs/INDEX.md docs/templates/index.md
python3 scripts/copy_templates.py
cp ../cookbook/README.md src/pages/cookbook.mdx
wget -q https://raw.githubusercontent.com/langchain-ai/langserve/main/README.md -O docs/langserve.md
python3 scripts/resolve_local_links.py docs/langserve.md https://github.com/langchain-ai/langserve/tree/main/
wget -q https://raw.githubusercontent.com/langchain-ai/langgraph/main/README.md -O docs/langgraph.md
python3 scripts/resolve_local_links.py docs/langgraph.md https://github.com/langchain-ai/langgraph/tree/main/
# render
quarto render docs/
python3 scripts/generate_api_reference_links.py --docs_dir docs
rm -rf docs
mv build/output/docs ./