infra: cleanup docs build (#21134)

Refactors the docs build in order to:
- run the same `make build` command in both the Vercel and local builds
- build artifacts incrementally in two distinct steps, instead of building
all docs in place (on Vercel) or in a `_dist` dir (locally)

Highlights:
- introduces `make build` as the single entry point for building the docs
- collects and generates all files for the build in
`docs/build/intermediate`
- renders those Jupyter notebook + markdown files into
`docs/build/output`

The outputs to host now land in `docs/build/output`, which will require
a Vercel settings change.
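
Concretely, the local flow looks roughly like this (a sketch; paths follow the new `docs/Makefile`):

```bash
cd docs
make build                   # same entry point Vercel uses
ls build/intermediate/docs   # step 1: copied sources plus generated files (templates index, feature tables, langserve/langgraph readmes)
ls build/output/docs         # step 2: quarto-rendered notebooks and synced markdown, ready to host
make clean                   # removes build/
```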

Todo:
- [ ] figure out how to point Vercel at the right directory (right now,
deleting and moving the docs dir in `vercel_build.sh` isn't great)
Erick Friis committed cd4c54282a (parent 6fa8626e2f)

@@ -17,16 +17,11 @@ clean: docs_clean api_docs_clean
## docs_build: Build the documentation.
docs_build:
docs/.local_build.sh
cd docs && make build-local
## docs_clean: Clean the documentation build artifacts.
docs_clean:
@if [ -d _dist ]; then \
rm -r _dist; \
echo "Directory _dist has been cleaned."; \
else \
echo "Nothing to clean."; \
fi
cd docs && make clean
## docs_linkcheck: Run linkchecker on the documentation.
docs_linkcheck:

docs/.gitignore

@@ -1,2 +1,3 @@
/.quarto/
src/supabase.d.ts
build

@@ -1,27 +0,0 @@
#!/usr/bin/env bash
set -o errexit
set -o nounset
set -o pipefail
set -o xtrace
SCRIPT_DIR="$(cd "$(dirname "$0")"; pwd)"
cd "${SCRIPT_DIR}"
mkdir -p ../_dist
rsync -ruv --exclude node_modules --exclude api_reference --exclude .venv --exclude .docusaurus . ../_dist
cd ../_dist
poetry run python scripts/model_feat_table.py
cp ../cookbook/README.md src/pages/cookbook.mdx
mkdir -p docs/templates
cp ../templates/docs/INDEX.md docs/templates/index.md
poetry run python scripts/copy_templates.py
wget -q https://raw.githubusercontent.com/langchain-ai/langserve/main/README.md -O docs/langserve.md
wget -q https://raw.githubusercontent.com/langchain-ai/langgraph/main/README.md -O docs/langgraph.md
poetry run quarto render docs
poetry run python scripts/generate_api_reference_links.py --docs_dir docs
yarn
yarn start

@@ -0,0 +1,80 @@
# we build the docs in these stages:
# 1. install quarto and python dependencies
# 2. copy files from "source dir" to "intermediate dir"
# 3. generate files like the model feature tables, etc. in "intermediate dir"
# 4. copy files to their right spots (e.g. langserve readme) in "intermediate dir"
# 5. build the docs from "intermediate dir" to "output dir"
SOURCE_DIR = docs/
INTERMEDIATE_DIR = build/intermediate/docs
OUTPUT_DIR = build/output
OUTPUT_DOCS_DIR = $(OUTPUT_DIR)/docs
PYTHON = .venv/bin/python
QUARTO_CMD ?= quarto
PARTNER_DEPS_LIST := $(shell ls -1 ../libs/partners | grep -vE "airbyte|ibm" | xargs -I {} echo "../libs/partners/{}" | tr '\n' ' ')
PORT ?= 3001
clean:
rm -rf build
install-vercel-deps:
yum -y update
yum install gcc bzip2-devel libffi-devel zlib-devel wget tar gzip rsync -y
wget -q https://github.com/quarto-dev/quarto-cli/releases/download/v1.3.450/quarto-1.3.450-linux-amd64.tar.gz
tar -xzf quarto-1.3.450-linux-amd64.tar.gz
install-py-deps:
python3 -m venv .venv
$(PYTHON) -m pip install --upgrade pip
$(PYTHON) -m pip install --upgrade uv
$(PYTHON) -m uv pip install -r vercel_requirements.txt
$(PYTHON) -m uv pip install --editable $(PARTNER_DEPS_LIST)
generate-files:
mkdir -p $(INTERMEDIATE_DIR)
cp -r $(SOURCE_DIR)/* $(INTERMEDIATE_DIR)
mkdir -p $(INTERMEDIATE_DIR)/templates
cp ../templates/docs/INDEX.md $(INTERMEDIATE_DIR)/templates/index.md
cp ../cookbook/README.md $(INTERMEDIATE_DIR)/cookbook.mdx
$(PYTHON) scripts/model_feat_table.py $(INTERMEDIATE_DIR)
$(PYTHON) scripts/copy_templates.py $(INTERMEDIATE_DIR)
wget -q https://raw.githubusercontent.com/langchain-ai/langserve/main/README.md -O $(INTERMEDIATE_DIR)/langserve.md
$(PYTHON) scripts/resolve_local_links.py $(INTERMEDIATE_DIR)/langserve.md https://github.com/langchain-ai/langserve/tree/main/
wget -q https://raw.githubusercontent.com/langchain-ai/langgraph/main/README.md -O $(INTERMEDIATE_DIR)/langgraph.md
$(PYTHON) scripts/resolve_local_links.py $(INTERMEDIATE_DIR)/langgraph.md https://github.com/langchain-ai/langgraph/tree/main/
$(PYTHON) scripts/generate_api_reference_links.py --docs_dir $(INTERMEDIATE_DIR)
copy-infra:
mkdir -p $(OUTPUT_DIR)
cp -r src $(OUTPUT_DIR)
cp vercel.json $(OUTPUT_DIR)
cp babel.config.js $(OUTPUT_DIR)
cp -r data $(OUTPUT_DIR)
cp docusaurus.config.js $(OUTPUT_DIR)
cp package.json $(OUTPUT_DIR)
cp sidebars.js $(OUTPUT_DIR)
cp -r static $(OUTPUT_DIR)
cp yarn.lock $(OUTPUT_DIR)
quarto-render:
$(QUARTO_CMD) render $(INTERMEDIATE_DIR) --output-dir $(OUTPUT_DOCS_DIR) --no-execute
mv $(OUTPUT_DOCS_DIR)/$(INTERMEDIATE_DIR)/* $(OUTPUT_DOCS_DIR)
rm -rf $(OUTPUT_DOCS_DIR)/build
md-sync:
rsync -avm --include="*/" --include="*.mdx" --include="*.md" --exclude="*" $(INTERMEDIATE_DIR)/ $(OUTPUT_DOCS_DIR)
build: install-py-deps generate-files copy-infra quarto-render md-sync
start:
cd $(OUTPUT_DIR) && yarn && yarn start --port=$(PORT)
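
For orientation, a sketch of how these targets are meant to be driven (the local commands are illustrative; the Vercel sequence mirrors `vercel_build.sh` below):

```bash
# Local build and preview, run from docs/:
make build                # install-py-deps -> generate-files -> copy-infra -> quarto-render -> md-sync
make start PORT=3001      # yarn install + dev server against build/output (PORT defaults to 3001)

# Vercel build, where quarto is not on PATH and is downloaded by install-vercel-deps:
make install-vercel-deps
QUARTO_CMD="./quarto-1.3.450/bin/quarto" make build
```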

@@ -1,76 +0,0 @@
/* eslint-disable prefer-template */
/* eslint-disable no-param-reassign */
// eslint-disable-next-line import/no-extraneous-dependencies
const babel = require("@babel/core");
const path = require("path");
const fs = require("fs");
/**
*
* @param {string|Buffer} content Content of the resource file
* @param {object} [map] SourceMap data consumable by https://github.com/mozilla/source-map
* @param {any} [meta] Meta data, could be anything
*/
async function webpackLoader(content, map, meta) {
const cb = this.async();
if (!this.resourcePath.endsWith(".ts")) {
cb(null, JSON.stringify({ content, imports: [] }), map, meta);
return;
}
try {
const result = await babel.parseAsync(content, {
sourceType: "module",
filename: this.resourcePath,
});
const imports = [];
result.program.body.forEach((node) => {
if (node.type === "ImportDeclaration") {
const source = node.source.value;
if (!source.startsWith("langchain")) {
return;
}
node.specifiers.forEach((specifier) => {
if (specifier.type === "ImportSpecifier") {
const local = specifier.local.name;
const imported = specifier.imported.name;
imports.push({ local, imported, source });
} else {
throw new Error("Unsupported import type");
}
});
}
});
imports.forEach((imp) => {
const { imported, source } = imp;
const moduleName = source.split("/").slice(1).join("_");
const docsPath = path.resolve(__dirname, "docs", "api", moduleName);
const available = fs.readdirSync(docsPath, { withFileTypes: true });
const found = available.find(
(dirent) =>
dirent.isDirectory() &&
fs.existsSync(path.resolve(docsPath, dirent.name, imported + ".md"))
);
if (found) {
imp.docs =
"/" + path.join("docs", "api", moduleName, found.name, imported);
} else {
throw new Error(
`Could not find docs for ${source}.${imported} in docs/api/`
);
}
});
cb(null, JSON.stringify({ content, imports }), map, meta);
} catch (err) {
cb(err);
}
}
module.exports = webpackLoader;

@@ -330,7 +330,7 @@
"id": "da9a9239",
"metadata": {},
"source": [
"For more information refer to [OpenVINO LLM guide](https://docs.openvino.ai/2024/learn-openvino/llm_inference_guide.html) and [OpenVINO Local Pipelines notebook](./openvino.ipynb)."
"For more information refer to [OpenVINO LLM guide](https://docs.openvino.ai/2024/learn-openvino/llm_inference_guide.html) and [OpenVINO Local Pipelines notebook](/docs/integrations/llms/openvino/)."
]
}
],

@@ -67,7 +67,7 @@ from langchain_community.embeddings import QuantizedBgeEmbeddings
### Weight-Only Quantization with ITREX
See a [usage example](../docs/integrations/llms/weight_only_quantization.ipynb).
See a [usage example](/docs/integrations/llms/weight_only_quantization).
## Detail of Configuration Parameters

@@ -2,35 +2,44 @@ import glob
import os
import re
import shutil
import sys
from pathlib import Path
TEMPLATES_DIR = Path(os.path.abspath(__file__)).parents[2] / "templates"
DOCS_TEMPLATES_DIR = Path(os.path.abspath(__file__)).parents[1] / "docs" / "templates"
if __name__ == "__main__":
intermediate_dir = Path(sys.argv[1])
templates_source_dir = Path(os.path.abspath(__file__)).parents[2] / "templates"
templates_intermediate_dir = intermediate_dir / "templates"
readmes = list(glob.glob(str(TEMPLATES_DIR) + "/*/README.md"))
destinations = [readme[len(str(TEMPLATES_DIR)) + 1 : -10] + ".md" for readme in readmes]
for source, destination in zip(readmes, destinations):
full_destination = DOCS_TEMPLATES_DIR / destination
shutil.copyfile(source, full_destination)
with open(full_destination, "r") as f:
content = f.read()
# remove images
content = re.sub("\!\[.*?\]\((.*?)\)", "", content)
with open(full_destination, "w") as f:
f.write(content)
readmes = list(glob.glob(str(templates_source_dir) + "/*/README.md"))
destinations = [
readme[len(str(templates_source_dir)) + 1 : -10] + ".md" for readme in readmes
]
for source, destination in zip(readmes, destinations):
full_destination = templates_intermediate_dir / destination
shutil.copyfile(source, full_destination)
with open(full_destination, "r") as f:
content = f.read()
# remove images
content = re.sub("\!\[.*?\]\((.*?)\)", "", content)
with open(full_destination, "w") as f:
f.write(content)
sidebar_hidden = """---
sidebar_hidden = """---
sidebar_class_name: hidden
---
"""
TEMPLATES_INDEX_DESTINATION = DOCS_TEMPLATES_DIR / "index.md"
with open(TEMPLATES_INDEX_DESTINATION, "r") as f:
content = f.read()
# replace relative links
content = re.sub("\]\(\.\.\/", "](/docs/templates/", content)
# handle index file
templates_index_source = templates_source_dir / "docs" / "INDEX.md"
templates_index_intermediate = templates_intermediate_dir / "index.md"
with open(templates_index_source, "r") as f:
content = f.read()
# replace relative links
content = re.sub("\]\(\.\.\/", "](/docs/templates/", content)
with open(TEMPLATES_INDEX_DESTINATION, "w") as f:
f.write(sidebar_hidden + content)
with open(templates_index_intermediate, "w") as f:
f.write(sidebar_hidden + content)
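
Per the Makefile's `generate-files` target, `copy_templates.py` now receives the intermediate directory as its single positional argument; a hedged sketch of the invocation (the example template name is illustrative):

```bash
# Run from docs/ with the venv created by install-py-deps:
.venv/bin/python scripts/copy_templates.py build/intermediate/docs
# e.g. ../templates/rag-chroma/README.md -> build/intermediate/docs/templates/rag-chroma.md
```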

@@ -25,7 +25,6 @@ _IMPORT_RE = re.compile(
_CURRENT_PATH = Path(__file__).parent.absolute()
# Directory where generated markdown files are stored
_DOCS_DIR = _CURRENT_PATH / "docs"
_JSON_PATH = _CURRENT_PATH / "api_reference" / "guide_imports.json"
def find_files(path):
@@ -55,6 +54,12 @@ def get_args():
default=_DOCS_DIR,
help="Directory where generated markdown files are stored",
)
parser.add_argument(
"--json_path",
type=str,
default=None,
help="Path to store the generated JSON file",
)
return parser.parse_args()
@@ -83,9 +88,11 @@ def main():
global_imports[class_name][doc_title] = doc_url
# Write the global imports information to a JSON file
_JSON_PATH.parent.mkdir(parents=True, exist_ok=True)
with _JSON_PATH.open("w") as f:
json.dump(global_imports, f)
if args.json_path:
json_path = Path(args.json_path)
json_path.parent.mkdir(parents=True, exist_ok=True)
with json_path.open("w") as f:
json.dump(global_imports, f)
def _get_doc_title(data: str, file_name: str) -> str:
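
The guide_imports.json dump is now opt-in via `--json_path` rather than always written next to the script; a sketch of both modes (the JSON output path is an assumption, since `generate-files` only passes `--docs_dir`):

```bash
# As called from the Makefile: rewrite markdown links only, no JSON written.
.venv/bin/python scripts/generate_api_reference_links.py --docs_dir build/intermediate/docs

# Optionally also emit the imports JSON (output location is illustrative):
.venv/bin/python scripts/generate_api_reference_links.py \
    --docs_dir build/intermediate/docs \
    --json_path build/api_reference/guide_imports.json
```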

@@ -1,11 +1,11 @@
import os
import sys
from pathlib import Path
from langchain_community import chat_models, llms
from langchain_core.language_models.chat_models import BaseChatModel, SimpleChatModel
from langchain_core.language_models.llms import LLM, BaseLLM
INTEGRATIONS_DIR = Path(os.path.abspath(__file__)).parents[1] / "docs" / "integrations"
LLM_IGNORE = ("FakeListLLM", "OpenAIChat", "PromptLayerOpenAIChat")
LLM_FEAT_TABLE_CORRECTION = {
"TextGen": {"_astream": False, "_agenerate": False},
@@ -218,9 +218,17 @@ def get_chat_model_table() -> str:
if __name__ == "__main__":
output_dir = Path(sys.argv[1])
output_integrations_dir = output_dir / "integrations"
output_integrations_dir_llms = output_integrations_dir / "llms"
output_integrations_dir_chat = output_integrations_dir / "chat"
output_integrations_dir_llms.mkdir(parents=True, exist_ok=True)
output_integrations_dir_chat.mkdir(parents=True, exist_ok=True)
llm_page = LLM_TEMPLATE.format(table=get_llm_table())
with open(INTEGRATIONS_DIR / "llms" / "index.mdx", "w") as f:
with open(output_integrations_dir / "llms" / "index.mdx", "w") as f:
f.write(llm_page)
chat_model_page = CHAT_MODEL_TEMPLATE.format(table=get_chat_model_table())
with open(INTEGRATIONS_DIR / "chat" / "index.mdx", "w") as f:
with open(output_integrations_dir / "chat" / "index.mdx", "w") as f:
f.write(chat_model_page)
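
`model_feat_table.py` follows the same pattern: the output root is now a positional argument, exactly as the `generate-files` target calls it:

```bash
# Writes integrations/llms/index.mdx and integrations/chat/index.mdx under the given root:
.venv/bin/python scripts/model_feat_table.py build/intermediate/docs
```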

@@ -1,11 +0,0 @@
[DEFAULT]
nbs_path = .
recursive = True
tst_flags = notest
user = hwchase17
doc_host = https://python.langchain.com
doc_baseurl = /docs
module_baseurls = metaflow=https://github.com/Netflix/metaflow/tree/master/
fastcore=https://github.com/fastcore/tree/master
host = github

@@ -2,39 +2,9 @@
set -e
yum -y update
yum install gcc bzip2-devel libffi-devel zlib-devel wget tar gzip -y
make install-vercel-deps
# install quarto
wget -q https://github.com/quarto-dev/quarto-cli/releases/download/v1.3.450/quarto-1.3.450-linux-amd64.tar.gz
tar -xzf quarto-1.3.450-linux-amd64.tar.gz
export PATH=$PATH:$(pwd)/quarto-1.3.450/bin/
QUARTO_CMD="./quarto-1.3.450/bin/quarto" make build
# setup python env
python3 -m venv .venv
source .venv/bin/activate
python3 -m pip install --upgrade pip
python3 -m pip install --upgrade uv
python3 -m uv pip install -r vercel_requirements.txt
python3 -m uv pip install -e $(ls ../libs/partners | grep -vE "airbyte|ibm|.md" | xargs -I {} echo "../libs/partners/{}")
# autogenerate integrations tables
python3 scripts/model_feat_table.py
# copy in external files
mkdir docs/templates
cp ../templates/docs/INDEX.md docs/templates/index.md
python3 scripts/copy_templates.py
cp ../cookbook/README.md src/pages/cookbook.mdx
wget -q https://raw.githubusercontent.com/langchain-ai/langserve/main/README.md -O docs/langserve.md
python3 scripts/resolve_local_links.py docs/langserve.md https://github.com/langchain-ai/langserve/tree/main/
wget -q https://raw.githubusercontent.com/langchain-ai/langgraph/main/README.md -O docs/langgraph.md
python3 scripts/resolve_local_links.py docs/langgraph.md https://github.com/langchain-ai/langgraph/tree/main/
# render
quarto render docs/
python3 scripts/generate_api_reference_links.py --docs_dir docs
rm -rf docs
mv build/output/docs ./