hn_categorizer.py
· 6.1 KiB · Python
Original
"""
LangChain Hacker News Story Categorizer Tutorial
This script demonstrates key LangChain concepts by building a simple application
that fetches Hacker News stories and categorizes them using OpenAI's GPT model.
Key LangChain Concepts Demonstrated:
1. Chains: Sequences of operations that can be combined
2. Prompts: Structured way to interact with LLMs
3. LLMs: Language Model integration
4. Pydantic Output Parsing: Type-safe structured output handling
"""
import os
import requests
from typing import List, Dict
from rich.console import Console
from rich.panel import Panel
from rich.table import Table
from rich import print as rprint
from pydantic import BaseModel, Field
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.output_parsers import PydanticOutputParser
# Initialize a single shared Rich console for pretty terminal output;
# reused by display_results() and main() below.
console = Console()
# Define our Pydantic model for structured output
class StoryAnalysis(BaseModel):
    """
    Pydantic model for structured story-analysis output.

    Using Pydantic provides type safety and validation. Note that the
    Field descriptions below are not just documentation: they are folded
    into the parser's format instructions (via
    PydanticOutputParser.get_format_instructions()), so they also act as
    prompt text guiding what the LLM writes into each field.
    """
    # Broad topic bucket, e.g. Tech, Business, Science
    category: str = Field(description="The main category of the story (Tech, Business, Science, etc.)")
    # Finer-grained label within the category
    subcategory: str = Field(description="A more specific subcategory")
    # Short natural-language summary generated by the LLM
    summary: str = Field(description="A brief 1-2 sentence summary of the story's main points")
def fetch_hn_stories(limit: int = 5) -> List[Dict]:
    """
    Fetch the top `limit` stories from the Hacker News Firebase API.

    This function demonstrates basic API interaction outside of LangChain.
    Later, we'll combine this with LangChain components.

    Args:
        limit: Maximum number of top stories to fetch (default 5).

    Returns:
        A list of dicts with 'title', 'url', and 'score' keys. Items the
        API returns as null or without a 'title' (e.g. deleted stories)
        are skipped, so the result may contain fewer than `limit` entries.

    Raises:
        requests.HTTPError: If either API request returns an error status.
        requests.exceptions.Timeout: If a request exceeds the 10s timeout.
    """
    # Get top story IDs. An explicit timeout prevents the script from
    # hanging indefinitely if the API is unresponsive, and
    # raise_for_status() surfaces HTTP errors immediately instead of
    # letting .json() fail confusingly downstream.
    response = requests.get(
        "https://hacker-news.firebaseio.com/v0/topstories.json",
        timeout=10,
    )
    response.raise_for_status()
    story_ids = response.json()[:limit]

    stories = []
    for story_id in story_ids:
        # Fetch individual story details
        story_url = f"https://hacker-news.firebaseio.com/v0/item/{story_id}.json"
        story_response = requests.get(story_url, timeout=10)
        story_response.raise_for_status()
        story_data = story_response.json()

        # The item endpoint can return null (deleted items) or records
        # without a title; only keep well-formed stories.
        if story_data and 'title' in story_data:
            stories.append({
                'title': story_data['title'],
                'url': story_data.get('url', ''),
                'score': story_data.get('score', 0)
            })

    return stories
def setup_langchain_categorizer():
    """
    Build and return the LangChain pieces used to categorize stories.

    Demonstrates three core LangChain ideas:
    1. Pydantic Output Parser: type-safe structured output
    2. Prompt Templates: reusable, parameterized prompts
    3. LLM Chain: gluing a prompt to a model

    Returns:
        A (chain, parser) tuple: running the chain yields raw LLM text,
        which the parser converts into a StoryAnalysis instance.
    """
    # ChatOpenAI wraps OpenAI's chat models; temperature=0 keeps the
    # categorization as deterministic as the API allows.
    chat_model = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")

    # The parser validates raw LLM output against the StoryAnalysis
    # schema (a more modern approach than ResponseSchema).
    parser = PydanticOutputParser(pydantic_object=StoryAnalysis)

    # Reusable prompt; {format_instructions} is filled at run time with
    # the parser's schema description.
    story_prompt = ChatPromptTemplate.from_template("""
Analyze the following Hacker News story and provide a categorization and summary.

Story Title: {title}
URL: {url}

{format_instructions}

Provide your analysis in the exact format specified above:
""")

    # LLMChain combines the prompt template with the model so a single
    # .run(...) call formats the prompt and invokes the LLM.
    categorizer = LLMChain(llm=chat_model, prompt=story_prompt)
    return categorizer, parser
def display_results(stories: List[Dict], categorized_results: List[StoryAnalysis]):
    """
    Render the analyzed stories as a Rich table on the shared console.

    Shows how to consume the structured output of the LangChain pipeline:
    each entry in categorized_results is a StoryAnalysis instance, so its
    fields are read as attributes rather than dict keys.

    Args:
        stories: Raw story dicts (each needs a 'title' key).
        categorized_results: One StoryAnalysis per story, same order.
    """
    results_table = Table(title="Hacker News Stories Analysis", show_header=True, header_style="bold magenta")

    # (header, style, no_wrap) spec for each column keeps the layout in
    # one place and avoids four near-identical add_column calls.
    column_specs = [
        ("Title", "cyan", False),
        ("Category", "green", True),
        ("Subcategory", "yellow", True),
        ("Summary", "white", False),
    ]
    for header, style, no_wrap in column_specs:
        results_table.add_column(header, style=style, no_wrap=no_wrap)

    for story, analysis in zip(stories, categorized_results):
        # Attribute access (not ['key']) because analysis is a Pydantic model.
        results_table.add_row(
            story['title'],
            analysis.category,
            analysis.subcategory,
            analysis.summary
        )

    console.print()
    console.print(Panel.fit(
        "🚀 LangChain Hacker News Analyzer",
        subtitle="Analyzing top stories using LangChain and GPT-3.5",
        style="bold blue"
    ))
    console.print()
    console.print(results_table)
    console.print()
def main():
    """
    Entry point: orchestrate the full HN story categorization pipeline.

    1. Fetch stories from the HN API
    2. Set up the LangChain components
    3. Process each story through the LLM chain
    4. Display results
    """
    # Show a welcome message
    console.print(Panel.fit(
        "Fetching and analyzing Hacker News stories...",
        style="bold green"
    ))

    # Fetch stories
    stories = fetch_hn_stories(limit=5)

    # Setup LangChain components
    chain, output_parser = setup_langchain_categorizer()

    # The format instructions depend only on the parser's schema, so
    # compute them once instead of once per story inside the loop.
    format_instructions = output_parser.get_format_instructions()

    # Process each story
    categorized_results = []
    with console.status("[bold green]Processing stories..."):
        for story in stories:
            # Run the chain; the result is the raw LLM response text
            result = chain.run(
                title=story['title'],
                url=story['url'],
                format_instructions=format_instructions
            )

            # Parse the raw text into our Pydantic model
            parsed_result = output_parser.parse(result)
            categorized_results.append(parsed_result)

    # Display results
    display_results(stories, categorized_results)


if __name__ == "__main__":
    main()
| 1 | """ |
| 2 | LangChain Hacker News Story Categorizer Tutorial |
| 3 | |
| 4 | This script demonstrates key LangChain concepts by building a simple application |
| 5 | that fetches Hacker News stories and categorizes them using OpenAI's GPT model. |
| 6 | |
| 7 | Key LangChain Concepts Demonstrated: |
| 8 | 1. Chains: Sequences of operations that can be combined |
| 9 | 2. Prompts: Structured way to interact with LLMs |
| 10 | 3. LLMs: Language Model integration |
| 11 | 4. Pydantic Output Parsing: Type-safe structured output handling |
| 12 | """ |
| 13 | |
| 14 | import os |
| 15 | import requests |
| 16 | from typing import List, Dict |
| 17 | from rich.console import Console |
| 18 | from rich.panel import Panel |
| 19 | from rich.table import Table |
| 20 | from rich import print as rprint |
| 21 | from pydantic import BaseModel, Field |
| 22 | |
| 23 | from langchain.chains import LLMChain |
| 24 | from langchain.chat_models import ChatOpenAI |
| 25 | from langchain.prompts import ChatPromptTemplate |
| 26 | from langchain.output_parsers import PydanticOutputParser |
| 27 | |
| 28 | # Initialize Rich console for pretty output |
| 29 | console = Console() |
| 30 | |
| 31 | # Define our Pydantic model for structured output |
| 32 | class StoryAnalysis(BaseModel): |
| 33 | """ |
| 34 | Pydantic model for story analysis output. |
| 35 | Using Pydantic provides type safety and validation. |
| 36 | """ |
| 37 | category: str = Field(description="The main category of the story (Tech, Business, Science, etc.)") |
| 38 | subcategory: str = Field(description="A more specific subcategory") |
| 39 | summary: str = Field(description="A brief 1-2 sentence summary of the story's main points") |
| 40 | |
| 41 | def fetch_hn_stories(limit: int = 5) -> List[Dict]: |
| 42 | """ |
| 43 | Fetch top stories from Hacker News. |
| 44 | |
| 45 | This function demonstrates basic API interaction outside of LangChain. |
| 46 | Later, we'll combine this with LangChain components. |
| 47 | """ |
| 48 | # Get top story IDs |
| 49 | response = requests.get("https://hacker-news.firebaseio.com/v0/topstories.json") |
| 50 | story_ids = response.json()[:limit] |
| 51 | |
| 52 | stories = [] |
| 53 | for story_id in story_ids: |
| 54 | # Fetch individual story details |
| 55 | story_url = f"https://hacker-news.firebaseio.com/v0/item/{story_id}.json" |
| 56 | story_response = requests.get(story_url) |
| 57 | story_data = story_response.json() |
| 58 | |
| 59 | if story_data and 'title' in story_data: |
| 60 | stories.append({ |
| 61 | 'title': story_data['title'], |
| 62 | 'url': story_data.get('url', ''), |
| 63 | 'score': story_data.get('score', 0) |
| 64 | }) |
| 65 | |
| 66 | return stories |
| 67 | |
| 68 | def setup_langchain_categorizer(): |
| 69 | """ |
| 70 | Set up the LangChain components for story categorization. |
| 71 | |
| 72 | This demonstrates several key LangChain concepts: |
| 73 | 1. Pydantic Output Parser: Type-safe structured output |
| 74 | 2. Prompt Templates: Create reusable prompts |
| 75 | 3. LLM Chain: Combine prompts and models |
| 76 | """ |
| 77 | |
| 78 | # Create a Pydantic output parser |
| 79 | # This is a more modern approach than using ResponseSchema |
| 80 | output_parser = PydanticOutputParser(pydantic_object=StoryAnalysis) |
| 81 | |
| 82 | # Create a prompt template with format instructions |
| 83 | # This shows how to create structured prompts in LangChain |
| 84 | prompt = ChatPromptTemplate.from_template(""" |
| 85 | Analyze the following Hacker News story and provide a categorization and summary. |
| 86 | |
| 87 | Story Title: {title} |
| 88 | URL: {url} |
| 89 | |
| 90 | {format_instructions} |
| 91 | |
| 92 | Provide your analysis in the exact format specified above: |
| 93 | """) |
| 94 | |
| 95 | # Initialize the language model |
| 96 | # ChatOpenAI is a LangChain wrapper around OpenAI's chat models |
| 97 | llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo") |
| 98 | |
| 99 | # Create a chain that combines the prompt and model |
| 100 | # LLMChain is a basic building block in LangChain for combining prompts with LLMs |
| 101 | chain = LLMChain(llm=llm, prompt=prompt) |
| 102 | |
| 103 | return chain, output_parser |
| 104 | |
| 105 | def display_results(stories: List[Dict], categorized_results: List[StoryAnalysis]): |
| 106 | """ |
| 107 | Display the results in a pretty format using Rich. |
| 108 | |
| 109 | This function shows how to work with the structured output from our LangChain pipeline. |
| 110 | The categorized_results are strongly typed thanks to our Pydantic model. |
| 111 | """ |
| 112 | table = Table(title="Hacker News Stories Analysis", show_header=True, header_style="bold magenta") |
| 113 | table.add_column("Title", style="cyan", no_wrap=False) |
| 114 | table.add_column("Category", style="green", no_wrap=True) |
| 115 | table.add_column("Subcategory", style="yellow", no_wrap=True) |
| 116 | table.add_column("Summary", style="white", no_wrap=False) |
| 117 | |
| 118 | for story, result in zip(stories, categorized_results): |
| 119 | table.add_row( |
| 120 | story['title'], |
| 121 | result.category, # Note: Using dot notation because result is a Pydantic model |
| 122 | result.subcategory, |
| 123 | result.summary |
| 124 | ) |
| 125 | |
| 126 | console.print() |
| 127 | console.print(Panel.fit( |
| 128 | "🚀 LangChain Hacker News Analyzer", |
| 129 | subtitle="Analyzing top stories using LangChain and GPT-3.5", |
| 130 | style="bold blue" |
| 131 | )) |
| 132 | console.print() |
| 133 | console.print(table) |
| 134 | console.print() |
| 135 | |
| 136 | def main(): |
| 137 | """ |
| 138 | Main function to run the HN story categorizer. |
| 139 | |
| 140 | This function orchestrates the entire pipeline: |
| 141 | 1. Fetch stories from HN API |
| 142 | 2. Set up LangChain components |
| 143 | 3. Process stories through the LLM chain |
| 144 | 4. Display results |
| 145 | """ |
| 146 | # Show a welcome message |
| 147 | console.print(Panel.fit( |
| 148 | "Fetching and analyzing Hacker News stories...", |
| 149 | style="bold green" |
| 150 | )) |
| 151 | |
| 152 | # Fetch stories |
| 153 | stories = fetch_hn_stories(limit=5) |
| 154 | |
| 155 | # Setup LangChain components |
| 156 | chain, output_parser = setup_langchain_categorizer() |
| 157 | |
| 158 | # Process each story |
| 159 | categorized_results = [] |
| 160 | with console.status("[bold green]Processing stories...") as status: |
| 161 | for story in stories: |
| 162 | # Get format instructions from the parser |
| 163 | format_instructions = output_parser.get_format_instructions() |
| 164 | |
| 165 | # Run the chain |
| 166 | result = chain.run( |
| 167 | title=story['title'], |
| 168 | url=story['url'], |
| 169 | format_instructions=format_instructions |
| 170 | ) |
| 171 | |
| 172 | # Parse the result into our Pydantic model |
| 173 | parsed_result = output_parser.parse(result) |
| 174 | categorized_results.append(parsed_result) |
| 175 | |
| 176 | # Display results |
| 177 | display_results(stories, categorized_results) |
| 178 | |
| 179 | if __name__ == "__main__": |
| 180 | main() |
requirements.txt
· 106 B · Text
Original
langchain==0.0.340
openai==1.3.7
python-dotenv==1.0.0
requests==2.31.0
rich==13.7.0
beautifulsoup4==4.12.2
| 1 | langchain==0.0.340 |
| 2 | openai==1.3.7 |
| 3 | python-dotenv==1.0.0 |
| 4 | requests==2.31.0 |
| 5 | rich==13.7.0 |
| 6 | beautifulsoup4==4.12.2 |