model_poc.py
import mimetypes
import os
import warnings
from urllib.parse import urlparse

import pandas as pd
import pdfplumber
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from fake_useragent import UserAgent
from googlesearch import search
from kaggle.api.kaggle_api_extended import KaggleApi
from sentence_transformers import SentenceTransformer

from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI, OpenAI, OpenAIEmbeddings

warnings.filterwarnings("ignore")
# Load environment variables
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")
pinecone_api_key = os.getenv("PINECONE_API_KEY")
pinecone_env = os.getenv("PINECONE_ENV")
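# NOTE: Pinecone credentials are loaded for possible later use; this PoC only builds a local FAISS index.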
# Initialize models
hf_model_name = "sentence-transformers/all-MiniLM-L6-v2"
hf_model = SentenceTransformer(hf_model_name)
# Initialize OpenAI API
llm = OpenAI(openai_api_key=openai_api_key)
api = KaggleApi()
api.authenticate()
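# The Kaggle client reads credentials from ~/.kaggle/kaggle.json or the
# KAGGLE_USERNAME / KAGGLE_KEY environment variables.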
# Step 1: Search for relevant web pages
def search_web(company_name, industry_name, num_results=5):
company_query = (
f"{company_name} {industry_name} "
"financial OR environmental OR technological OR ESG policies OR reports "
"-privacy -user"
)
industry_query = f"{industry_name} industry standards OR benchmarks OR trends"
results = []
for query in [company_query, industry_query]:
for url in search(query, num_results=num_results, advanced=True):
results.append(url)
# print(url)
# print("\n")
return results
# g_res = search_web("Google", "Technology")
# for x in g_res:
# print(x)
# print("\n")
# print("*"*500)
# search_web("Tesla", "Automotive")
# print("*"*500)
# search_web("Emerson", "Industrial Automation")
# Step 2: Scrape data from web pages
def scrape_webpage(url):
try:
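        # search() is called with advanced=True in search_web, so each result is a
        # SearchResult object; take the underlying URL string from its .url attribute.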
url = url.url
path = urlparse(url).path
file_type = mimetypes.guess_type(path)[0]
if file_type == "application/pdf":
# print(f"Skipping PDF: {url}")
return ""
# Scrape HTML content
response = requests.get(url, timeout=10, headers={'User-Agent': UserAgent().random})
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
paragraphs = soup.find_all("p")
content = " ".join([p.get_text() for p in paragraphs])
return content.strip()
except Exception as e:
print(f"Error scraping {url}: {e}")
return ""
# Google_res = search_web("Google", "Technology")
# for url in Google_res:
# print(scrape_webpage(url))
# print("*"*100)
# print("\n")
# # search_web("Tesla", "Automotive")
# # print("*"*500)
# # search_web("Emerson", "Industrial Automation")
# # scrapped_data = scrape_webpage("https://www.tesla.com/privacy")
# # scrapped_data = scrape_webpage("https://www.google.com/policies/privacy/")
# Step 3: Search and download PDFs
def extract_text_from_pdf(filepath):
try:
text = ""
with pdfplumber.open(filepath) as pdf:
for page in pdf.pages:
text += page.extract_text() or ""
return text.strip()
except Exception as e:
print(f"Error extracting text from {filepath}: {e}")
return ""
def is_valid_pdf_metadata(url, max_size_mb=10, max_pages=30):
"""
Check the metadata of a PDF file (e.g., file size and number of pages)
to determine if it meets the criteria for further processing.
Args:
url (str): URL of the PDF to check.
max_size_mb (float): Maximum allowed file size in MB.
max_pages (int): Maximum allowed number of pages.
Returns:
bool: True if the PDF is likely useful based on metadata, False otherwise.
"""
try:
response = requests.head(url, timeout=10)
response.raise_for_status()
# Get file size from headers (in bytes)
file_size_mb = int(response.headers.get("Content-Length", 0)) / (1024 * 1024)
        # Page count is not available from the HTTP headers alone, so treat it as
        # unknown here; in practice only the file-size check is applied. A real
        # page-count check would require downloading and opening the PDF.
        page_count = 0
        # Apply the size (and placeholder page-count) checks
        return file_size_mb <= max_size_mb and page_count <= max_pages
except Exception as e:
print(f"Error checking metadata for {url}: {e}")
return False
def scrape_webpage1(url):
try:
# accessing url from search result objecct
url = url.url
path = urlparse(url).path
file_type = mimetypes.guess_type(path)[0]
response = requests.get(url, timeout=10, headers={'User-Agent': UserAgent().random})
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
paragraphs = soup.find_all("p")
content = " ".join([p.get_text() for p in paragraphs])
return content.strip()
except Exception as e:
print(f"Error scraping {url}: {e}")
return ""
def find_and_download_pdfs(company_name, industry_name, num_results=3):
"""
Search, download, and parse information-dense PDFs related to a company and industry.
Args:
company_name (str): Name of the company.
industry_name (str): Name of the industry.
num_results (int): Number of search results to consider.
Returns:
list[dict]: List of parsed PDF metadata and content.
"""
query = f"{company_name} {industry_name} industry standards OR current technology OR AI applications OR generative AI OR future trends filetype:pdf"
pdf_texts = []
    search_results = search(query, num_results=num_results)  # plain URL strings (no advanced=True here)
for url in search_results:
if url.endswith(".pdf") and is_valid_pdf_metadata(url):
try:
# Download the PDF
response = requests.get(url, timeout=10)
response.raise_for_status()
pdf_filename = url.split("/")[-1]
with open(pdf_filename, "wb") as f:
f.write(response.content)
print(f"Downloaded: {pdf_filename}")
# Extract and analyze PDF text
extracted_text = extract_text_from_pdf(pdf_filename)
                # Metadata was already validated above, so the text is stored without further checks.
pdf_texts.append({
"company": company_name,
"industry": industry_name,
"filename": pdf_filename,
"text": extracted_text,
})
# os.remove(pdf_filename) # Clean up temporary PDF file
# optional: We can also store the PDFs in a separate folder for further analysis
except Exception as e:
print(f"Error downloading or processing PDF from {url}: {e}")
return pdf_texts
# # google_pdfs = find_and_download_pdfs("Google", "Technology")
# # tesla_pdfs = find_and_download_pdfs("Tesla", "Automotive")
# # with open("sample2.txt", "w") as f:
# # for pdf in tesla_pdfs:
# # f.write(pdf["text"])
# # f.write("\n\n")
# # f.write("*"*100)
# # with open("sample1.txt", "w") as f:
# # for pdf in google_pdfs:
# # f.write(pdf["text"])
# # f.write("\n\n")
# # f.write("*"*100)
# # extract_text_from_pdf("ai-principles-2023-progress-update.pdf")
# Step 4: Create RAG system
def create_rag_system(contents, chunk_size=1000, chunk_overlap=200):
"""
Create a RAG (Retrieval-Augmented Generation) system for large texts.
Args:
contents (list): List of text documents to process.
chunk_size (int): Size of each chunk in characters.
chunk_overlap (int): Overlap between chunks in characters.
Returns:
retriever: The RAG retriever object.
"""
if not os.getenv("OPENAI_API_KEY"):
raise ValueError("OpenAI API key is required for embedding generation.")
# Validate contents
if not contents or not all(isinstance(item, str) for item in contents):
raise ValueError("Expected 'contents' to be a list of strings.")
print("Splitting documents into chunks...")
try:
# Use RecursiveCharacterTextSplitter for smarter chunking
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
separators=["\n\n", "\n", ".", " "], # Prefer splitting at paragraphs, sentences, or words
)
# contents is list of strings
documents = text_splitter.create_documents(contents)
print(f"Number of chunks created: {len(documents)}")
# Generate embeddings
print("Generating embeddings for documents using OpenAI...")
embeddings_model = OpenAIEmbeddings(model="text-embedding-ada-002")
vector_store = FAISS.from_documents(documents, embeddings_model)
# Create retriever
print("Creating retriever...")
retriever = vector_store.as_retriever(search_kwargs={"k": 2})
print("RAG system successfully created.")
return retriever
except Exception as e:
print(f"An error occurred: {e}")
return None
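# Quick local check (commented out): a minimal sketch that assumes OPENAI_API_KEY is set
# and uses made-up sample strings.
# _retriever = create_rag_system([
#     "Acme Corp reported strong growth in industrial automation.",
#     "The industry is adopting generative AI for predictive maintenance.",
# ])
# if _retriever:
#     print(_retriever.invoke("generative AI use cases")[0].page_content)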
def retrieve_and_enrich_with_subqueries(retriever, company_name, industry_name):
"""
Retrieve and enrich content using targeted subqueries to ensure comprehensive coverage.
Args:
retriever: The RAG retriever object.
company_name (str): The name of the company.
industry_name (str): The industry the company belongs to.
Returns:
str: Enriched and summarized text relevant to the query.
"""
# Define subqueries
subqueries = [
f"{company_name} {industry_name} general industry overview",
f"{company_name} {industry_name} Generative AI use cases",
f"{company_name} {industry_name} challenges faced",
f"{company_name} {industry_name} how AI/ML can solve challenges",
]
# Retrieve documents for each subquery
print("Retrieving relevant documents for subqueries...")
retrieved_texts = []
for subquery in subqueries:
print(f"Processing subquery: {subquery}")
relevant_docs = retriever.invoke(subquery)
retrieved_texts.extend([doc.page_content for doc in relevant_docs])
# Combine retrieved content
    combined_text = " ".join(retrieved_texts)
    print(f"Retrieved {len(retrieved_texts)} chunks across the subqueries.")
if not combined_text.strip():
return ""
# Enrich the retrieved text using a Generative AI model
print("Using Generative AI for enrichment...")
enrichment_prompt = """
You are an expert in AI and ML. Based on the following text:
---
{combined_text}
---
Provide:
1. Key challenges in the company or industry for Generative AI.
2. Opportunities where AI/ML can overcome these challenges.
3. Specific Generative AI use cases for this company or industry.
4. Suggest some innovative AI applications for the company.
"""
    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.7)
    prompt = PromptTemplate(input_variables=["combined_text"], template=enrichment_prompt)
    # Build an LCEL runnable sequence (prompt -> model -> string output),
    # replacing the deprecated LLMChain.
    chain = prompt | llm | StrOutputParser()
    try:
        enriched_text = chain.invoke({"combined_text": combined_text})
        return enriched_text
    except Exception as e:
        print(f"Error during enrichment: {e}")
        return ""
# Extract insights with subquery enrichment
def extract_insights(retriever, company_name, industry_name):
"""
Extract key insights for a company and industry based on the retrieved and enriched content.
Args:
retriever: The retriever object for the content.
company_name (str): The name of the company.
industry_name (str): The name of the industry.
Returns:
List[str]: A list of key insights for the company.
"""
llm = ChatOpenAI(
model="gpt-4o-mini",
temperature=0.7,
max_tokens=1000,
openai_api_key=os.getenv("OPENAI_API_KEY")
)
# Step 1: Retrieve and enrich content with subqueries
    enriched_content = retrieve_and_enrich_with_subqueries(retriever, company_name, industry_name)
if not enriched_content.strip():
print("No enriched content generated from the subqueries.")
return ["No insights could be generated due to insufficient data."]
# Step 2: Define a prompt template for extracting insights
prompt_template = """
You are an AI assistant specializing in business insights. Based on the provided data:
- Highlight key strengths and opportunities for the company ({company_name}) in the {industry_name} industry.
- Identify risks, challenges, and market trends relevant to the company.
- Provide actionable recommendations or innovations the company could explore.
Use the following enriched content for your analysis:
{enriched_content}
"""
# Step 3: Generate insights using the enriched content
prompt = PromptTemplate(
input_variables=["company_name", "industry_name", "enriched_content"],
template=prompt_template.strip()
)
    # Build an LCEL runnable sequence (prompt -> model -> string output),
    # replacing the deprecated LLMChain.
    chain = prompt | llm | StrOutputParser()
    try:
        response = chain.invoke(
            {
                "company_name": company_name,
                "industry_name": industry_name,
                "enriched_content": enriched_content,
            }
        )
        insights = [line.strip() for line in response.splitlines() if line.strip()]
        return insights
    except Exception as e:
        print(f"Error during LLM chain execution: {e}")
        return ["Failed to generate insights due to an error."]
def extract_keywords_from_text(input_text):
"""
Extracts keywords from the provided input text using an LLM.
Args:
input_text (str): The input text from which keywords are to be extracted.
Returns:
List[str]: A list of extracted keywords.
"""
llm = ChatOpenAI(
model="gpt-4o-mini",
temperature=0.7,
max_tokens=500,
openai_api_key=os.getenv("OPENAI_API_KEY")
)
prompt_template = """
You are an AI assistant that specializes in extracting relevant keywords from a given text. These keywords will be used to search for relevant datasets on Kaggle and other platforms.
Your task is to analyze the provided content and generate a concise list of important keywords or phrases
that will help in finding relevant datasets.
Text to analyze:
{input_text}
    Provide the 4-5 most important keywords, chosen to be useful for finding relevant datasets, as a comma-separated list.
"""
prompt = PromptTemplate(
input_variables=["input_text"],
template=prompt_template.strip()
)
    # Build an LCEL runnable sequence (prompt -> model -> string output),
    # replacing the deprecated LLMChain.
    chain = prompt | llm | StrOutputParser()
    try:
        response = chain.invoke({"input_text": input_text})
        keywords = response.strip().split(",")
        return [kw.strip() for kw in keywords if kw.strip()]
    except Exception as e:
        print(f"Error during keyword extraction: {e}")
        return []
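# Illustrative only (commented out): on a short blurb this might return something like
# ["battery production", "autonomous driving", "energy storage"].
# extract_keywords_from_text("Tesla is scaling battery production and autonomous driving software.")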
def evaluate_usecases(usecases):
"""
Evaluate the suggested use cases with the user and decide whether to proceed or revisit.
Args:
usecases (List[str]): List of suggested use cases.
Returns:
bool: True if the use cases are feasible and user wants to proceed, False otherwise.
"""
while True:
user_input = input("\nAre these use cases feasible? (yes/no): ").strip().lower()
if user_input == "yes":
print("Great! Proceeding with dataset search...")
return True
elif user_input == "no":
print("Understood. Redirecting to the researcher agent for refinement...")
return False
else:
print("Invalid input. Please type 'yes' or 'no'.")
def search_datasets(keywords):
"""
Search Kaggle datasets using the provided keywords and return top results.
Args:
keywords (List[str]): List of keywords to search for.
Returns:
Dict[str, List[Tuple[str, str, str, int]]]: A dictionary of keywords and their corresponding top datasets.
"""
results = {}
for keyword in keywords:
datasets = api.dataset_list(search=keyword, sort_by="hottest")
results[keyword] = [
(dataset.ref,
dataset.title,
dataset.size,
dataset.voteCount
)
for dataset in datasets[:5]
]
return results
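# Example (commented out): preview the hottest Kaggle datasets for a single sample keyword.
# for ref, title, size, votes in search_datasets(["renewable energy"])["renewable energy"]:
#     print(ref, title, size, votes)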
def display_dataset_choices(results):
"""
Display dataset options for each keyword and allow the user to select from them.
Args:
results (Dict[str, List[Tuple[str, str, str, int]]]): The dictionary of datasets per keyword.
Returns:
List[Tuple[str, str]]: The list of selected dataset references and titles.
"""
print("Dataset search results:")
selected_datasets = []
for keyword, datasets in results.items():
print(f"\nKeyword: {keyword}")
for i, (ref, title, size, votes) in enumerate(datasets, 1):
print(f" {i}. {title} (Votes: {votes}, Size: {size}) [Ref: {ref}]")
print("\nEnter the numbers of the datasets you want to select for this keyword (comma-separated):")
user_input = input("Your choice: ").strip()
if user_input:
choices = [int(choice) - 1 for choice in user_input.split(",") if choice.isdigit()]
selected_datasets.extend([datasets[i][:2] for i in choices if 0 <= i < len(datasets)])
return selected_datasets
def main():
try:
# Step 1: Gather company and industry names
company_name = input("Enter the company name: ").strip()
industry_name = input("Enter the industry name: ").strip()
print("Searching for relevant web pages...")
urls = search_web(company_name, industry_name)
if not urls:
print("No relevant web pages found.")
return
print("Found URLs:", urls)
print("Number of links found:", len(urls))
# Step 2: Scrape data from web pages
print("Scraping web pages...")
webpage_texts = [scrape_webpage(url) for url in urls]
# Step 3: Search and download PDFs
print("Searching for and downloading PDFs...")
pdf_texts = find_and_download_pdfs(company_name, industry_name)
        # Combine all retrieved content (find_and_download_pdfs returns dicts, so pull out the text)
        all_texts = webpage_texts + [pdf["text"] for pdf in pdf_texts]
        all_texts = [text for text in all_texts if isinstance(text, str) and text.strip()]
        if not all_texts:
            print("No data could be retrieved from the web pages or PDFs.")
            return
# Step 4: Create RAG system
print("Setting up the Retrieval-Augmented Generation (RAG) system using FAISS...")
print(f"Type of all_text {type(all_texts)}")
print(f"First 5 elements of all_texts: {all_texts[:5]}")
retriever = create_rag_system(all_texts)
# Step 5: Extract insights
print("Extracting important insights...")
insights = extract_insights(retriever, company_name, industry_name)
print(f"\nKey insights for {company_name} in the {industry_name} industry:")
print(insights)
        # If the user rejects the use cases, restart the whole flow and stop this run.
        if not evaluate_usecases(insights):
            main()
            return
        print("Extracting keywords from the insights...")
keywords = extract_keywords_from_text(" ".join(insights))
print("Extracted Keywords:", keywords)
# Search for datasets based on keywords
search_results = search_datasets(keywords)
# Display choices and let the user select datasets
selected_datasets = display_dataset_choices(search_results)
if not selected_datasets:
print("No datasets were selected.")
else:
print("\nYou selected the following datasets:")
for ref, title in selected_datasets:
print(f" - {title} [Ref: {ref}]")
# Download selected datasets
download_path = "data/selected_datasets"
os.makedirs(download_path, exist_ok=True)
for ref, title in selected_datasets:
print(f"\nDownloading dataset: {title}...")
api.dataset_download_files(ref, path=download_path, unzip=True)
print(f"\nDatasets downloaded to {download_path}.")
for file in os.listdir(download_path):
print(f" - {file}")
except Exception as e:
print(f"An error occurred: {e}")
if __name__ == "__main__":
main()
# 31 Dec: working fine
# Task 1: add feedback, add search agent
# Task 2: migrate from LangChain to AutoGen