-
Notifications
You must be signed in to change notification settings - Fork 1
/
app.py
153 lines (138 loc) · 5.04 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import streamlit as st
from trafilatura import fetch_url, extract
from trafilatura.settings import use_config
from faq_gen import (
get_df_with_chunks_embedded,
search_material,
generate_questions,
join_chunks,
get_qa_pair,
generate_illustration,
)
# This is a workaround for a bug in trafilatura.
trafilatura_cfg = use_config()
trafilatura_cfg.set("DEFAULT", "EXTRACTION_TIMEOUT", "0")
@st.cache(suppress_st_warning=True, persist=True)
def extract_text_from_url(url: str) -> str:
"""
Extracts the text from the url, and returns the text
:param url: the URL of the article to extract text from
:type url: str
:return: A string of text
"""
print(f"Extracting text from {url}")
downloaded = fetch_url(url)
text_in = extract(downloaded, config=trafilatura_cfg)
return text_in
# A simple Streamlit app that showcases the functionality of the FAQ generator
st.title("Eco FAQs")
st.write(
"Use this tool to generate FAQs for your Climate Change education materials. "
"Inspired by [Project Regeneration Nexus](https://regeneration.org/nexus), "
"it uses OpenAI's APIs to generate questions and answers."
)
st.write(
"The app is currently a POC. It extracts text from the selected nexus solution, "
"chunks the text, embeds those chunks, and uses prompt-chaining to generate "
"questions and answers pertaining to the text."
)
# Extracted using GPT by pasting the HTML blob into playground and prompting with
# "Extract all hrefs from above into a python list" :)
hrefs = [
"/nexus/afforestation",
"/nexus/agroecology",
"/nexus/agroforestry",
"/nexus/asparagopsis",
"/nexus/azolla-fern",
"/nexus/bamboo",
"/nexus/beavers",
"/nexus/biochar",
"/nexus/buildings",
"/nexus/clean-cookstoves",
"/nexus/compost",
"/nexus/degraded-land-restoration",
"/nexus/eating-plants",
"/nexus/education-girls",
"/nexus/electric-vehicles",
"/nexus/electrify-everything",
"/nexus/energy-storage",
"/nexus/fifteen-minute-city",
"/nexus/fire-ecology",
"/nexus/grasslands",
"/nexus/heat-pumps",
"/nexus/mangroves",
"/nexus/marine-protected-areas",
"/nexus/micromobility",
"/nexus/nature-of-cities",
"/nexus/net-zero-cities",
"/nexus/offsets",
"/nexus/ocean-farming",
"/nexus/regenerative-agriculture",
"/nexus/seaforestation",
"/nexus/seagrasses",
"/nexus/silvopasture",
"/nexus/smart-microgrids",
"/nexus/solar",
"/nexus/tidal-salt-marshes",
"/nexus/tropical-forests",
"/nexus/urban-farming",
"/nexus/urban-mobility",
"/nexus/wetlands",
"/nexus/wind",
]
top_similar = 4
left, right = st.columns(2)
form = left.form("template_form")
# url_input = form.text_input("Project Regeneration Nexus solutions on", value="regeneration.org/nexus/agroecology")
url_input = form.selectbox(
"Project Regeneration Nexus Solutions on",
list(map(lambda href: href.split("/")[-1], hrefs)),
)
topic_input = form.text_input(
"Relevant Topic (optional)",
placeholder="food diversity",
value="action items",
)
audience_input = form.text_input(
"Audience (optional)", placeholder="students", value="students"
)
num_questions_input = form.number_input(
"Number of Questions", min_value=2, max_value=5, value=3
)
submit = form.form_submit_button("Generate FAQs")
with st.container():
if submit and url_input:
with st.spinner("Generating FAQs..."):
text = extract_text_from_url(
"https://regeneration.org/nexus/" + url_input
)
df = get_df_with_chunks_embedded(text)
res = search_material(df, topic_input, n=top_similar)
chunks = join_chunks(res["chunk"])
generated_questions = generate_questions(
chunks=chunks,
topic=topic_input,
audience=audience_input,
num_questions=num_questions_input,
)
# TODO: could be done in parallel ##
# with concurrent.futures.ThreadPoolExecutor() as executor:
# future1 = executor.submit(get_qa_pair, generated_questions, df)
# future2 = executor.submit(generate_illustration, url_input)
# qa_pair = future1.result()
# illustration_url = future2.result()
# executor.shutdown(wait=True)
qa_pair = get_qa_pair(generated_questions, df)
illustration_url = generate_illustration(prompt=url_input)
# if we need content moderation
# qa_pair = [pair for pair in qa_pair if not is_flagged_for_content_violation(pair['answer'])]
with right:
right.image(illustration_url)
right.write(
"Reference to the page: [{}]({})".format(
url_input, "https://regeneration.org/nexus/" + url_input
)
)
for pair in qa_pair:
expander = right.expander(f"{pair['question']}")
expander.write(f"{pair['answer']}")