# Source: DigitalLibrary/Main.py at main · ecrabtreee/DigitalLibrary · GitHub
'''
Main.py

Author: Emily Crabtree
Date: May 21, 2026
Description: Takes a user's current Goodreads data and analyzes it to give insights into their
             reading habits and their progress towards a yearly reading goal.
'''

# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

class Main:

    # Initializing main class
    def __init__ (self, filepath: str = "goodreads_data.csv"):
        self.df = pd.read_csv(filepath)
        self.df = self.df.fillna("")
        # Convert Number of Pages to numeric
        self.df["Number of Pages"] = pd.to_numeric(self.df["Number of Pages"], errors='coerce')
        self.setGoals()
        self.readingAnalytics(self.df)

        # Asking user if they want to see different stats and visualizations
        # Also breaks up the amount of content shown at once so they arent bombarded with info
        question = input("Would you like to see your speed stats? (Y/N) ")
        if question.upper() == "Y":
            self.readingTimeStats()

        question = input("Would you like to see your genre and author stats? (Y/N) ")
        if question.upper() == "Y":
            self.readingGenresAuthors()

        question = input("Would you like to see your ratings and recommendations? (Y/N) ")
        if question.upper() == "Y":
            self.comparingRatingsRecommendingBooks()

        question = input("Would you like to see some visualizations of your data? (Y/N) ")
        if question.upper() == "Y":
            self.statsDataVisualization()

    # Setting yearly goals and getting current date from user
    def setGoals(self):
        self.date=input("What is today's date? (YYYY-MM-DD) ")
        self.year = self.date.split("-")[0]
        self.month = self.date.split("-")[1]
        self.day = self.date.split("-")[2]

        self.yearlyGoal = int(input("What is your yearly reading goal (in books)? "))
        #self.dailyPageGoal = int(input("What is your daily page goal? "))

    # Analyzing reader data and calculating analytics
    def readingAnalytics(self, df):
        #print(df.head())

        # Getting yearly progress
        self.yearlyProgress = len(self.df[self.df["Date Read"].str.contains(self.year, na=False)])
        print(f"You are {(self.yearlyProgress *100)/self.yearlyGoal}% done with your reading goal..\n")

        # Tracking number of books already read each month (using / format for dates)
        self.januaryBooks = len(self.df[self.df["Date Read"].str.contains(f"{self.year}/01", na=False)])
        self.FebruaryBooks = len(self.df[self.df["Date Read"].str.contains(f"{self.year}/02", na=False)])
        self.MarchBooks = len(self.df[self.df["Date Read"].str.contains(f"{self.year}/03", na=False)])
        self.AprilBooks = len(self.df[self.df["Date Read"].str.contains(f"{self.year}/04", na=False)])
        self.MayBooks = len(self.df[self.df["Date Read"].str.contains(f"{self.year}/05", na=False)])
        self.JuneBooks = len(self.df[self.df["Date Read"].str.contains(f"{self.year}/06", na=False)])
        self.JulyBooks = len(self.df[self.df["Date Read"].str.contains(f"{self.year}/07", na=False)])
        self.AugustBooks = len(self.df[self.df["Date Read"].str.contains(f"{self.year}/08", na=False)])
        self.SeptemberBooks = len(self.df[self.df["Date Read"].str.contains(f"{self.year}/09", na=False)])
        self.OctoberBooks = len(self.df[self.df["Date Read"].str.contains(f"{self.year}/10", na=False)])
        self.NovemberBooks = len(self.df[self.df["Date Read"].str.contains(f"{self.year}/11", na=False)])
        self.DecemberBooks = len(self.df[self.df["Date Read"].str.contains(f"{self.year}/12", na=False)])

        # Display current month's books
        monthNames = ["", "January", "February", "March", "April", "May", "June",
                      "July", "August", "September", "October", "November", "December"]
        currentMonthNum = int(self.month)
        currentMonthBooks = len(self.df[self.df["Date Read"].str.contains(f"{self.year}/{self.month}", na=False)])
        print(f"You read {currentMonthBooks} books this month ({monthNames[currentMonthNum]}).\n")

        # Calculate average number of books read per month so far this year
        totalBooks = 0
        for monthNum in range(1, currentMonthNum + 1):
            totalBooks += len(self.df[self.df["Date Read"].str.contains(f"{self.year}/{monthNum:02d}", na=False)])
        self.avgBooksPerMonth = totalBooks / currentMonthNum

        print(f"You are averaging {self.avgBooksPerMonth:.2f} books per month so far this year.\n")

        # Deeper dive into monthly averages
        question = input("Would you like to see how many books you've read in each month? (Y/N) ")
        if question == "Y":
            print(f"\nYou read {self.januaryBooks} books in January, {self.FebruaryBooks} books in February, {self.MarchBooks} books in March, {self.AprilBooks} books in April, {self.MayBooks} books in May, {self.JuneBooks} books in June, {self.JulyBooks} books in July, {self.AugustBooks} books in August, {self.SeptemberBooks} books in September, {self.OctoberBooks} books in October, {self.NovemberBooks} books in November, and {self.DecemberBooks} books in December.")


    # Reading velocity and time-based statistics
    def readingTimeStats(self):
        print("\n" + "="*60)
        print("READING TIME STATISTICS")
        print("="*60 + "\n")

        # Filter books that have been read
        read_books = self.df[self.df["Date Read"] != ""].copy()

        if len(read_books) == 0:
            print("No books read yet.\n")
            return

        # Convert date columns to datetime for calculations
        read_books["Date Read"] = pd.to_datetime(read_books["Date Read"], format="%Y/%m/%d", errors='coerce')
        read_books["Date Added"] = pd.to_datetime(read_books["Date Added"], format="%Y/%m/%d", errors='coerce')

        # Remove rows with invalid dates
        read_books = read_books.dropna(subset=["Date Read", "Date Added"])

        # Calculate days to complete book
        read_books["Days to Complete"] = (read_books["Date Read"] - read_books["Date Added"]).dt.days
        read_books["Days to Complete"] = read_books["Days to Complete"].clip(lower=1)
        read_books["Pages per Day"] = read_books["Number of Pages"] / read_books["Days to Complete"]

        # Check if all invalid
        if len(read_books) == 0:
            print("No valid date data to analyze.\n")
            return

        # Overall average stats for pages and days
        avg_pages_per_day = read_books["Pages per Day"].mean()
        avg_days_to_complete = read_books["Days to Complete"].mean()
        avg_book_length = read_books["Number of Pages"].mean()

        print(f"Average pages per day: {avg_pages_per_day:.2f}")
        print(f"Average days to complete a book: {avg_days_to_complete:.1f} days")
        print(f"Average book length: {avg_book_length:.0f} pages\n")

        # Fastest and slowest reads
        fastest_idx = read_books["Pages per Day"].idxmax()
        slowest_idx = read_books["Pages per Day"].idxmin()

        if pd.notna(fastest_idx) and pd.notna(slowest_idx):
            fastest = read_books.loc[fastest_idx]
            slowest = read_books.loc[slowest_idx]

            print(f"Fastest read: '{fastest['Title']}' ({fastest['Pages per Day']:.1f} pages/day)")
            print(f"Slowest read: '{slowest['Title']}' ({slowest['Pages per Day']:.1f} pages/day)\n")

        # Books per week per inputted year
        year_reads = read_books[read_books["Date Read"].dt.year == int(self.year)]
        if len(year_reads) > 0:
            date_range = year_reads["Date Read"].max() - year_reads["Date Read"].min()
            weeks_elapsed = date_range.days / 7
            if weeks_elapsed > 0:
                books_per_week = len(year_reads) / weeks_elapsed
                print(f"Books per week (this year): {books_per_week:.2f}\n")

        # Longest reading gap
        sorted_reads = read_books.sort_values("Date Read")
        if len(sorted_reads) > 1:
            gaps = sorted_reads["Date Read"].diff()
            longest_gap = gaps.max().days
            print(f"Longest reading gap: {longest_gap} days\n")

    # Genre and author analysis
    def readingGenresAuthors(self):
        print("="*60)
        print("AUTHOR & GENRE ANALYSIS")
        print("="*60 + "\n")

        # Separate by shelf type
        read_books = self.df[(self.df["Exclusive Shelf"] == "read") | (self.df["Exclusive Shelf"] == "currently-reading")]
        to_read_books = self.df[self.df["Exclusive Shelf"] == "to-read"]

        # === READ BOOKS ANALYSIS ===
        print("BOOKS YOU'VE READ (or are reading):\n")

        top_authors_read = read_books["Author"].value_counts().head(10)
        if len(top_authors_read) > 0:
            print("Top 10 Authors by Books Completed:")
            for i, (author, count) in enumerate(top_authors_read.items(), 1):
                print(f"  {i}. {author}: {count} books")
            print()

        # Average rating by author for read books (including rereads)
        rated_read = read_books[read_books["My Rating"] != 0]
        if len(rated_read) > 0:
            author_stats_read = rated_read.groupby("Author").agg({
                "My Rating": ["mean", "count"]
            }).round(2)
            author_stats_read.columns = ["Avg Rating", "Count"]
            author_stats_read = author_stats_read[author_stats_read["Count"] > 1].sort_values("Avg Rating", ascending=False).head(10)

            if len(author_stats_read) > 0:
                print("Top Rated Authors (with 2+ books read):")
                for author, row in author_stats_read.iterrows():
                    print(f"  {author}: {row['Avg Rating']:.2f}/5 ({int(row['Count'])} books)")
                print()

        # === TO-READ BOOKS ANALYSIS ===
        print("BOOKS IN YOUR TO-READ LIST:\n")

        if len(to_read_books) > 0:
            top_authors_tbr = to_read_books["Author"].value_counts().head(10)
            print(f"Most Common Authors in To-Read ({len(to_read_books)} total books):")
            for i, (author, count) in enumerate(top_authors_tbr.items(), 1):
                print(f"  {i}. {author}: {count} books")
            print()
        else:
            print("Your To-Read list is empty, go find some :(\n")

        # === GENRE/SHELF ANALYSIS ===
        print("GENRE/SHELF BREAKDOWN:\n")

        print("All Books by Shelf/Genre:")
        shelf_counts = {}
        for shelves in self.df["Bookshelves"]:
            if shelves != "":
                for shelf in shelves.split(", "):
                    shelf_counts[shelf] = shelf_counts.get(shelf, 0) + 1

        for shelf, count in sorted(shelf_counts.items(), key=lambda x: x[1], reverse=True)[:10]:
            print(f"  {shelf}: {count} books")

        print()

        # === REREAD ANALYSIS ===
        reread_books = self.df[self.df["Read Count"] > 1]
        if len(reread_books) > 0:
            print(f"Books You've Reread: {len(reread_books)}")
            for idx, book in reread_books.iterrows():
                print(f"  '{book['Title']}' - read {int(book['Read Count'])} times")
            print()


    # Rating comparison and recommendations
    def comparingRatingsRecommendingBooks(self):
        print("="*60)
        print("RATINGS & RECOMMENDATIONS")
        print("="*60 + "\n")

        # Average rating for read books
        read_books = self.df[(self.df["Date Read"] != "") & (self.df["My Rating"] != 0)]
        if len(read_books) > 0:
            avg_your_rating = read_books["My Rating"].mean()
            print(f"Your average rating for books read: {avg_your_rating:.2f}/5.0\n")

        # To-read books sorted by potential interest
        to_read = self.df[self.df["Exclusive Shelf"] == "to-read"].copy()

        if len(to_read) > 0:
            print(f"Books in your To-Read list: {len(to_read)}\n")

            # Identify favorite authors
            read_authors = read_books["Author"].value_counts()
            favorite_authors = read_authors.head(5).index.tolist()

            # Find to-read books by favorite authors
            fav_author_to_read = to_read[to_read["Author"].isin(favorite_authors)]
            if len(fav_author_to_read) > 0:
                print(f"To-Read books by your favorite authors:")
                for idx, book in fav_author_to_read.head(5).iterrows():
                    print(f"  - '{book['Title']}' by {book['Author']}")
                print()

        # Currently reading
        currently_reading = self.df[self.df["Exclusive Shelf"] == "currently-reading"]
        if len(currently_reading) > 0:
            print(f"\nCurrently Reading ({len(currently_reading)} book(s)):")
            for idx, book in currently_reading.iterrows():
                print(f"  - '{book['Title']}' by {book['Author']} ({book['Number of Pages']} pages)")

    # Data visualization
    def statsDataVisualization(self):
        """Show a 2x2 matplotlib dashboard built from ``self.df``.

        Panels:
            1. Books read per month of ``self.year`` up to ``self.month``, with
               a linear trend line when there are at least two months.
            2. Share of this year's books per author (top 7 plus "Other").
            3. Average pages per day for each year found in "Date Read",
               measured from "Date Added" to "Date Read" with a floor of one
               day.
            4. This year's five most-read authors.

        Calls ``plt.show()`` and prints a confirmation line; returns nothing.
        """
        import matplotlib.pyplot as plt

        print("\n" + "="*60)
        print("GENERATING VISUALIZATIONS")
        print("="*60 + "\n")

        _, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))

        dates_read = self.df["Date Read"]

        # === VISUALIZATION 1: Books per month with trend line ===
        monthNames = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
        currentMonthNum = int(self.month)
        monthBooks = [
            int(dates_read.str.contains(f"{self.year}/{monthNum:02d}", na=False, regex=False).sum())
            for monthNum in range(1, 13)
        ]

        # Only plot the months that have happened so far
        months = monthNames[:currentMonthNum]
        books = monthBooks[:currentMonthNum]

        ax1.bar(months, books, color='steelblue', alpha=0.7, edgecolor='navy')

        # Add trend line (a straight-line fit needs at least two points)
        if len(books) > 1:
            positions = np.arange(len(books))
            trend = np.poly1d(np.polyfit(positions, books, 1))
            ax1.plot(positions, trend(positions), "r--", linewidth=2, label='Trend')
            ax1.legend()

        ax1.set_ylabel('Number of Books', fontsize=11, fontweight='bold')
        ax1.set_xlabel('Month', fontsize=11, fontweight='bold')
        ax1.set_title('Books Read per Month (This Year)', fontsize=13, fontweight='bold')
        ax1.grid(axis='y', alpha=0.3)

        # === VISUALIZATION 2: Author demographic pie chart (this year) ===
        read_this_year = dates_read.str.contains(self.year, na=False, regex=False) & (dates_read != "")
        year_reads = self.df[read_this_year]
        author_counts = year_reads["Author"].value_counts()

        if len(author_counts) > 0:
            # Show top 7 authors, group rest as "Other"
            if len(author_counts) > 7:
                other_count = author_counts.iloc[7:].sum()
                plot_data = pd.concat([author_counts.head(7), pd.Series({'Other': other_count})])
            else:
                plot_data = author_counts

            colors = plt.cm.Set3(range(len(plot_data)))
            _, _, autotexts = ax2.pie(plot_data.values, labels=plot_data.index, autopct='%1.1f%%',
                                      colors=colors, startangle=90)
            for autotext in autotexts:
                autotext.set_color('black')
                autotext.set_fontweight('bold')
                autotext.set_fontsize(9)

            ax2.set_title(f'Author Demographics (Books Read This Year: {len(year_reads)})',
                          fontsize=13, fontweight='bold')

        # === VISUALIZATION 3: Reading speed by year ===
        all_reads = self.df[dates_read != ""].copy()
        for column in ("Date Read", "Date Added"):
            all_reads[column] = pd.to_datetime(all_reads[column], format="%Y/%m/%d", errors='coerce')
        all_reads = all_reads.dropna(subset=["Date Read", "Date Added"])

        # Compute the pace for every book once and average it per year. The
        # earlier per-year loop wrote a "Days" column into a filtered slice,
        # which triggers pandas' SettingWithCopyWarning.
        days = (all_reads["Date Read"] - all_reads["Date Added"]).dt.days.clip(lower=1)
        pages = pd.to_numeric(all_reads["Number of Pages"], errors='coerce')
        yearly_speeds = (pages / days).groupby(all_reads["Date Read"].dt.year).mean()

        if len(yearly_speeds) > 0:
            years = [int(year) for year in yearly_speeds.index]
            speeds = list(yearly_speeds.values)

            ax3.plot(years, speeds, marker='o', linewidth=2, markersize=8, color='darkgreen')
            ax3.fill_between(years, speeds, alpha=0.3, color='green')
            ax3.set_xticks(years)  # whole years only, no "2023.5" ticks
            ax3.set_ylabel('Pages/Day', fontsize=11, fontweight='bold')
            ax3.set_xlabel('Year', fontsize=11, fontweight='bold')
            ax3.set_title('Reading Speed Trend (Pages per Day)', fontsize=13, fontweight='bold')
            ax3.grid(True, alpha=0.3)

            # Add value labels on points; a year without page counts has no
            # value to label
            for year, speed in zip(years, speeds):
                if pd.notna(speed):
                    ax3.text(year, speed + 0.1, f'{speed:.2f}', ha='center', fontsize=9, fontweight='bold')

        # === VISUALIZATION 4: Top authors bar chart (this year) ===
        top_5_authors = author_counts.head(5)
        if len(top_5_authors) > 0:
            rows = range(len(top_5_authors))
            ax4.barh(rows, top_5_authors.values, color='coral', alpha=0.8, edgecolor='darkred')
            ax4.set_yticks(rows)
            ax4.set_yticklabels(top_5_authors.index)
            ax4.set_xlabel('Books Read', fontsize=11, fontweight='bold')
            ax4.set_title('Top 5 Authors (This Year)', fontsize=13, fontweight='bold')
            ax4.grid(axis='x', alpha=0.3)

            # Add value labels
            for i, v in enumerate(top_5_authors.values):
                ax4.text(v + 0.05, i, str(int(v)), va='center', fontweight='bold')

        plt.tight_layout()
        plt.show()

        print("Visualizations displayed in new window\n")

# Start the interactive report only when this file is run as a script
if __name__ == "__main__":
    app = Main(filepath="goodreads_data.csv")