-
Notifications
You must be signed in to change notification settings - Fork 1
/
main.py
169 lines (160 loc) · 7.31 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
"""
This script scans a specified directory for audio files, and for each file,
finds lyrics from Lyricsify.com or Genius.com (as a fallback),
and saves them to the file's metadata.
"""
import sys
import urllib
import json
from bs4 import BeautifulSoup
import requests
import os
import re
import eyed3
from dotenv import load_dotenv
load_dotenv()
def lyricsify_find_song_lyrics(query):
    """
    Return song lyrics from Lyricsify.com for the first song found using the provided search string.
    If not found, return None.

    The first search hit is only accepted when the title shown in the song
    page's heading (and the artist, when the heading contains a "-") occurs in
    `query`, compared case-insensitively. None is also returned when the song
    page lacks the expected heading or lyrics container.
    Exceptions raised by `requests` (connection errors, timeouts) propagate.
    """
    headers = {"User-Agent": ""}
    # Without a timeout a stalled connection would block the whole run forever.
    timeout = 30

    # Search Lyricsify for the song using web scraping. The query goes through
    # `params` so requests percent-encodes characters such as "&" or "#" that
    # would otherwise corrupt the query string; spaces are still sent as "+".
    search_page = requests.get(
        url="https://www.lyricsify.com/search",
        params={"q": query.replace(" - ", " ")},
        headers=headers,
        timeout=timeout,
    )
    link = BeautifulSoup(search_page.text, "html.parser").find("a", class_="title")
    # If not found (or the hit carries no target URL), return None
    if link is None or not link.get("href"):
        return None

    # Scrape the song URL for the lyrics text
    song_page = requests.get(
        url="https://www.lyricsify.com" + link["href"],
        headers=headers,
        timeout=timeout,
    )
    song_html = BeautifulSoup(song_page.text, "html.parser")

    heading = song_html.find("h1")
    lyrics_div = song_html.find("div", id="entry")
    # `.string` is None when the heading holds more than one child node, so it
    # has to be checked separately from the tag itself.
    if heading is None or heading.string is None or lyrics_div is None:
        return None

    # Drop the last 7 characters of the heading
    # (presumably a " Lyrics" suffix - TODO confirm against the live site).
    artist_title = heading.string[:-7]
    before, separator, after = artist_title.partition("-")
    if separator:
        artist = before.strip()
        title = after.strip()
    else:
        # No separator: the whole heading is treated as the title.
        artist = None
        title = artist_title

    # If the artist or song name does not exist in the query, return None
    query_lower = query.lower()
    if title.lower() not in query_lower:
        return None
    if artist is not None and artist.lower() not in query_lower:
        return None

    # Return the lyrics text
    return "".join(lyrics_div.strings)
def _merge_split_lyric_lines(lines):
    """
    Tidy the raw text lines scraped from a Genius lyrics page.

    A blank line is placed before every line starting with "[", and then a
    line is glued onto the previous output line when any of these holds:
    the next line is a lone ")" or "]", the line itself is a lone ")" or "]",
    the previous line ends with a space, or the line starts with a space.
    """
    # Put an empty line in front of each line that starts with "[".
    # `startswith` is used instead of indexing so empty lines cannot raise.
    spaced = []
    for line in lines:
        if line.startswith("["):
            spaced.append("")
        spaced.append(line)

    merged = []
    last = len(spaced) - 1
    for i, line in enumerate(spaced):
        next_is_closer = i < last and spaced[i + 1] in (")", "]")
        is_closer = line in (")", "]")
        continues_previous = (
            (i > 0 and spaced[i - 1].endswith(" ")) or line.startswith(" ")
        )
        # `merged` may still be empty for the very first line; in that case
        # there is nothing to glue onto, so the line starts the output.
        if merged and (next_is_closer or is_closer or continues_previous):
            merged[-1] += line
        else:
            merged.append(line)
    return merged


def genius_find_song_lyrics(query, access_token):
    """
    Return song lyrics from Genius.com for the first song found using the provided search string.
    If not found, return None.
    Requires a Genius.com access token.

    The first hit is only accepted when it has a URL and both its title and
    its primary artist name occur in `query`, compared case-insensitively.
    The returned text starts with "[ti:...]" and "[ar:...]" lines followed by
    the lyrics. A song page without a "lyrics-root" element yields
    "[Instrumental]" as its lyrics.
    Exceptions raised by `requests` (connection errors, timeouts) propagate.
    """
    # Imported here because the file only does `import urllib`, which by itself
    # does not guarantee that the `parse` submodule has been loaded.
    from urllib.parse import quote

    # Without a timeout a stalled connection would block the whole run forever.
    timeout = 30

    # Search Genius for the song using their API
    response = requests.get(
        url="https://api.genius.com/search?q=" + quote(query),
        headers={
            "Authorization": "Bearer " + access_token,
            "User-Agent": ""
        },
        timeout=timeout,
    )
    results = json.loads(response.text)

    # If no hits, return None
    hits = results["response"]["hits"]
    if not hits:
        return None

    # If the song has no URL or the artist or song name does not exist in the query, return None
    song = hits[0]["result"]
    query_lower = query.lower()
    if song["url"] is None:
        return None
    if song["title"].lower() not in query_lower:
        return None
    if song["primary_artist"]["name"].lower() not in query_lower:
        return None

    # Scrape the song URL for the lyrics text
    page = requests.get(song["url"], timeout=timeout)
    html = BeautifulSoup(page.text, "html.parser")
    target_div = html.find("div", id="lyrics-root")
    if target_div is None:
        # This usually means the song is an instrumental
        # (exists on the site and was found, but no lyrics)
        lyrics = ["[Instrumental]"]
    else:
        # The first line and the last two lines of the extracted text are dropped
        # (presumably non-lyric page text - TODO confirm against the live site).
        lyrics = "\n".join(target_div.strings).split("\n")[1:-2]

    # The extracted lyrics text is mangled, needs some processing before it is returned...
    final_lyrics = _merge_split_lyric_lines(lyrics)
    return (
        "[ti:" + song["title_with_featured"] + "]\n"
        + "[ar:" + song["primary_artist"]["name"] + "]\n"
        + "\n".join(final_lyrics)
    )
# First, ensure user input exists.
# os.getenv returns None when the variable is unset, so "unset" and "empty"
# are both normalised to None here (calling len() on None would raise).
genius_access_token = os.getenv("GENIUS_ACCESS_TOKEN") or None
if genius_access_token is None:
    print("Note: The GENIUS_ACCESS_TOKEN environment variable has not been defined. Only Lyricsify.com will be used as a data source.")
if len(sys.argv) < 2:
    raise NameError(
        "The song directory path has not been provided as a parameter.")
song_dir = sys.argv[1]

# For each file in the songs directory, grab the artist/title and use them to find
# Lyricsify.com lyrics (with Genius.com as a fallback) and save them to the file.
# Each entry is a (name-without-extension, extension) pair. Subdirectories are
# skipped because eyed3.load rejects paths that are not regular files.
files = [
    os.path.splitext(each)
    for each in os.listdir(song_dir)
    if os.path.isfile(os.path.join(song_dir, each))
]

# To suppress CRC check failed warnings - as a pre-existing CRC issue should not affect lyrics
eyed3.log.setLevel("ERROR")

for i, file in enumerate(files):
    file_name = file[0] + file[1]
    progress = str(i+1) + "\tof " + str(len(files))

    audio_file = eyed3.load(os.path.join(song_dir, file_name))
    if audio_file is None:
        print(progress + " : Failed : Unsupported file format : " + file_name)
        continue

    if audio_file.tag is None:
        audio_file.initTag()

    # Fall back to the file name ("Artist - Title") when the tag does not
    # provide both values; this also covers tags that exist but are incomplete,
    # which would otherwise fail when the search query is built below.
    if not audio_file.tag.artist or not audio_file.tag.title:
        name_artist, separator, name_title = file[0].partition("-")
        name_artist = name_artist.strip()
        name_title = name_title.strip()
        if separator and name_artist and name_title and not file[0].endswith("-"):
            # Only fill in what is missing; keep any value the tag already has.
            if not audio_file.tag.artist:
                audio_file.tag.artist = name_artist
            if not audio_file.tag.title:
                audio_file.tag.title = name_title
            print(progress +
                  " : Warning : Artist/Title inferred from file name : " + file_name)
        else:
            print(progress + " : Failed : Artist/Title could not be found : " + file_name)
            continue

    existing_lyrics = "".join(lyric.text for lyric in audio_file.tag.lyrics)
    if existing_lyrics.strip():
        print(progress + " : Warning : File already has lyrics - skipped : " + file_name)
        continue

    # Note: re.sub removes anything in round brackets - used for "(feat. ...)",
    # as this improves search results.
    query = re.sub(r" ?\([^)]+\)", "",
                   audio_file.tag.artist + " - " + audio_file.tag.title)

    site_used = "Lyricsify"
    try:
        lyrics = lyricsify_find_song_lyrics(query)
    except Exception:
        # Name the offending file, then re-raise with the original traceback.
        print("Error getting Lyricsify lyrics for: " + file_name)
        raise

    if lyrics is None and genius_access_token is not None:
        site_used = "Genius "
        try:
            lyrics = genius_find_song_lyrics(query, genius_access_token)
        except Exception:
            print("Error getting Genius lyrics for: " + file_name)
            raise

    if lyrics is not None:
        audio_file.tag.lyrics.set(lyrics)
        audio_file.tag.save()
        print(progress + " : Success : Lyrics from " + site_used + " saved to : " + file_name)
    else:
        print(progress + " : Failed : Lyrics not found for : " + file_name)
# To generate lrc files from AutoLyricize-processed audio files if needed (bash script, requires exiftool):
# for f in *; do lrc="$(exiftool -lyrics "$f" | tail -c +35 | sed 's/\.\./\n/g' | sed 's/\.\[/\n[/g')"; if [ -n "$lrc" ]; then echo "$lrc" > "${f%.*}".lrc; fi; done