legacy_scrapeAll.py

School

Rutgers University *

*We aren’t endorsed by this school

Course

213

Subject

Computer Science

Date

May 2, 2024

Type

py

Pages

23

Uploaded by PrivateDragonMaster1090 on coursehero.com

import datetime
import json
import operator
import sys

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

# Example requests against the old Schedule of Classes endpoint:
#   http://sis.rutgers.edu/oldsoc/courses.json?subject=198&semester=12022&campus=NB&level=U
#   http://sis.rutgers.edu/oldsoc/courses.json?subject=198&semester=12022&campus=NB,NK,CM&level=U
# The above works but only gets subject 198, which isn't very useful.
#   http://sis.rutgers.edu/oldsoc/courses.json?semester=12022&campus=CM&level=U,G

# Breaking down the API: base URL of the old endpoint.
base_url = "http://sis.rutgers.edu/oldsoc/courses.json"

# All courses are about 20 MB unzipped; surprisingly only 871 KB gzipped.

# Newer API:
#   https://sis.rutgers.edu/soc/api/courses.json?year=2022&term=9&campus=NB
# This does work.
# split term off
# get all courses for New Brunswick in one file

# Module-level accumulators; they start empty here.
currentDayList = []
locationList = []
campusList = []
buildingCodeList = []

# Get the year argument from the command line.
# Expecting a 4-digit year. Raises IndexError when the argument is missing.
year = sys.argv[1]

# Expecting a 1-digit term code:
# 7 - summer
# 9 - fall
# 0 - winter
# 1 - spring
# Term code from the command line; raises IndexError when missing.
term = sys.argv[2]

# Expecting NB,NK,CM
schoolcampus = sys.argv[3]

# If unset, use default (figure out from current date).
# currCalendarYear = datetime.datetime.now().year
# currMonth = datetime.datetime.now().month
# currDay = datetime.datetime.now().day

# Found this from
# https://github.com/anxious-engineer/Rutgers-Course-API/blob/56af65bd17c5ddf512be5bb2a3d28f05c8d50085/DB-PoC/rusoc/api.py
#
# Maps a calendar month to a term code. Every month is present both as a
# string key and as an int key, so either form can be used for lookup.
# Months 1-6 map to '1', 7-8 to '7', and 9-12 to '9'.
month_to_semester_map = {
    '1': '1', 1: '1',
    '2': '1', 2: '1',
    '3': '1', 3: '1',
    '4': '1', 4: '1',
    '5': '1', 5: '1',
    '6': '1', 6: '1',
    '7': '7', 7: '7',
    '8': '7', 8: '7',
    '9': '9', 9: '9',
    '10': '9', 10: '9',
    '11': '9', 11: '9',
    '12': '9', 12: '9',
}

# if year == "":
#     # get current school year
#     year = str(currCalendarYear)
# if term == "":
#     # determine which term it is
#     # after jan 15th and before may 2 is spring session
#     if currMonth >= 1 and currMonth <= 4:
#         # spring
#         term = "1"
#     elif currMonth >= 5 and currMonth <= 8:
#         # summer
#         # NOTE(review): the term-code list next to the argument parsing
#         # gives 7 for summer, but this sets "5" -- confirm before
#         # re-enabling this block.
#         term = "5"
#     elif currMonth >= 9 and currMonth <= 12:
#         # fall
#         term = "9"
# if schoolcampus == "":
#     schoolcampus = "NB"

# https://sis.rutgers.edu/soc/?term=92022
# This page can convert PH -> PHARMACY & id:3750
# nameConversion
# The query string is the term code immediately followed by the year
# (e.g. "9" + "2022" -> "92022"). No space may follow the "?".
nameConvertUrl = "https://sis.rutgers.edu/soc/?term=" + term + year
print(nameConvertUrl)
# r = requests.get("https://sis.rutgers.edu/soc/?term=92022")
r = requests.get(nameConvertUrl)
soup = BeautifulSoup(r.content, features="html.parser")

# The page embeds its data as JSON text inside <div id="initJsonData">.
jsonDiv = json.loads(soup.find('div', {"id": "initJsonData"}).text)

# Parallel lists: position i in each list describes the same building.
buildingCodes = [d["code"] for d in jsonDiv["buildings"]]
buildingNames = [d["name"] for d in jsonDiv["buildings"]]
buildingIDs = [d["id"] for d in jsonDiv["buildings"]]


def getBuildingID(buildingCodeAbbrev):
    """Return the name and id of the building with the given code.

    The code is looked up in ``buildingCodes``; the first match wins.

    Returns:
        A dict with keys ``"name"`` and ``"id"``.

    Raises:
        ValueError: if ``buildingCodeAbbrev`` is not in ``buildingCodes``.
    """
    tempIndex = buildingCodes.index(buildingCodeAbbrev)
    return {
        "name": buildingNames[tempIndex],
        "id": buildingIDs[tempIndex],
    }


# print(dict.keys(jsonDiv))
# print(json.dumps(jsonDiv, indent=2))
# print(json.dumps(jsonDiv['buildings'], indent=2))
# Get coords of buildings.
# For now, import file. Later, pull directly from URL.
# The file is read inside a ``with`` block so the handle is closed as soon
# as the JSON has been parsed.
with open("fromMap/buildings-parking-layer.json") as buildingLayer:
    buildingLayerJson = json.load(buildingLayer)
# print(json.dumps(buildingLayerJson, indent=2))
# print(dict.keys(buildingLayerJson))

# Feature ids in file order; position i matches buildingLayerJson['features'][i].
buildingLayerIndex = [d["id"] for d in buildingLayerJson['features']]


def getBuildingGeoData(buildingID):
    """Return the geometry and properties of the feature with this id.

    Returns:
        A dict with keys ``"geometry"`` and ``"properties"`` taken from the
        first feature whose ``"id"`` equals ``buildingID``, or ``None`` when
        no feature has that id.
    """
    try:
        position = buildingLayerIndex.index(buildingID)
    except ValueError:
        # Unknown id: callers get None rather than an exception.
        return None
    curr = buildingLayerJson['features'][position]
    return {
        "geometry": curr['geometry'],
        "properties": curr['properties'],
    }


# Get coords of regions.
# NOT HELPFUL!
# campusLayer = open("fromMap/districts.json")
# campusLayerJSON = json.load(campusLayer)
# #print(json.dumps(buildingLayerJson, indent=2))
# #print(dict.keys(buildingLayerJson))
# campusLayerIndex = [d["id"] for d in campusLayerJSON['features']]
# print(campusLayerIndex)
# def getDistrictGeoData(campusID):
#     campusID = int(campusID)
#     if campusID in campusLayerIndex:
#         #print("Campus id:"+str(campusID))
#         curr = campusLayerJSON['features'][campusLayerIndex.index(campusID)]
#         return {
#             "geometry": curr['geometry'],
#             "properties": curr['properties']
#         }
#     else:
#         return

# Map API:
# https://maps.rutgers.edu/#/?click=true&selected=3117
map_base_url = ""
# Image API: prefix and suffix for a building image URL.
# NOTE(review): presumably a building id goes between the two parts --
# confirm against the code that uses them.
image_base_url = "https://storage.googleapis.com/rutgers-campus-map-building-images-prod/"
image_end_url = "/00.jpg"

# Import now, download later.
# coursedataFile = open("fromSIS/courses2.json")
# coursedataJSON = json.load(coursedataFile)
# download
# https://sis.rutgers.edu/soc/api/courses.json?year=2022&term=9&campus=NB,NK,CM

# Could look into
# https://scheduling.rutgers.edu/scheduling/class-scheduling/standard-course-periods-0
# to help compress the data. Since most classes follow the standard course
# periods listed there, the abbreviations can be used to express the
# standard class times and the full times can be used for the exceptions.

# Standard Course Periods
# 80min Periods      55min Periods          180min Periods
# 1 8:30 9:40am      1,2 8:30 11:40am       2* 10:35 1:20pm
# 3 12:10 1:20pm     3,4 12:10 3:20pm       4* 2:15 5:00pm
# 5 3:50 5:00pm      5,6 3:50 7:00pm        6* 5:55 8:40pm
# 7 7:30 8:40pm      Grad Eve 6:00 10:40pm  8* 9:35 10:30pm

# Standard period combinations: 2x80min, 2x55min
#    Monday  Tuesday  Wednesday  Thursday  Friday
# 1  MTh1    TF1      W1F6       MTh1      TF1
# 2  MTh2    TF2      W2F5       MTh2      TF2
# 3  MTh3    TF3      W3F4       MTh3      TF3
# 4  MW4     TTh4     MW4        TTh4      W3F4
# 5  MW5     TTh5     MW5        TTh5      W2F5
# 6  MW6     TTh6     MW6        TTh6      W1F6
# 7  MW7     TTh7     MW7        TTh7
# 8  MW8     TTh8     MW8        TTh8

# Standard Period Combinations: 3x80min, 3x55min
# All meetings should take place in the same room
#    Monday  Tuesday  Wednesday  Thursday  Friday
# 1  MWTh1   TWF1     MWTh1      MWTh1     TWF1
#                     TWF1
# 2  MWTh2   TWF2     MWTh2      MWTh2     TWF2
#                     TWF2
# 3  MWTh3   TWF3     MWTh3      MWTh3     TWF3
#                     TWF3
# 4  MWF4    TThF4    MWF4       TThF4     MWF4
#                                          TThF4
# 5  MWF5    TThF5    MWF5       TThF5     MWF5
#                                          TThF5
# 6  MWF6    TThF6    MWF6       TThF6     MWF6
#                                          TThF6

# https://scheduling.rutgers.edu/scheduling/academic-calendar
# Fetch the academic-calendar page and parse it as HTML.
scheduleURL = "https://scheduling.rutgers.edu/scheduling/academic-calendar"
sched = requests.get(scheduleURL)
schedSoup = BeautifulSoup(sched.content, features="html.parser")

# find div.responsive-table__scroll table.pretty-table.responsive-enabled
# schedTable = schedSoup.find_all('div', {"class": "responsive-table__scroll"})
# print(schedSoup.body.find({"class": "responsive-table"}))
# found it!
# print(schedSoup.table)
# .find('table', {"class": "pretty-table responsive-enabled"})

# This is for the new API, which does not support multiple campuses, so I am
# using the old API.
Your preview ends here
Eager to read complete document? Join bartleby learn and gain access to the full version
  • Access to all documents
  • Unlimited textbook solutions
  • 24/7 expert homework help